diff --git a/Jenkinsfile b/Jenkinsfile index 8493c5090f..f1fb07ea52 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,2 +1,2 @@ @Library('pipeline-library')_ -FullVitisLibPipeline (branch: 'next', libname: 'Vitis_Libraries', TARGETS: 'hls_csim:hls_csynth:hls_cosim:vivado_syn:vitis_sw_emu:vitis_hw_emu:vitis_hw_build:vitis_aie_sim:vitis_aie_x86sim', TOOLVERSION: '2022.1_stable_latest') +FullVitisLibPipeline (branch: 'master', libname: 'Vitis_Libraries', TARGETS: 'hls_csim:hls_csynth:hls_cosim:vivado_syn:vitis_sw_emu:vitis_hw_emu:vitis_hw_build:vitis_aie_sim:vitis_aie_x86sim', TOOLVERSION: '2022.1_released') diff --git a/blas/Jenkinsfile b/blas/Jenkinsfile index b79985f137..0ea47ca8ee 100644 --- a/blas/Jenkinsfile +++ b/blas/Jenkinsfile @@ -1,5 +1,4 @@ @Library('pipeline-library')_ -VitisLibPipeline (branch: 'next', libname: 'xf_blas', TARGETS: 'hls_csim:hls_csynth:hls_cosim:vitis_sw_emu:vitis_hw_emu:vitis_hw_build', - upstream_dependencies: 'xf_hpc,next,../hpc', - devtest: 'RunDeploy.sh', TOOLVERSION: '2022.1_stable_latest') +VitisLibPipeline (branch: 'main', libname: 'xf_blas', TARGETS: 'hls_csim:hls_csynth:hls_cosim:vitis_sw_emu:vitis_hw_emu:vitis_hw_build', + upstream_dependencies: 'xf_hpc,main,../hpc', devtest: 'RunDeploy.sh', TOOLVERSION: '2022.1_released', mail_on:'daily:PR') diff --git a/blas/L2/tests/memKernel/gemm_1CU/Makefile b/blas/L2/tests/memKernel/gemm_1CU/Makefile index 23a05a3cd0..bd70f04ebb 100644 --- a/blas/L2/tests/memKernel/gemm_1CU/Makefile +++ b/blas/L2/tests/memKernel/gemm_1CU/Makefile @@ -34,9 +34,15 @@ help:: $(ECHO) " Command to build host application." $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" $(ECHO) "" - $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. 
For example," - $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" - $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, HOST_ARCH is either aarch32 or aarch64." + $(ECHO) " a.IF Download the platform, and common-image from Xilinx Download Center(Suggested):" + $(ECHO) " Run the sdk.sh script from the common-image directory to install sysroot using the command : ./sdk.sh -y -d ./ -p " + $(ECHO) " Unzip the rootfs file : gunzip ./rootfs.ext4.gz" + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " b. User could also define SYSROOT, K_IMAGE and ROOTFS by themselves: " + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export K_IMAGE=< path-to-Image-files >" + $(ECHO) " export ROOTFS=< path-to-rootfs >" $(ECHO) "" $(ECHO) " make clean " $(ECHO) " Command to remove the generated non-hardware files." @@ -108,10 +114,6 @@ ifeq ($(TARGET),hw_emu) CXXFLAGS += -D HW_EMU_TEST endif -ifeq (,$(findstring opencv,$(CXXFLAGS))) -CXXFLAGS += $(XRT_CXXFLAGS) -endif - #Inclue Required Host Source Files ifneq (,$(shell echo $(XPLATFORM) | awk '/u250/')) HOST_SRCS += $(XFLIB_DIR)/L2/src/memKernel/sw/main.cpp $(XFLIB_DIR)/L2/src/xcl2/xcl2.cpp @@ -134,6 +136,11 @@ CXXFLAGS += -I $(XFLIB_DIR)/L1/include/hw/xf_blas/helpers/utils -I $(XFLIB_DIR) CXXFLAGS += -O3 endif +# workaround for opencv +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + EXE_NAME := host.exe EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) @@ -206,11 +213,6 @@ $(EMCONFIG): emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR) ############################## Preparing sdcard folder ############################## ifneq ($(HOST_ARCH), x86) -ifneq (,$(findstring zc706, $(PLATFORM_NAME))) -K_IMAGE := $(SYSROOT)/../../uImage -else -K_IMAGE := $(SYSROOT)/../../Image -endif RUN_SCRIPT := $(BUILD_DIR)/run_script.sh $(RUN_SCRIPT): rm -rf 
$(RUN_SCRIPT) @@ -244,21 +246,21 @@ SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) PACKAGE_FILES := $(BINARY_CONTAINERS) PACKAGE_FILES += $(AIE_CONTAINER) SD_CARD := $(CUR_DIR)/package_$(TARGET) -vck190_dfx_hw := false -$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) +dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) check_kimage check_rootfs @echo "Generating sd_card folder...." mkdir -p $(SD_CARD) chmod a+rx $(BUILD_DIR)/run_script.sh -ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(findstring _dfx_, $(PLATFORM_NAME)),_dfx_) ifeq ($(TARGET),hw) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) $(VPP_PACKAGE) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) @echo "### ***** sd_card generation done! 
***** ###" -vck190_dfx_hw := true +dfx_hw := true endif endif -ifeq ($(vck190_dfx_hw), false) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) +ifeq ($(dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) @echo "### ***** sd_card generation done! ***** ###" endif @@ -305,14 +307,16 @@ endif #hw ifeq ($(TARGET), hw) ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) -ifneq ($(JENKINS_INTERNAL_BUILD), 1) +ifeq (,$(wildcard $(BUILD_DIR)/blas.awsxclbin)) $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" else - $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" - $(EXE_FILE) $(HOST_ARGS) + $(ECHO) "Running HW using generated .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(subst .xclbin,.awsxclbin,$(HOST_ARGS)) make check endif else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ $(EXE_FILE) $(HOST_ARGS) make check else @@ -349,12 +353,11 @@ cleanh: cleank: -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. 
sh *.xclbin - -$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x* - -$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log - -$(RMDIR) _x_temp.* + -$(RMDIR) _x_temp.* cleanall: cleanh cleank -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) clean: cleanh \ No newline at end of file diff --git a/blas/L2/tests/memKernel/gemm_1CU/utils.mk b/blas/L2/tests/memKernel/gemm_1CU/utils.mk index 0ee80e90da..1d97b0ad1a 100644 --- a/blas/L2/tests/memKernel/gemm_1CU/utils.mk +++ b/blas/L2/tests/memKernel/gemm_1CU/utils.mk @@ -50,6 +50,7 @@ ifndef XILINX_XRT export XILINX_XRT endif +.PHONY: check_device check_device: @set -eu; \ inallowlist=False; \ @@ -107,6 +108,7 @@ ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) $(error HOST_ARCH variable not set, please set correctly and rerun) endif +.PHONY: check_version check_sysroot check_kimage check_rootfs check_version: ifneq (, $(shell which git)) ifneq (,$(wildcard $(XFLIB_DIR)/.git)) @@ -114,13 +116,34 @@ ifneq (,$(wildcard $(XFLIB_DIR)/.git)) endif endif -#Checks for SYSROOT +#Set/Check SYSROOT/K_IMAGE/ROOTFS +ifneq ($(HOST_ARCH), x86) +ifneq (,$(findstring zc706, $(PLATFORM_NAME))) +K_IMAGE ?= $(SYSROOT)/../../uImage +else +K_IMAGE ?= $(SYSROOT)/../../Image +endif +ROOTFS ?= $(SYSROOT)/../../rootfs.ext4 +endif + check_sysroot: ifneq ($(HOST_ARCH), x86) -ifndef SYSROOT +ifeq (,$(wildcard $(SYSROOT))) $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) endif endif +check_kimage: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(K_IMAGE))) + $(error K_IMAGE ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif +check_rootfs: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(ROOTFS))) + $(error ROOTFS 
ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif #Checks for g++ CXX := g++ diff --git a/blas/L2/tests/memKernel/gemm_1CU_gui/Makefile b/blas/L2/tests/memKernel/gemm_1CU_gui/Makefile index e7c13d65e3..cbfcad3e0e 100644 --- a/blas/L2/tests/memKernel/gemm_1CU_gui/Makefile +++ b/blas/L2/tests/memKernel/gemm_1CU_gui/Makefile @@ -34,9 +34,15 @@ help:: $(ECHO) " Command to build host application." $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" $(ECHO) "" - $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. For example," - $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" - $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, HOST_ARCH is either aarch32 or aarch64." + $(ECHO) " a.IF Download the platform, and common-image from Xilinx Download Center(Suggested):" + $(ECHO) " Run the sdk.sh script from the common-image directory to install sysroot using the command : ./sdk.sh -y -d ./ -p " + $(ECHO) " Unzip the rootfs file : gunzip ./rootfs.ext4.gz" + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " b. User could also define SYSROOT, K_IMAGE and ROOTFS by themselves: " + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export K_IMAGE=< path-to-Image-files >" + $(ECHO) " export ROOTFS=< path-to-rootfs >" $(ECHO) "" $(ECHO) " make clean " $(ECHO) " Command to remove the generated non-hardware files." 
@@ -107,10 +113,6 @@ ifeq ($(TARGET),hw_emu) CXXFLAGS += -D HW_EMU_TEST endif -ifeq (,$(findstring opencv,$(CXXFLAGS))) -CXXFLAGS += $(XRT_CXXFLAGS) -endif - #Inclue Required Host Source Files ifneq (,$(shell echo $(XPLATFORM) | awk '/u250/')) HOST_SRCS += $(XFLIB_DIR)/L2/src/memKernel/sw/api_gemm.cpp $(XFLIB_DIR)/L2/src/xcl2/xcl2.cpp @@ -125,6 +127,11 @@ CXXFLAGS += -I $(XFLIB_DIR)/L1/include/hw/xf_blas/helpers/utils -I $(XFLIB_DIR) CXXFLAGS += -O3 -Wall -Wno-unknown-pragmas -Wno-unused-label endif +# workaround for opencv +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + EXE_NAME := api_gemm.exe EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) @@ -190,11 +197,6 @@ $(EMCONFIG): emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR) ############################## Preparing sdcard folder ############################## ifneq ($(HOST_ARCH), x86) -ifneq (,$(findstring zc706, $(PLATFORM_NAME))) -K_IMAGE := $(SYSROOT)/../../uImage -else -K_IMAGE := $(SYSROOT)/../../Image -endif RUN_SCRIPT := $(BUILD_DIR)/run_script.sh $(RUN_SCRIPT): rm -rf $(RUN_SCRIPT) @@ -228,21 +230,21 @@ SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) PACKAGE_FILES := $(BINARY_CONTAINERS) PACKAGE_FILES += $(AIE_CONTAINER) SD_CARD := $(CUR_DIR)/package_$(TARGET) -vck190_dfx_hw := false -$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) +dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) check_kimage check_rootfs @echo "Generating sd_card folder...." 
mkdir -p $(SD_CARD) chmod a+rx $(BUILD_DIR)/run_script.sh -ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(findstring _dfx_, $(PLATFORM_NAME)),_dfx_) ifeq ($(TARGET),hw) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) $(VPP_PACKAGE) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) @echo "### ***** sd_card generation done! ***** ###" -vck190_dfx_hw := true +dfx_hw := true endif endif -ifeq ($(vck190_dfx_hw), false) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) +ifeq ($(dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) @echo "### ***** sd_card generation done! 
***** ###" endif @@ -289,14 +291,16 @@ endif #hw ifeq ($(TARGET), hw) ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) -ifneq ($(JENKINS_INTERNAL_BUILD), 1) +ifeq (,$(wildcard $(BUILD_DIR)/blas.awsxclbin)) $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" else - $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" - $(EXE_FILE) $(HOST_ARGS) + $(ECHO) "Running HW using generated .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(subst .xclbin,.awsxclbin,$(HOST_ARGS)) endif else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ $(EXE_FILE) $(HOST_ARGS) else @@ -330,12 +334,11 @@ cleanh: cleank: -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. sh *.xclbin - -$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x* - -$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log - -$(RMDIR) _x_temp.* + -$(RMDIR) _x_temp.* cleanall: cleanh cleank -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) clean: cleanh \ No newline at end of file diff --git a/blas/L2/tests/memKernel/gemm_1CU_gui/utils.mk b/blas/L2/tests/memKernel/gemm_1CU_gui/utils.mk index 0ee80e90da..1d97b0ad1a 100644 --- a/blas/L2/tests/memKernel/gemm_1CU_gui/utils.mk +++ b/blas/L2/tests/memKernel/gemm_1CU_gui/utils.mk @@ -50,6 +50,7 @@ ifndef XILINX_XRT export XILINX_XRT endif +.PHONY: check_device check_device: @set -eu; \ inallowlist=False; \ @@ -107,6 +108,7 @@ ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) $(error HOST_ARCH variable not set, please set correctly and 
rerun) endif +.PHONY: check_version check_sysroot check_kimage check_rootfs check_version: ifneq (, $(shell which git)) ifneq (,$(wildcard $(XFLIB_DIR)/.git)) @@ -114,13 +116,34 @@ ifneq (,$(wildcard $(XFLIB_DIR)/.git)) endif endif -#Checks for SYSROOT +#Set/Check SYSROOT/K_IMAGE/ROOTFS +ifneq ($(HOST_ARCH), x86) +ifneq (,$(findstring zc706, $(PLATFORM_NAME))) +K_IMAGE ?= $(SYSROOT)/../../uImage +else +K_IMAGE ?= $(SYSROOT)/../../Image +endif +ROOTFS ?= $(SYSROOT)/../../rootfs.ext4 +endif + check_sysroot: ifneq ($(HOST_ARCH), x86) -ifndef SYSROOT +ifeq (,$(wildcard $(SYSROOT))) $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) endif endif +check_kimage: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(K_IMAGE))) + $(error K_IMAGE ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif +check_rootfs: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(ROOTFS))) + $(error ROOTFS ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif #Checks for g++ CXX := g++ diff --git a/blas/L2/tests/streamingKernel/gemmKernel/Makefile b/blas/L2/tests/streamingKernel/gemmKernel/Makefile index 9d19023772..b41fb937c7 100644 --- a/blas/L2/tests/streamingKernel/gemmKernel/Makefile +++ b/blas/L2/tests/streamingKernel/gemmKernel/Makefile @@ -34,9 +34,15 @@ help:: $(ECHO) " Command to build host application." $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" $(ECHO) "" - $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. For example," - $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" - $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, HOST_ARCH is either aarch32 or aarch64." 
+ $(ECHO) " a.IF Download the platform, and common-image from Xilinx Download Center(Suggested):" + $(ECHO) " Run the sdk.sh script from the common-image directory to install sysroot using the command : ./sdk.sh -y -d ./ -p " + $(ECHO) " Unzip the rootfs file : gunzip ./rootfs.ext4.gz" + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " b. User could also define SYSROOT, K_IMAGE and ROOTFS by themselves: " + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export K_IMAGE=< path-to-Image-files >" + $(ECHO) " export ROOTFS=< path-to-rootfs >" $(ECHO) "" $(ECHO) " make clean " $(ECHO) " Command to remove the generated non-hardware files." @@ -108,15 +114,16 @@ ifeq ($(TARGET),hw_emu) CXXFLAGS += -D HW_EMU_TEST endif -ifeq (,$(findstring opencv,$(CXXFLAGS))) -CXXFLAGS += $(XRT_CXXFLAGS) -endif - #Inclue Required Host Source Files HOST_SRCS += $(XFLIB_DIR)/L2/src/streamingKernel/sw/host/gemm_stream.cpp $(XFLIB_DIR)/L2/src/xcl2/xcl2.cpp CXXFLAGS += -I $(XFLIB_DIR)/L1/include/hw/xf_blas/helpers/utils -I $(XFLIB_DIR)/L2/include/streamingKernel/ -I $(XFLIB_DIR)/L2/include/streamingKernel/sw/host -I $(XFLIB_DIR)/L2/include/streamingKernel/sw/compiler -I $(XFLIB_DIR)/L2/include/streamingKernel/sw -I $(XFLIB_DIR)/L2/include/xcl2 -I $(XFLIB_DIR)/L2/include/streamingKernel/sw/host/ -I $(XFLIB_DIR)/L1/include/hw CXXFLAGS += -O3 +# workaround for opencv +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + EXE_NAME := host.exe EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) @@ -206,11 +213,6 @@ $(EMCONFIG): emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR) ############################## Preparing sdcard folder ############################## ifneq ($(HOST_ARCH), x86) -ifneq (,$(findstring zc706, $(PLATFORM_NAME))) -K_IMAGE := $(SYSROOT)/../../uImage -else -K_IMAGE := $(SYSROOT)/../../Image -endif RUN_SCRIPT := $(BUILD_DIR)/run_script.sh $(RUN_SCRIPT): rm -rf $(RUN_SCRIPT) @@ 
-244,21 +246,21 @@ SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) PACKAGE_FILES := $(BINARY_CONTAINERS) PACKAGE_FILES += $(AIE_CONTAINER) SD_CARD := $(CUR_DIR)/package_$(TARGET) -vck190_dfx_hw := false -$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) +dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) check_kimage check_rootfs @echo "Generating sd_card folder...." mkdir -p $(SD_CARD) chmod a+rx $(BUILD_DIR)/run_script.sh -ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(findstring _dfx_, $(PLATFORM_NAME)),_dfx_) ifeq ($(TARGET),hw) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) $(VPP_PACKAGE) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) @echo "### ***** sd_card generation done! 
***** ###" -vck190_dfx_hw := true +dfx_hw := true endif endif -ifeq ($(vck190_dfx_hw), false) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) +ifeq ($(dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) @echo "### ***** sd_card generation done! ***** ###" endif @@ -305,14 +307,16 @@ endif #hw ifeq ($(TARGET), hw) ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) -ifneq ($(JENKINS_INTERNAL_BUILD), 1) +ifeq (,$(wildcard $(BUILD_DIR)/gemmKernel.awsxclbin)) $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" else - $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" - $(EXE_FILE) $(HOST_ARGS) + $(ECHO) "Running HW using generated .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(subst .xclbin,.awsxclbin,$(HOST_ARGS)) make check endif else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ $(EXE_FILE) $(HOST_ARGS) make check else @@ -349,12 +353,11 @@ cleanh: cleank: -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. 
sh *.xclbin - -$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x* - -$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log - -$(RMDIR) _x_temp.* + -$(RMDIR) _x_temp.* cleanall: cleanh cleank -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) clean: cleanh \ No newline at end of file diff --git a/blas/L2/tests/streamingKernel/gemmKernel/utils.mk b/blas/L2/tests/streamingKernel/gemmKernel/utils.mk index 0ee80e90da..1d97b0ad1a 100644 --- a/blas/L2/tests/streamingKernel/gemmKernel/utils.mk +++ b/blas/L2/tests/streamingKernel/gemmKernel/utils.mk @@ -50,6 +50,7 @@ ifndef XILINX_XRT export XILINX_XRT endif +.PHONY: check_device check_device: @set -eu; \ inallowlist=False; \ @@ -107,6 +108,7 @@ ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) $(error HOST_ARCH variable not set, please set correctly and rerun) endif +.PHONY: check_version check_sysroot check_kimage check_rootfs check_version: ifneq (, $(shell which git)) ifneq (,$(wildcard $(XFLIB_DIR)/.git)) @@ -114,13 +116,34 @@ ifneq (,$(wildcard $(XFLIB_DIR)/.git)) endif endif -#Checks for SYSROOT +#Set/Check SYSROOT/K_IMAGE/ROOTFS +ifneq ($(HOST_ARCH), x86) +ifneq (,$(findstring zc706, $(PLATFORM_NAME))) +K_IMAGE ?= $(SYSROOT)/../../uImage +else +K_IMAGE ?= $(SYSROOT)/../../Image +endif +ROOTFS ?= $(SYSROOT)/../../rootfs.ext4 +endif + check_sysroot: ifneq ($(HOST_ARCH), x86) -ifndef SYSROOT +ifeq (,$(wildcard $(SYSROOT))) $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) endif endif +check_kimage: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(K_IMAGE))) + $(error K_IMAGE ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif +check_rootfs: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard 
$(ROOTFS))) + $(error ROOTFS ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif #Checks for g++ CXX := g++ diff --git a/blas/L2/tests/streamingKernel/gemmLoadStore/Makefile b/blas/L2/tests/streamingKernel/gemmLoadStore/Makefile index de2b5f59bc..20675e5c63 100644 --- a/blas/L2/tests/streamingKernel/gemmLoadStore/Makefile +++ b/blas/L2/tests/streamingKernel/gemmLoadStore/Makefile @@ -34,9 +34,15 @@ help:: $(ECHO) " Command to build host application." $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" $(ECHO) "" - $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. For example," - $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" - $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, HOST_ARCH is either aarch32 or aarch64." + $(ECHO) " a.IF Download the platform, and common-image from Xilinx Download Center(Suggested):" + $(ECHO) " Run the sdk.sh script from the common-image directory to install sysroot using the command : ./sdk.sh -y -d ./ -p " + $(ECHO) " Unzip the rootfs file : gunzip ./rootfs.ext4.gz" + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " b. User could also define SYSROOT, K_IMAGE and ROOTFS by themselves: " + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export K_IMAGE=< path-to-Image-files >" + $(ECHO) " export ROOTFS=< path-to-rootfs >" $(ECHO) "" $(ECHO) " make clean " $(ECHO) " Command to remove the generated non-hardware files." 
@@ -108,10 +114,6 @@ ifeq ($(TARGET),hw_emu) CXXFLAGS += -D HW_EMU_TEST endif -ifeq (,$(findstring opencv,$(CXXFLAGS))) -CXXFLAGS += $(XRT_CXXFLAGS) -endif - #Inclue Required Host Source Files ifneq (,$(shell echo $(XPLATFORM) | awk '/u250/')) HOST_SRCS += $(XFLIB_DIR)/L2/src/streamingKernel/sw/host/gemmLdSt.cpp $(XFLIB_DIR)/L2/src/xcl2/xcl2.cpp @@ -124,6 +126,11 @@ CXXFLAGS += -I $(XFLIB_DIR)/L1/include/hw/xf_blas/helpers/utils -I $(XFLIB_DIR) CXXFLAGS += -O3 endif +# workaround for opencv +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + EXE_NAME := host.exe EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) @@ -200,11 +207,6 @@ $(EMCONFIG): emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR) ############################## Preparing sdcard folder ############################## ifneq ($(HOST_ARCH), x86) -ifneq (,$(findstring zc706, $(PLATFORM_NAME))) -K_IMAGE := $(SYSROOT)/../../uImage -else -K_IMAGE := $(SYSROOT)/../../Image -endif RUN_SCRIPT := $(BUILD_DIR)/run_script.sh $(RUN_SCRIPT): rm -rf $(RUN_SCRIPT) @@ -238,21 +240,21 @@ SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) PACKAGE_FILES := $(BINARY_CONTAINERS) PACKAGE_FILES += $(AIE_CONTAINER) SD_CARD := $(CUR_DIR)/package_$(TARGET) -vck190_dfx_hw := false -$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) +dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) check_kimage check_rootfs @echo "Generating sd_card folder...." 
mkdir -p $(SD_CARD) chmod a+rx $(BUILD_DIR)/run_script.sh -ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(findstring _dfx_, $(PLATFORM_NAME)),_dfx_) ifeq ($(TARGET),hw) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) $(VPP_PACKAGE) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) @echo "### ***** sd_card generation done! ***** ###" -vck190_dfx_hw := true +dfx_hw := true endif endif -ifeq ($(vck190_dfx_hw), false) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) +ifeq ($(dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) @echo "### ***** sd_card generation done! 
***** ###" endif @@ -299,14 +301,16 @@ endif #hw ifeq ($(TARGET), hw) ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) -ifneq ($(JENKINS_INTERNAL_BUILD), 1) +ifeq (,$(wildcard $(BUILD_DIR)/blas.awsxclbin)) $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" else - $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" - $(EXE_FILE) $(HOST_ARGS) + $(ECHO) "Running HW using generated .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(subst .xclbin,.awsxclbin,$(HOST_ARGS)) make check endif else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ $(EXE_FILE) $(HOST_ARGS) make check else @@ -343,12 +347,11 @@ cleanh: cleank: -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. sh *.xclbin - -$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x* - -$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log - -$(RMDIR) _x_temp.* + -$(RMDIR) _x_temp.* cleanall: cleanh cleank -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) clean: cleanh \ No newline at end of file diff --git a/blas/L2/tests/streamingKernel/gemmLoadStore/utils.mk b/blas/L2/tests/streamingKernel/gemmLoadStore/utils.mk index 0ee80e90da..1d97b0ad1a 100644 --- a/blas/L2/tests/streamingKernel/gemmLoadStore/utils.mk +++ b/blas/L2/tests/streamingKernel/gemmLoadStore/utils.mk @@ -50,6 +50,7 @@ ifndef XILINX_XRT export XILINX_XRT endif +.PHONY: check_device check_device: @set -eu; \ inallowlist=False; \ @@ -107,6 +108,7 @@ ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) $(error 
HOST_ARCH variable not set, please set correctly and rerun) endif +.PHONY: check_version check_sysroot check_kimage check_rootfs check_version: ifneq (, $(shell which git)) ifneq (,$(wildcard $(XFLIB_DIR)/.git)) @@ -114,13 +116,34 @@ ifneq (,$(wildcard $(XFLIB_DIR)/.git)) endif endif -#Checks for SYSROOT +#Set/Check SYSROOT/K_IMAGE/ROOTFS +ifneq ($(HOST_ARCH), x86) +ifneq (,$(findstring zc706, $(PLATFORM_NAME))) +K_IMAGE ?= $(SYSROOT)/../../uImage +else +K_IMAGE ?= $(SYSROOT)/../../Image +endif +ROOTFS ?= $(SYSROOT)/../../rootfs.ext4 +endif + check_sysroot: ifneq ($(HOST_ARCH), x86) -ifndef SYSROOT +ifeq (,$(wildcard $(SYSROOT))) $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) endif endif +check_kimage: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(K_IMAGE))) + $(error K_IMAGE ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif +check_rootfs: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(ROOTFS))) + $(error ROOTFS ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif #Checks for g++ CXX := g++ diff --git a/blas/L3/benchmarks/gemm/memKernel/Makefile b/blas/L3/benchmarks/gemm/memKernel/Makefile index b565dedf0f..64f5bb9dba 100644 --- a/blas/L3/benchmarks/gemm/memKernel/Makefile +++ b/blas/L3/benchmarks/gemm/memKernel/Makefile @@ -34,9 +34,15 @@ help:: $(ECHO) " Command to build host application." $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" $(ECHO) "" - $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. For example," - $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" - $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, HOST_ARCH is either aarch32 or aarch64." 
+ $(ECHO) " a.IF Download the platform, and common-image from Xilinx Download Center(Suggested):" + $(ECHO) " Run the sdk.sh script from the common-image directory to install sysroot using the command : ./sdk.sh -y -d ./ -p " + $(ECHO) " Unzip the rootfs file : gunzip ./rootfs.ext4.gz" + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " b. User could also define SYSROOT, K_IMAGE and ROOTFS by themselves: " + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export K_IMAGE=< path-to-Image-files >" + $(ECHO) " export ROOTFS=< path-to-rootfs >" $(ECHO) "" $(ECHO) " make clean " $(ECHO) " Command to remove the generated non-hardware files." @@ -108,10 +114,6 @@ ifeq ($(TARGET),hw_emu) CXXFLAGS += -D HW_EMU_TEST endif -ifeq (,$(findstring opencv,$(CXXFLAGS))) -CXXFLAGS += $(XRT_CXXFLAGS) -endif - #Inclue Required Host Source Files ifneq (,$(shell echo $(XPLATFORM) | awk '/u250/')) HOST_SRCS += $(XFLIB_DIR)/L3/benchmarks/gemm/gemm_bench.cpp @@ -124,6 +126,11 @@ CXXFLAGS += -I $(XFLIB_DIR)/L3/include/sw -I $(XFLIB_DIR)/L3/benchmarks/gemm -I LDFLAGS += -luuid -lxrt_coreutil endif +# workaround for opencv +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + EXE_NAME := gemm_bench.exe EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) @@ -188,11 +195,6 @@ $(EMCONFIG): emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR) ############################## Preparing sdcard folder ############################## ifneq ($(HOST_ARCH), x86) -ifneq (,$(findstring zc706, $(PLATFORM_NAME))) -K_IMAGE := $(SYSROOT)/../../uImage -else -K_IMAGE := $(SYSROOT)/../../Image -endif RUN_SCRIPT := $(BUILD_DIR)/run_script.sh $(RUN_SCRIPT): rm -rf $(RUN_SCRIPT) @@ -226,21 +228,21 @@ SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) PACKAGE_FILES := $(BINARY_CONTAINERS) PACKAGE_FILES += $(AIE_CONTAINER) SD_CARD := $(CUR_DIR)/package_$(TARGET) -vck190_dfx_hw := false -$(SD_CARD): 
$(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) +dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) check_kimage check_rootfs @echo "Generating sd_card folder...." mkdir -p $(SD_CARD) chmod a+rx $(BUILD_DIR)/run_script.sh -ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(findstring _dfx_, $(PLATFORM_NAME)),_dfx_) ifeq ($(TARGET),hw) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) $(VPP_PACKAGE) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) @echo "### ***** sd_card generation done! ***** ###" -vck190_dfx_hw := true +dfx_hw := true endif endif -ifeq ($(vck190_dfx_hw), false) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) +ifeq ($(dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) @echo "### ***** sd_card generation done! 
***** ###" endif @@ -287,14 +289,16 @@ endif #hw ifeq ($(TARGET), hw) ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) -ifneq ($(JENKINS_INTERNAL_BUILD), 1) +ifeq (,$(wildcard $(BUILD_DIR)/blas.awsxclbin)) $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" else - $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" - $(EXE_FILE) $(HOST_ARGS) + $(ECHO) "Running HW using generated .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(subst .xclbin,.awsxclbin,$(HOST_ARGS)) endif else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ $(EXE_FILE) $(HOST_ARGS) else @@ -331,12 +335,11 @@ cleanh: cleank: -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. sh *.xclbin - -$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x* - -$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log - -$(RMDIR) _x_temp.* + -$(RMDIR) _x_temp.* cleanall: cleanh cleank -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) clean: cleanh \ No newline at end of file diff --git a/blas/L3/benchmarks/gemm/memKernel/utils.mk b/blas/L3/benchmarks/gemm/memKernel/utils.mk index 0ee80e90da..1d97b0ad1a 100644 --- a/blas/L3/benchmarks/gemm/memKernel/utils.mk +++ b/blas/L3/benchmarks/gemm/memKernel/utils.mk @@ -50,6 +50,7 @@ ifndef XILINX_XRT export XILINX_XRT endif +.PHONY: check_device check_device: @set -eu; \ inallowlist=False; \ @@ -107,6 +108,7 @@ ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) $(error HOST_ARCH variable not set, please set correctly and rerun) endif 
+.PHONY: check_version check_sysroot check_kimage check_rootfs check_version: ifneq (, $(shell which git)) ifneq (,$(wildcard $(XFLIB_DIR)/.git)) @@ -114,13 +116,34 @@ ifneq (,$(wildcard $(XFLIB_DIR)/.git)) endif endif -#Checks for SYSROOT +#Set/Check SYSROOT/K_IMAGE/ROOTFS +ifneq ($(HOST_ARCH), x86) +ifneq (,$(findstring zc706, $(PLATFORM_NAME))) +K_IMAGE ?= $(SYSROOT)/../../uImage +else +K_IMAGE ?= $(SYSROOT)/../../Image +endif +ROOTFS ?= $(SYSROOT)/../../rootfs.ext4 +endif + check_sysroot: ifneq ($(HOST_ARCH), x86) -ifndef SYSROOT +ifeq (,$(wildcard $(SYSROOT))) $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) endif endif +check_kimage: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(K_IMAGE))) + $(error K_IMAGE ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif +check_rootfs: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(ROOTFS))) + $(error ROOTFS ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif #Checks for g++ CXX := g++ diff --git a/blas/L3/examples/memKernel/gemm/Makefile b/blas/L3/examples/memKernel/gemm/Makefile index 4aaf354608..6e358eeb0e 100644 --- a/blas/L3/examples/memKernel/gemm/Makefile +++ b/blas/L3/examples/memKernel/gemm/Makefile @@ -34,9 +34,15 @@ help:: $(ECHO) " Command to build host application." $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" $(ECHO) "" - $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. For example," - $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" - $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, HOST_ARCH is either aarch32 or aarch64." 
+ $(ECHO) " a.IF Download the platform, and common-image from Xilinx Download Center(Suggested):" + $(ECHO) " Run the sdk.sh script from the common-image directory to install sysroot using the command : ./sdk.sh -y -d ./ -p " + $(ECHO) " Unzip the rootfs file : gunzip ./rootfs.ext4.gz" + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " b. User could also define SYSROOT, K_IMAGE and ROOTFS by themselves: " + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export K_IMAGE=< path-to-Image-files >" + $(ECHO) " export ROOTFS=< path-to-rootfs >" $(ECHO) "" $(ECHO) " make clean " $(ECHO) " Command to remove the generated non-hardware files." @@ -108,10 +114,6 @@ ifeq ($(TARGET),hw_emu) CXXFLAGS += -D HW_EMU_TEST endif -ifeq (,$(findstring opencv,$(CXXFLAGS))) -CXXFLAGS += $(XRT_CXXFLAGS) -endif - #Inclue Required Host Source Files ifneq (,$(shell echo $(XPLATFORM) | awk '/u250/')) HOST_SRCS += $(XFLIB_DIR)/L3/examples/memKernel/gemm/gemm_example.cpp @@ -124,6 +126,11 @@ CXXFLAGS += -I $(XFLIB_DIR)/L3/include/sw -I $(XFLIB_DIR)/L3/examples/memKernel LDFLAGS += -luuid -lxrt_coreutil endif +# workaround for opencv +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + EXE_NAME := gemm_example.exe EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) @@ -188,11 +195,6 @@ $(EMCONFIG): emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR) ############################## Preparing sdcard folder ############################## ifneq ($(HOST_ARCH), x86) -ifneq (,$(findstring zc706, $(PLATFORM_NAME))) -K_IMAGE := $(SYSROOT)/../../uImage -else -K_IMAGE := $(SYSROOT)/../../Image -endif RUN_SCRIPT := $(BUILD_DIR)/run_script.sh $(RUN_SCRIPT): rm -rf $(RUN_SCRIPT) @@ -226,21 +228,21 @@ SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) PACKAGE_FILES := $(BINARY_CONTAINERS) PACKAGE_FILES += $(AIE_CONTAINER) SD_CARD := $(CUR_DIR)/package_$(TARGET) -vck190_dfx_hw := false 
-$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) +dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) check_kimage check_rootfs @echo "Generating sd_card folder...." mkdir -p $(SD_CARD) chmod a+rx $(BUILD_DIR)/run_script.sh -ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(findstring _dfx_, $(PLATFORM_NAME)),_dfx_) ifeq ($(TARGET),hw) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) $(VPP_PACKAGE) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) @echo "### ***** sd_card generation done! ***** ###" -vck190_dfx_hw := true +dfx_hw := true endif endif -ifeq ($(vck190_dfx_hw), false) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) +ifeq ($(dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) @echo "### ***** sd_card generation done! 
***** ###" endif @@ -287,14 +289,16 @@ endif #hw ifeq ($(TARGET), hw) ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) -ifneq ($(JENKINS_INTERNAL_BUILD), 1) +ifeq (,$(wildcard $(BUILD_DIR)/blas.awsxclbin)) $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" else - $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" - $(EXE_FILE) $(HOST_ARGS) + $(ECHO) "Running HW using generated .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(subst .xclbin,.awsxclbin,$(HOST_ARGS)) endif else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ $(EXE_FILE) $(HOST_ARGS) else @@ -331,12 +335,11 @@ cleanh: cleank: -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. sh *.xclbin - -$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x* - -$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log - -$(RMDIR) _x_temp.* + -$(RMDIR) _x_temp.* cleanall: cleanh cleank -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) clean: cleanh \ No newline at end of file diff --git a/blas/L3/examples/memKernel/gemm/utils.mk b/blas/L3/examples/memKernel/gemm/utils.mk index 0ee80e90da..1d97b0ad1a 100644 --- a/blas/L3/examples/memKernel/gemm/utils.mk +++ b/blas/L3/examples/memKernel/gemm/utils.mk @@ -50,6 +50,7 @@ ifndef XILINX_XRT export XILINX_XRT endif +.PHONY: check_device check_device: @set -eu; \ inallowlist=False; \ @@ -107,6 +108,7 @@ ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) $(error HOST_ARCH variable not set, please set correctly and rerun) endif +.PHONY: 
check_version check_sysroot check_kimage check_rootfs check_version: ifneq (, $(shell which git)) ifneq (,$(wildcard $(XFLIB_DIR)/.git)) @@ -114,13 +116,34 @@ ifneq (,$(wildcard $(XFLIB_DIR)/.git)) endif endif -#Checks for SYSROOT +#Set/Check SYSROOT/K_IMAGE/ROOTFS +ifneq ($(HOST_ARCH), x86) +ifneq (,$(findstring zc706, $(PLATFORM_NAME))) +K_IMAGE ?= $(SYSROOT)/../../uImage +else +K_IMAGE ?= $(SYSROOT)/../../Image +endif +ROOTFS ?= $(SYSROOT)/../../rootfs.ext4 +endif + check_sysroot: ifneq ($(HOST_ARCH), x86) -ifndef SYSROOT +ifeq (,$(wildcard $(SYSROOT))) $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) endif endif +check_kimage: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(K_IMAGE))) + $(error K_IMAGE ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif +check_rootfs: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(ROOTFS))) + $(error ROOTFS ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif #Checks for g++ CXX := g++ diff --git a/blas/L3/tests/gemm/memKernel/gemm_float/Makefile b/blas/L3/tests/gemm/memKernel/gemm_float/Makefile index 8c67f1bcb8..4e0fff2f49 100644 --- a/blas/L3/tests/gemm/memKernel/gemm_float/Makefile +++ b/blas/L3/tests/gemm/memKernel/gemm_float/Makefile @@ -34,9 +34,15 @@ help:: $(ECHO) " Command to build host application." $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" $(ECHO) "" - $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. For example," - $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" - $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, HOST_ARCH is either aarch32 or aarch64." 
+ $(ECHO) " a.IF Download the platform, and common-image from Xilinx Download Center(Suggested):" + $(ECHO) " Run the sdk.sh script from the common-image directory to install sysroot using the command : ./sdk.sh -y -d ./ -p " + $(ECHO) " Unzip the rootfs file : gunzip ./rootfs.ext4.gz" + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " b. User could also define SYSROOT, K_IMAGE and ROOTFS by themselves: " + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export K_IMAGE=< path-to-Image-files >" + $(ECHO) " export ROOTFS=< path-to-rootfs >" $(ECHO) "" $(ECHO) " make clean " $(ECHO) " Command to remove the generated non-hardware files." @@ -108,10 +114,6 @@ ifeq ($(TARGET),hw_emu) CXXFLAGS += -D HW_EMU_TEST endif -ifeq (,$(findstring opencv,$(CXXFLAGS))) -CXXFLAGS += $(XRT_CXXFLAGS) -endif - #Inclue Required Host Source Files ifneq (,$(shell echo $(XPLATFORM) | awk '/u250/')) HOST_SRCS += $(XFLIB_DIR)/L3/tests/gemm/gemm_test.cpp @@ -124,6 +126,11 @@ CXXFLAGS += -I $(XFLIB_DIR)/L3/include/sw -I $(XFLIB_DIR)/L3/tests/gemm -I $(XF LDFLAGS += -luuid -lxrt_coreutil endif +# workaround for opencv +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + EXE_NAME := gemm_test.exe EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) @@ -188,11 +195,6 @@ $(EMCONFIG): emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR) ############################## Preparing sdcard folder ############################## ifneq ($(HOST_ARCH), x86) -ifneq (,$(findstring zc706, $(PLATFORM_NAME))) -K_IMAGE := $(SYSROOT)/../../uImage -else -K_IMAGE := $(SYSROOT)/../../Image -endif RUN_SCRIPT := $(BUILD_DIR)/run_script.sh $(RUN_SCRIPT): rm -rf $(RUN_SCRIPT) @@ -226,21 +228,21 @@ SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) PACKAGE_FILES := $(BINARY_CONTAINERS) PACKAGE_FILES += $(AIE_CONTAINER) SD_CARD := $(CUR_DIR)/package_$(TARGET) -vck190_dfx_hw := false -$(SD_CARD): 
$(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) +dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) check_kimage check_rootfs @echo "Generating sd_card folder...." mkdir -p $(SD_CARD) chmod a+rx $(BUILD_DIR)/run_script.sh -ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(findstring _dfx_, $(PLATFORM_NAME)),_dfx_) ifeq ($(TARGET),hw) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) $(VPP_PACKAGE) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) @echo "### ***** sd_card generation done! ***** ###" -vck190_dfx_hw := true +dfx_hw := true endif endif -ifeq ($(vck190_dfx_hw), false) - $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) +ifeq ($(dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(ROOTFS) --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) @echo "### ***** sd_card generation done! 
***** ###" endif @@ -287,14 +289,16 @@ endif #hw ifeq ($(TARGET), hw) ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) -ifneq ($(JENKINS_INTERNAL_BUILD), 1) +ifeq (,$(wildcard $(BUILD_DIR)/blas.awsxclbin)) $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" else - $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" - $(EXE_FILE) $(HOST_ARGS) + $(ECHO) "Running HW using generated .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(subst .xclbin,.awsxclbin,$(HOST_ARGS)) endif else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ $(EXE_FILE) $(HOST_ARGS) else @@ -331,12 +335,11 @@ cleanh: cleank: -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. sh *.xclbin - -$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x* - -$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log - -$(RMDIR) _x_temp.* + -$(RMDIR) _x_temp.* cleanall: cleanh cleank -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) clean: cleanh \ No newline at end of file diff --git a/blas/L3/tests/gemm/memKernel/gemm_float/utils.mk b/blas/L3/tests/gemm/memKernel/gemm_float/utils.mk index 0ee80e90da..1d97b0ad1a 100644 --- a/blas/L3/tests/gemm/memKernel/gemm_float/utils.mk +++ b/blas/L3/tests/gemm/memKernel/gemm_float/utils.mk @@ -50,6 +50,7 @@ ifndef XILINX_XRT export XILINX_XRT endif +.PHONY: check_device check_device: @set -eu; \ inallowlist=False; \ @@ -107,6 +108,7 @@ ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) $(error HOST_ARCH variable not set, please set 
correctly and rerun) endif +.PHONY: check_version check_sysroot check_kimage check_rootfs check_version: ifneq (, $(shell which git)) ifneq (,$(wildcard $(XFLIB_DIR)/.git)) @@ -114,13 +116,34 @@ ifneq (,$(wildcard $(XFLIB_DIR)/.git)) endif endif -#Checks for SYSROOT +#Set/Check SYSROOT/K_IMAGE/ROOTFS +ifneq ($(HOST_ARCH), x86) +ifneq (,$(findstring zc706, $(PLATFORM_NAME))) +K_IMAGE ?= $(SYSROOT)/../../uImage +else +K_IMAGE ?= $(SYSROOT)/../../Image +endif +ROOTFS ?= $(SYSROOT)/../../rootfs.ext4 +endif + check_sysroot: ifneq ($(HOST_ARCH), x86) -ifndef SYSROOT +ifeq (,$(wildcard $(SYSROOT))) $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) endif endif +check_kimage: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(K_IMAGE))) + $(error K_IMAGE ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif +check_rootfs: +ifneq ($(HOST_ARCH), x86) +ifeq (,$(wildcard $(ROOTFS))) + $(error ROOTFS ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif #Checks for g++ CXX := g++ diff --git a/blas/docs/src/conf.py b/blas/docs/src/conf.py index a8344aa4db..b3aaf34970 100644 --- a/blas/docs/src/conf.py +++ b/blas/docs/src/conf.py @@ -39,13 +39,13 @@ # -- Project information ----------------------------------------------------- project = 'Vitis BLAS Library' -copyright = '2021, Xilinx Inc.' +copyright = '2022, Xilinx Inc.' author = 'Xilinx Inc.' 
# The short X.Y version -version = '2021.2' +version = '2022.1' # The full version, including alpha/beta/rc tags -release = 'v2021.2' +release = 'v2022.1' # -- General configuration --------------------------------------------------- diff --git a/codec/.gitignore b/codec/.gitignore index fc746d91a5..808b5024de 100644 --- a/codec/.gitignore +++ b/codec/.gitignore @@ -1,6 +1,5 @@ *.log *.yuv -*.jpg _x_* build_dir* emconfig.json diff --git a/codec/Jenkinsfile b/codec/Jenkinsfile index 5d5beb6a10..24c8fb076d 100644 --- a/codec/Jenkinsfile +++ b/codec/Jenkinsfile @@ -1,5 +1,5 @@ @Library('pipeline-library')_ -VitisLibPipeline (branch: 'regression', libname: 'xf_codec', TARGETS: 'hls_csim:hls_csynth:hls_cosim:vitis_sw_emu:vitis_hw_emu:vitis_hw_build', - upstream_dependencies: 'xf_utils_hw,next,../utils; xf_database,next,../database; xf_fintech,next,../quantitative_finance', - devtest: 'RunDeploy.sh', TOOLVERSION: '2022.1_stable_latest') +VitisLibPipeline (branch: 'main', libname: 'xf_codec', TARGETS: 'hls_csim:hls_csynth:hls_cosim:vitis_sw_emu:vitis_hw_emu:vitis_hw_build', + upstream_dependencies: 'xf_utils_hw,main,../utils; xf_database,main,../database; xf_fintech,main,../quantitative_finance', + devtest: 'RunDeploy.sh', TOOLVERSION: '2022.1_released') diff --git a/codec/L1/README.md b/codec/L1/README.md index bd4ef5f673..3c4da2591b 100644 --- a/codec/L1/README.md +++ b/codec/L1/README.md @@ -1,104 +1,26 @@ -JPEG Decoder -============ - -Jpeg Decoder example resides in ``L2/demos/jpegDec`` directory. The tutorial provides a step-by-step guide that covers commands for building and running kernel. - -Executable Usage ----------------- - -* **Work Directory(Step 1)** - -The steps for library download and environment setup can be found in :ref:`l2_vitis_codec`. For getting the design, - -``` - cd L2/demos/jpegDec -``` - -* **Build kernel(Step 2)** - -Run the following make command to build your XCLBIN and host binary targeting a specific device. 
Please be noticed that this process will take a long time, maybe couple of hours. - -``` - make run TARGET=hw DEVICE=xilinx_u250_xdma_201830_2 -``` - -* **Run kernel(Step 3)** - -To get the benchmark results, please run the following command. - -``` - ./build_dir.hw.xilinx_u250_xdma_201830_2/host.exe -xclbin build_dir.hw.xilinx_u250_xdma_201830_2/jpegDecoder.xclbin -JPEGFile android.jpg -``` - -JPEG Decoder Input Arguments: - -``` - Usage: host.exe -[-xclbin -dataSetDir -refDir] - -xclbin: the kernel name - -JPEGFile: the path point to input *.jpg -``` - -Note: Default arguments are set in Makefile, you can use other :ref:`pictures` listed in the table. - -* **Example output(Step 4)** - -``` - Found Platform - Platform Name: Xilinx - INFO: Found Device=xilinx_u250_xdma_201830_2 - INFO: Importing build_dir.hw.xilinx_u250_xdma_201830_2/jpegDecoder.xclbin - Loading: 'build_dir.hw.xilinx_u250_xdma_201830_2/jpegDecoder.xclbin' - INFO: Kernel has been created - INFO: Finish kernel setup - ... - - INFO: Finish kernel execution - INFO: Finish E2E execution - INFO: Data transfer from host to device: 40 us - INFO: Data transfer from device to host: 6 us - INFO: Average kernel execution per run: 988 us - ... - - INFO: android.yuv will be generated from the jpeg decoder's output oINFO: android.yuv is generated correctly - INFO: android.yuv is generated correctly -``` - -Profiling ---------- - -The hardware resource utilizations are listed in the following table. -Different tool versions may result slightly different resource. 
- -##### Table 1 IP resources for jpegDecoder with huffman decoder(L1 IP) - -| IP | BRAM | URAM | DSP | FF | LUT | Frequency(MHz) | -|-----------------------|----------|----------|----------|----------|---------|-----------------| -| huffman_decoder | 5 | 0 | 12 | 6963 | 7344 | 286 | - -##### Table 2 IP resources for jpegDecoder with jfif parser and huffman decoder(L1 IP) - -| IP | BRAM | URAM | DSP | FF | LUT | Frequency(MHz) | -|-----------------------|----------|----------|----------|----------|---------|-----------------| -| kernel_parser_decoder | 5 | 0 | 12 | 7615 | 8382 | 257 | - -##### Table 3 Hardware resources for jpegDecoder with jfif parser, huffman, iq and idct (L2 kernel) - -| Kernel | BRAM | URAM | DSP | FF | LUT | Frequency(MHz) | -|-----------------------|----------|----------|----------|----------|---------|-----------------| -| jpegDecoder | 7 | 0 | 39 | 12298 | 13417 | 257 | - -Result ------- - -To check the output yuv file, download https://sourceforge.net/projects/raw-yuvplayer/ . -Then upload the rebuild_image.yuv, set the right sample radio and custom size on the software, and check the yuv file. - -Table 1 : Jpeg Decoder profiling - -![Table 1 : Jpeg Decoder profiling](../../../docs/images/jpegDecoderpofile.png) - -##### Note -``` - | 1. MAX_DEC_PIX is for benchmark. If testcase image is larger than 20M, the value of MAX_DEC_PIX should be enlarged following the size of image. - | 2. MAXCMP_BC is for benchmark. If testcase image is larger than 20M, the value of MAXCMP_BC should be enlarged following the size of image. -``` +# Level 1: HLS Modules + +The Level 1 APIs of Vitis Codec Library is presented as HLS C++ modules. + +This level of API is mainly provided for hardware-savvy developers. +The API description and design details of these modules can be found +in _L1 Module User Guide_ section of the library document. + +## License + +Licensed using the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). 
+ + Copyright 2022 Xilinx, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + Copyright 2022 Xilinx, Inc. diff --git a/codec/L1/include/xlnx_cfg.h b/codec/L1/include/xlnx_cfg.h new file mode 100644 index 0000000000..1873897c90 --- /dev/null +++ b/codec/L1/include/xlnx_cfg.h @@ -0,0 +1,28 @@ +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef XLNX_CFG_H +#define XLNX_CFG_H + +#define XLNX_DEBUG_DCT +#define XLNX_DEBUG_CMAP + +#define XLNX_DISABLE_BLK_DICT +#define XLNX_DISABLE_RECT_DCT +#define XLNX_DISABLE_ARC +#define XLNX_DISABLE_2NDCMP + +#endif diff --git a/codec/L1/meta/api.json b/codec/L1/meta/api.json index 90676c5346..4b800a960c 100644 --- a/codec/L1/meta/api.json +++ b/codec/L1/meta/api.json @@ -29,43 +29,38 @@ "type": "const int" }, { - "name": "img_info", + "name": null, "direction": "", - "type": "xf::codec::img_info&" + "type": "" }, { - "name": "hls_cmpnfo[MAX_NUM_COLOR]", + "name": null, "direction": "", - "type": "xf::codec::hls_compInfo" + "type": "*pout" }, { "name": "block_strm", "direction": "", - "type": "hls::stream >&" + "type": "*pout" }, { "name": "rtn", "direction": "", - "type": "int&" + "type": "*pout" }, { "name": "rtn2", "direction": "", - "type": "bool&" - }, - { - "name": "pout", - "direction": "", - "type": "xf::codec::decOutput*" + "type": "*pout" } ] } }, { - "api_name": "xf::codec::top_order_tokenize", + "api_name": "top_order_tokenize", "spec": { "schema": "vitis_libraries_api_list_schema-1.0", - "api_name": "xf::codec::top_order_tokenize", + "api_name": "top_order_tokenize", "display_name": "top_order_tokenize", "brief": "JXL order_tokenize case", "target_domain": "", @@ -73,7 +68,7 @@ "topOrderTokenize.hpp" ], "search_paths": [ - "/L1/include/" + "/L1/tests/jxlEnc/order_tokenize/kernel" ], "instance": "function", "parameters": [], @@ -103,4 +98,4 @@ } ], "target_domain": "" -} +} \ No newline at end of file diff --git a/codec/L1/src/XAcc_jfifparser.cpp b/codec/L1/src/XAcc_jfifparser.cpp index ef1a3eca29..255fd16e7d 100644 --- a/codec/L1/src/XAcc_jfifparser.cpp +++ b/codec/L1/src/XAcc_jfifparser.cpp @@ -599,7 +599,6 @@ void decoder_jpg_top(ap_uint* ptr, * @param block_strm the stream of coefficients in block,23:is_rst, 22:is_endblock,21~16:bpos,15~0:block val * @param rtn the flag of the jfif parser succeed * @param rtn2 the flag of the decode 
succeed - * @param pout the decOutput */ void kernelParserDecoderTop(ap_uint* datatoDDR, const int size, diff --git a/codec/L1/src/XAcc_jpegdecoder.cpp b/codec/L1/src/XAcc_jpegdecoder.cpp index 2fe79f81ed..8dad7791a3 100644 --- a/codec/L1/src/XAcc_jpegdecoder.cpp +++ b/codec/L1/src/XAcc_jpegdecoder.cpp @@ -172,18 +172,17 @@ void Huffman_decoder( const int16_t dc_huff_start_addr[2][16], // const ap_uint<12> cyc_cmp, -// regs -#ifndef __SYNTHESIS__ + // regs + //#ifndef __SYNTHESIS__ const uint8_t hls_cs_cmpc, const uint16_t hls_mcuh, -#endif + //#endif const uint8_t hls_mbs[MAX_NUM_COLOR], const uint32_t hls_mcuc, // output bool& rtn2, hls::stream >& block_strm) { - #pragma HLS INLINE off ap_uint<12> hls_cmp = cyc_cmp; @@ -1133,8 +1132,8 @@ void top_mcu_decoder( xf::codec::details::Huffman_decoder(huff_sos_strm, sign_no_huff, dht_tbl1, ac_val, ac_huff_start_code, ac_huff_start_addr, dc_val, dc_huff_start_code, dc_huff_start_addr, hls_cmp, -#ifndef __SYNTHESIS__ + //#ifndef __SYNTHESIS__ hls_cs_cmpc, hls_mcuh, -#endif + //#endif hls_mbs, hls_mcuc, rtn2, block_strm); } diff --git a/codec/L1/tests/jpegdec/description.json b/codec/L1/tests/jpegdec/description.json index e6ea189600..981c08c43b 100644 --- a/codec/L1/tests/jpegdec/description.json +++ b/codec/L1/tests/jpegdec/description.json @@ -2,15 +2,15 @@ "name": "Xilinx jpeg decoder HLS Test", "description": "Xilinx jpeg decoder HLS Test", "flow": "hls", - "platform_whitelist": [ + "platform_allowlist": [ "u200" ], - "platform_blacklist": [], - "part_whitelist": [], - "part_blacklist": [], + "platform_blocklist": [], + "part_allowlist": [], + "part_blocklist": [], "project": "test", "solution": "solution1", - "clock": "2.5", + "clock": "3.33", "topfunction": "kernel_parser_decoder", "top": { "source": [ @@ -41,17 +41,17 @@ "env": "", "cmd": "", "max_memory_MB": { - "hls_vivado_syn": 16384, + "vivado_syn": 16384, "hls_csim": 16384, "hls_cosim": 16384, - "hls_vivado_impl": 16384, + "vivado_impl": 16384, "hls_csynth": 
16384 }, "max_time_min": { - "hls_vivado_syn": 480, + "vivado_syn": 480, "hls_csim": 120, "hls_cosim": 480, - "hls_vivado_impl": 480, + "vivado_impl": 480, "hls_csynth": 240 } } @@ -60,8 +60,8 @@ "hls_csim", "hls_csynth", "hls_cosim", - "hls_vivado_syn", - "hls_vivado_impl" + "vivado_syn", + "vivado_impl" ], "category": "canary" } diff --git a/codec/L1/tests/jpegdec/run_hls.tcl b/codec/L1/tests/jpegdec/run_hls.tcl index 54799e04eb..3f9a7909b8 100644 --- a/codec/L1/tests/jpegdec/run_hls.tcl +++ b/codec/L1/tests/jpegdec/run_hls.tcl @@ -20,14 +20,15 @@ set PROJ "test.prj" set SOLN "solution1" if {![info exists CLKP]} { - set CLKP 2.5 + set CLKP 3.33 } open_project -reset $PROJ -add_files "test_decoder.cpp ${XF_PROJ_ROOT}/L1/src/XAcc_jfifparser.cpp ${XF_PROJ_ROOT}/L1/src/XAcc_jpegdecoder.cpp" -cflags "-I${XF_PROJ_ROOT}/L1/include -std=c++0x" +add_files "test_decoder.cpp ${XF_PROJ_ROOT}/L1/src/XAcc_jfifparser.cpp ${XF_PROJ_ROOT}/L1/src/XAcc_jpegdecoder.cpp" -cflags "-I${XF_PROJ_ROOT}/L1/include -std=c++0x -g -O0" add_files -tb "test_decoder.cpp" -cflags "-I${XF_PROJ_ROOT}/L1/include -std=c++0x" set_top kernel_parser_decoder +#set_top Huffman_decoder open_solution -reset $SOLN @@ -57,4 +58,4 @@ if {$VIVADO_IMPL == 1} { export_design -flow impl -rtl verilog } -exit \ No newline at end of file +exit diff --git a/codec/L1/tests/jpegdec/test_decoder.cpp b/codec/L1/tests/jpegdec/test_decoder.cpp index 696ff71901..3b949b4f76 100644 --- a/codec/L1/tests/jpegdec/test_decoder.cpp +++ b/codec/L1/tests/jpegdec/test_decoder.cpp @@ -182,7 +182,7 @@ int main(int argc, const char* argv[]) { printf("Warning: [code 3] huffman data is not in expectation!\n"); } } - printf("Info: Ready to decode next input file!\n"); + return 1; } xf::codec::details::hls_next_mcupos2(block_strm, hls_block, hls_sfv, hls_sfh, hls_mbs, hls_bch[0], hls_bc[0], @@ -214,6 +214,7 @@ int main(int argc, const char* argv[]) { free(hls_block); std::cout << "Ready for next image!\n "; + return 0; } #endif diff --git 
a/codec/L1/tests/jxlEnc/order_tokenize/description.json b/codec/L1/tests/jxlEnc/order_tokenize/description.json index 79271d6ad1..57dcaf4922 100644 --- a/codec/L1/tests/jxlEnc/order_tokenize/description.json +++ b/codec/L1/tests/jxlEnc/order_tokenize/description.json @@ -2,12 +2,12 @@ "name": "Xilinx Order Tokenize HLS Test", "description": "Xilinx jxl Order Tokenize HLS Test", "flow": "hls", - "platform_whitelist": [ + "platform_allowlist": [ "u200" ], - "platform_blacklist": [], - "part_whitelist": [], - "part_blacklist": [], + "platform_blocklist": [], + "part_allowlist": [], + "part_blocklist": [], "project": "tokenize", "solution": "solution1", "clock": "3.33", @@ -40,17 +40,17 @@ "env": "", "cmd": "", "max_memory_MB": { - "hls_vivado_syn": 16384, + "vivado_syn": 16384, "hls_csim": 16384, "hls_cosim": 16384, - "hls_vivado_impl": 16384, + "vivado_impl": 16384, "hls_csynth": 16384 }, "max_time_min": { - "hls_vivado_syn": 480, + "vivado_syn": 480, "hls_csim": 120, "hls_cosim": 480, - "hls_vivado_impl": 480, + "vivado_impl": 480, "hls_csynth": 240 } } @@ -59,8 +59,8 @@ "hls_csim", "hls_csynth", "hls_cosim", - "hls_vivado_syn", - "hls_vivado_impl" + "vivado_syn", + "vivado_impl" ], "category": "canary" } diff --git a/codec/L2/README.md b/codec/L2/README.md index 5fb50aea86..a3b68a0034 100644 --- a/codec/L2/README.md +++ b/codec/L2/README.md @@ -1,3 +1,22 @@ # Level 2: Predefined Codec Kernels The Level2 of Vitis Codec Library contains host-callable kernels. For more details information, please reference to _L2 User Guide_ in the document for usage and design information. + +## License + +Licensed using the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). + + Copyright 2022 Xilinx, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + Copyright 2022 Xilinx, Inc. diff --git a/codec/L2/demos/README.md b/codec/L2/demos/README.md index ffe4a4d524..b569ac1917 100644 --- a/codec/L2/demos/README.md +++ b/codec/L2/demos/README.md @@ -7,7 +7,7 @@ Here are benchmarks of the Vitis Codec Library using the Vitis environment and c ### Vitis Codec Library - Alveo U200 installed and configured as per [Alveo U200 Data Center Accelerator Card](https://www.xilinx.com/products/boards-and-kits/alveo/u200.html#gettingStarted) - Xilinx runtime (XRT) installed -- Xilinx Vitis 2021.2 installed and configured +- Xilinx Vitis 2022.1 installed and configured ## Pictures @@ -44,7 +44,26 @@ These codec benchmarks can be downloaded from [vitis libraries](https://github.c Specifying the corresponding Vitis, XRT, and path to the platform repository by running following commands. ``` - source /installs/lin64/Vitis/2021.2/settings64.sh + source /installs/lin64/Vitis/2022.1/settings64.sh source /opt/xilinx/xrt/setup.sh export PLATFORM_REPO_PATHS=/opt/xilinx/platforms ``` + +## License + +Licensed using the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). + + Copyright 2022 Xilinx, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + Copyright 2022 Xilinx, Inc. diff --git a/codec/L2/demos/jpegDec/Makefile b/codec/L2/demos/jpegDec/Makefile index fb700498e8..bfe33c9463 100644 --- a/codec/L2/demos/jpegDec/Makefile +++ b/codec/L2/demos/jpegDec/Makefile @@ -142,7 +142,7 @@ LIBRARY_PATH := $(LD_LIBRARY_PATH):$(XILINX_XRT)/lib ########################## Kernel compiler global settings ########################## ifneq (,$(shell echo $(XPLATFORM) | awk '/u200/')) -VPP_FLAGS += --config $(CUR_DIR)/conn_u200.cfg -g +VPP_FLAGS += --config $(CUR_DIR)/conn_u200.cfg VPP_FLAGS += -I $(XFLIB_DIR)/L2/include/hw/jpegDec -I $(XFLIB_DIR)/../utils/L1/include -I $(XFLIB_DIR)/L2/demos/jpegDec/kernel else ifneq (,$(shell echo $(XPLATFORM) | awk '/u50/')) @@ -351,4 +351,4 @@ cleanall: cleanh cleank -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut -$(RMDIR) -clean: cleanh +clean: cleanh \ No newline at end of file diff --git a/codec/L2/demos/jpegDec_sc/Makefile b/codec/L2/demos/jpegDec_sc/Makefile new file mode 100644 index 0000000000..8438349a9d --- /dev/null +++ b/codec/L2/demos/jpegDec_sc/Makefile @@ -0,0 +1,284 @@ +# Copyright 2019-2021 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# sc makefile-generator v1.0.0 + +############################## Help Section ############################## +.PHONY: help + +help:: + $(ECHO) "Makefile Usage:" + $(ECHO) " make all TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to generate the design for specified Target and Shell." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " make run TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to run application in emulation." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH required for SoC shells" + $(ECHO) "" + $(ECHO) " make host HOST_ARCH=" + $(ECHO) " Command to build host application." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " make clean " + $(ECHO) " Command to remove the generated non-hardware files." + $(ECHO) "" + $(ECHO) " make cleanall" + $(ECHO) " Command to remove all the generated files." 
+ $(ECHO) "" + +############################## Setting up Project Variables ############################## + +MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +XF_PROJ_ROOT ?= $(shell bash -c 'export MK_PATH=$(MK_PATH); echo $${MK_PATH%/L2/*}') +CUR_DIR := $(patsubst %/,%,$(dir $(MK_PATH))) +XFLIB_DIR = $(XF_PROJ_ROOT) + +# setting devault value +TARGET ?= sw_emu +HOST_ARCH ?= x86 + +#setting PLATFORM +ifeq ($(PLATFORM),) +PLATFORM := $(DEVICE) +endif +ifeq ($(PLATFORM),) +PLATFORM := xilinx_u50_gen3x16_xdma_5_202210_1 +endif + +# #################### Checking if PLATFORM in whitelist ############################ +PLATFORM_ALLOWLIST += u50 u280 +PLATFORM_BLOCKLIST += zc + +GCC_INTOOL := 8.3.0 +BINUTILS_INTOOL := 2.37 +include ./utils.mk +TEMP_DIR := _x_temp.$(TARGET).$(PLATFORM_NAME) +TEMP_REPORT_DIR := $(CUR_DIR)/reports/_x.$(TARGET).$(PLATFORM_NAME) +BUILD_DIR := build_dir.$(TARGET).$(PLATFORM_NAME) +BUILD_REPORT_DIR := $(CUR_DIR)/reports/_build.$(TARGET).$(PLATFORM_NAME) +EMCONFIG := $(BUILD_DIR)/emconfig.json +XCLBIN_DIR := $(CUR_DIR)/$(BUILD_DIR) +export XCL_BINDIR = $(XCLBIN_DIR) + +EXE_FILE_DEPS := +BINARY_CONTAINERS_DEPS := +RUN_DEPS := + +# set debug switch +ifneq ($(debug),yes) +CXXFLAGS += -O3 +endif + +# get global setting +ifdef XILINX_SC_PFM_CONFIG +CXXFLAGS += -DXILINX_SC_PFM_CONFIG=$(XILINX_SC_PFM_CONFIG) +endif +ifdef XILINX_SC_PFM_EXT +CXXFLAGS += -DXILINX_SC_PFM_EXT=$(XILINX_SC_PFM_EXT) +endif +ifeq ($(HOST_ARCH), x86) +CXXFLAGS += -I $(XILINX_VITIS)/system_compiler/include -I $(XILINX_HLS)/include +LDFLAGS += -L$(XILINX_XRT)/lib -L$(XILINX_VITIS)/system_compiler/lib/x86 -lvpp_acc -l$(LIB_XRT) -lxrt_coreutil -Wl,-rpath=$(XILINX_VITIS)/system_compiler/lib/x86:$(XILINX_XRT)/lib:$(GCC_HOME)/lib64 -Wl,--enable-new-dtags -lpthread +VPP_FLAGS += -t $(TARGET) --platform $(XPLATFORM) --temp_dir $(TEMP_DIR) --save-temps -g -I $(XILINX_VITIS)/system_compiler/include +VPP_LDFLAGS += +else ifeq ($(HOST_ARCH), aarch64) +CXXFLAGS += +LDFLAGS += +VPP_FLAGS += 
+VPP_LDFLAGS += +endif +CXXFLAGS += $(EXTRA_CXXFLAGS) +VPP_FLAGS += $(EXTRA_VPP_FLAGS) + +ifeq ($(TARGET),sw) + $(error Error: The sw target is not supported anymore. Please use sw_emu instead) +else ifeq ($(TARGET),sw_emu) + LIB_XRT := xrt_swemu + HOST_PREAMBLE := XCL_EMULATION_MODE=sw_emu +else ifeq ($(TARGET),hw_emu) + LIB_XRT := xrt_hwemu + HOST_PREAMBLE := XCL_EMULATION_MODE=hw_emu + ifneq (,$(findstring -g,$(EXTRA_VPP_FLAGS) $(CXXFLAGS))) + # for sourcing pre/post xsim scripts + ifneq ($(XILINX_SC_HW_EMU),0) + HOST_PREAMBLE += XILINX_SC_HW_EMU=1 XILINX_SC_BUILD_DIR=$(PWD)/$(BUILD_DIR) + endif + endif +else ifeq ($(TARGET),hw) + LIB_XRT := xrt_core +endif + +########################## Setting up Host Variables ########################## + +#Include Required Host Source Files +ifneq (,$(shell echo $(XPLATFORM) | awk '/u50/')) +HOST_SRCS += $(XFLIB_DIR)/L2/demos/jpegDec_sc/host/test_decoder.cpp $(XFLIB_DIR)/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp +CXXFLAGS += -D USE_HBM +CXXFLAGS += -I $(XFLIB_DIR)/L2/include/hw/jpegDec -I $(XFLIB_DIR)/L2/demos/jpegDec_sc/host -I $(XFLIB_DIR)/L2/demos/jpegDec_sc/kernel -I $(XFLIB_DIR)/../utils/L1/include +CXXFLAGS += -O3 -D KERNEL0 -B/usr/lib/x86_64-linux-gnu + +else ifneq (,$(shell echo $(XPLATFORM) | awk '/u280/')) +HOST_SRCS += $(XFLIB_DIR)/L2/demos/jpegDec_sc/host/test_decoder.cpp $(XFLIB_DIR)/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp +CXXFLAGS += -D USE_HBM +CXXFLAGS += -I $(XFLIB_DIR)/L2/include/hw/jpegDec -I $(XFLIB_DIR)/L2/demos/jpegDec_sc/host -I $(XFLIB_DIR)/L2/demos/jpegDec_sc/kernel -I $(XFLIB_DIR)/../utils/L1/include +CXXFLAGS += -O3 -D KERNEL0 -B/usr/lib/x86_64-linux-gnu + +else +HOST_SRCS += $(XFLIB_DIR)/L2/demos/jpegDec_sc/host/test_decoder.cpp $(XFLIB_DIR)/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp +CXXFLAGS += -I $(XFLIB_DIR)/L2/include/hw/jpegDec -I $(XFLIB_DIR)/L2/demos/jpegDec_sc/host -I $(XFLIB_DIR)/L2/demos/jpegDec_sc/kernel -I $(XFLIB_DIR)/../utils/L1/include +CXXFLAGS += -O3 -D 
KERNEL0 -B/usr/lib/x86_64-linux-gnu + +endif +EXE_NAME := host.exe +EXE_OBJS := $(addprefix $(TEMP_DIR)/, $(addsuffix .o,$(basename $(HOST_SRCS)))) +EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) +EXE_FILE_DEPS := $(EXE_OBJS) +MAKEDEPEND = $(CXX) $< -MM -MP -MF $(basename $@).d -MT $@ $(CXXFLAGS) + +HOST_ARGS := -JPEGFile $(XFLIB_DIR)/L2/demos/jpegDec/images/t0.jpg +ifneq ($(HOST_ARCH), x86) +PKG_HOST_ARGS = $(foreach args,$(HOST_ARGS),$(subst $(dir $(patsubst %/,%,$(args))),,$(args))) +endif +LIBRARY_PATH := $(LD_LIBRARY_PATH):$(XILINX_XRT)/lib + +########################## Kernel compiler global settings ########################## +VPP_FLAGS += -I $(XFLIB_DIR)/L2/include/hw/jpegDec -I $(XFLIB_DIR)/../utils/L1/include -I $(XFLIB_DIR)/L2/demos/jpegDec_sc/kernel + +######################### binary container global settings ########################## +VPP_FLAGS_kernelJpegDecoder += -D KERNEL0 +VPP_FLAGS_kernelJpegDecoder += --hls.clock 300000000:kernelJpegDecoder +ifneq ($(HOST_ARCH), x86) +VPP_LDFLAGS_JDK += --clock.defaultFreqHz 300000000 +else +VPP_LDFLAGS_JDK += --kernel_frequency 300 +endif + +ifeq ($(HOST_ARCH), x86) +BINARY_CONTAINERS_TMP := $(BUILD_DIR)/$(TARGET).o +BINARY_CONTAINERS := $(BUILD_DIR)/$(TARGET).xclbin +ifeq ($(TARGET),sw_emu) + BINARY_CONTAINERS_TMP := +endif +else +# placeholder for non_x86 +endif + +.SECONDEXPANSION: +# ################ Setting Rules for Binary Containers (Building Kernels) ################ +ACC_SRCS_kernelJpegDecoder += $(XFLIB_DIR)/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp +ACC_OBJS_kernelJpegDecoder := $(addprefix $(TEMP_DIR)/, $(addsuffix .o,$(basename $(ACC_SRCS_kernelJpegDecoder)))) +$(ACC_OBJS_kernelJpegDecoder): $(TEMP_DIR)/%.o : %.cpp $$(@D)/.f + @echo "--> Making $@ from: $?" + $(MAKEDEPEND) + $(VPP) $(VPP_FLAGS) $(VPP_FLAGS_kernelJpegDecoder) -o $@ -c $< +BINARY_CONTAINERS_DEPS += $(ACC_OBJS_kernelJpegDecoder) +$(BINARY_CONTAINERS_TMP) : $(BINARY_CONTAINERS_DEPS) + @echo "--> Making $@ from: $?" 
+ $(VPP) $(VPP_FLAGS) $(VPP_LDFLAGS) $(VPP_LDFLAGS_JDK) -o $(BINARY_CONTAINERS) -l $^ +EXE_FILE_DEPS += $(BINARY_CONTAINERS_TMP) +EXE_FILE_DEPS += $(BINARY_CONTAINERS_DEPS) + +############################## Setting Rules for Host (Building Host Executable) ############################## +ifeq ($(HOST_ARCH), x86) +$(TEMP_DIR)/%.o : %.cpp $$(@D)/.f + @echo "--> Making $@ from: $?" + mkdir -p $(BUILD_DIR) + $(MAKEDEPEND) + $(CXX) -o $@ $(CXXFLAGS) -I . -c $< +$(EXE_FILE): $(EXE_FILE_DEPS) + mkdir -p $(BUILD_DIR) + $(CXX) -o $@ $^ $(CXXFLAGS) $(LDFLAGS) +else +# place holder for arch64 +endif + +$(EMCONFIG): + emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR) + +%/.f: + mkdir -p $(dir $@) + touch $@ + +.PRECIOUS: %/.f + +RUN_DEPS += $(EXE_FILE) $(EMCONFIG) + +run: check_device $(RUN_DEPS) +#sw_emu +ifneq (,$(filter sw_emu, $(TARGET))) +ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(HOST_PREAMBLE) $(EXE_FILE) $(HOST_ARGS) + +else +# place holder for arch64 +endif +endif + +#hw_emu +ifneq (,$(filter hw_emu, $(TARGET))) +ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(HOST_PREAMBLE) $(EXE_FILE) $(HOST_ARGS) + +else +# place holder for arch64 +endif +endif + +#hw +ifeq ($(TARGET), hw) +ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(HOST_PREAMBLE) $(EXE_FILE) $(HOST_ARGS) + +else +# place holder for arch64 +endif +endif + +############################## Setting Targets ############################## + +.PHONY: all clean cleanall emconfig +emconfig: $(EMCONFIG) +ifeq ($(HOST_ARCH), x86) +all: check_vpp check_platform check_xrt $(EXE_FILE) $(BINARY_CONTAINERS) emconfig +else +all: check_vpp check_platform check_sysroot $(EXE_FILE) $(BINARY_CONTAINERS) emconfig sd_card +endif + +.PHONY: host xclbin +ifeq ($(HOST_ARCH), x86) +host: check_xrt $(EXE_FILE) +else +host: check_sysroot $(EXE_FILE) +endif +xclbin: $(BINARY_CONTAINERS_TMP) + +############################## 
Cleaning Rules ############################## +cleanh: + -$(RMDIR) $(EXE_FILE) vitis_* TempConfig system_estimate.xtxt *.rpt .run/ + -$(RMDIR) src/*.ll _xocc_* .Xil dltmp* xmltmp* *.log *.jou *.wcfg *.wdb sample_link.ini sample_compile.ini obj* bin* *.csv *.jpg *.jpeg *.png + +cleank: + -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. sh *.xclbin + -$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x* + -$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log + -$(RMDIR) _x_temp.* + +cleanall: cleanh cleank + -$(RMDIR) $(BUILD_DIR) build_dir.* emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str + -$(RMDIR) $(XFLIB_DIR)/common/data/*.xe2xd* $(XFLIB_DIR)/common/data/*.orig* + +clean: cleanh \ No newline at end of file diff --git a/codec/L2/demos/jpegDec_sc/README.md b/codec/L2/demos/jpegDec_sc/README.md new file mode 100644 index 0000000000..fb0ea4adc5 --- /dev/null +++ b/codec/L2/demos/jpegDec_sc/README.md @@ -0,0 +1,123 @@ +JPEG Decoder System Compiler demo +============ + +Jpeg Decoder example resides in ``L2/demos/jpegDec_sc`` directory. The tutorial provides a step-by-step guide that covers commands for building and running jpeg decoder system compiler demo. + +Executable Usage +---------------- + +* **Work Directory(Step 1)** + +The steps for library download and environment setup can be found [here](https://github.com/Xilinx/Vitis_Libraries/tree/master/codec/L2/demos#building). For getting the design, + +``` + cd L2/demos/jpegDec_sc +``` + +* **Build kernel(Step 2)** + +Run the following make command to build your XCLBIN and host binary targeting a specific device. Please note that this process will take a long time, possibly a couple of hours. 
+ +``` + make run TARGET=hw DEVICE=xilinx_u250_xdma_201830_2 +``` + +* **Run kernel(Step 3)** + +To get the benchmark results, please run the following command. + +``` + ./build_dir.hw.xilinx_u250_xdma_201830_2/host.exe -xclbin build_dir.hw.xilinx_u250_xdma_201830_2/jpegDecoder.xclbin -JPEGFile android.jpg +``` + +JPEG Decoder Input Arguments: + +``` + Usage: host.exe -[-xclbin -JPEGFile] + -xclbin: the kernel name + -JPEGFile: the path to the input *.jpg +``` + +Note: Default arguments are set in the Makefile; you can use other [pictures](https://github.com/Xilinx/Vitis_Libraries/tree/master/codec/L2/demos#pictures) listed in the table. + +* **Example output(Step 4)** + +``` + Found Platform + Platform Name: Xilinx + INFO: Found Device=xilinx_u250_xdma_201830_2 + INFO: Importing build_dir.hw.xilinx_u250_xdma_201830_2/jpegDecoder.xclbin + Loading: 'build_dir.hw.xilinx_u250_xdma_201830_2/jpegDecoder.xclbin' + INFO: Kernel has been created + INFO: Finish kernel setup + ... + + INFO: Finish kernel execution + INFO: Finish E2E execution + INFO: Data transfer from host to device: 40 us + INFO: Data transfer from device to host: 6 us + INFO: Average kernel execution per run: 988 us + ... + + INFO: android.yuv will be generated from the jpeg decoder's output + INFO: android.yuv is generated correctly +``` + +Profiling +--------- + +The hardware resource utilizations are listed in the following tables. +Different tool versions may result in slightly different resource usage. 
+ +##### Table 1 IP resources for jpegDecoder with huffman decoder(L1 IP) + +| IP | BRAM | URAM | DSP | FF | LUT | Frequency(MHz) | +|-----------------------|----------|----------|----------|----------|---------|-----------------| +| huffman_decoder | 5 | 0 | 12 | 6963 | 7344 | 286 | + +##### Table 2 IP resources for jpegDecoder with jfif parser and huffman decoder(L1 IP) + +| IP | BRAM | URAM | DSP | FF | LUT | Frequency(MHz) | +|-----------------------|----------|----------|----------|----------|---------|-----------------| +| kernel_parser_decoder | 5 | 0 | 12 | 7615 | 8382 | 257 | + +##### Table 3 Hardware resources for jpegDecoder with jfif parser, huffman, iq and idct (L2 kernel) + +| Kernel | BRAM | URAM | DSP | FF | LUT | Frequency(MHz) | +|-----------------------|----------|----------|----------|----------|---------|-----------------| +| jpegDecoder | 7 | 0 | 39 | 12298 | 13417 | 257 | + +Result +------ + +To check the output yuv file, download https://sourceforge.net/projects/raw-yuvplayer/ . +Then open the rebuild_image.yuv, set the right sample ratio and custom size in the software, and check the yuv file. + +Table 1 : Jpeg Decoder profiling + +![Table 1 : Jpeg Decoder profiling](../../../docs/images/jpegDecoderpofile.png) + +##### Note +``` + | 1. MAX_DEC_PIX is for benchmark. If testcase image is larger than 20M, the value of MAX_DEC_PIX should be enlarged following the size of image. + | 2. MAXCMP_BC is for benchmark. If testcase image is larger than 20M, the value of MAXCMP_BC should be enlarged following the size of image. +``` + +## License + +Licensed using the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). + + Copyright 2022 Xilinx, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + Copyright 2022 Xilinx, Inc. diff --git a/codec/L2/demos/jpegDec_sc/description.json b/codec/L2/demos/jpegDec_sc/description.json new file mode 100644 index 0000000000..364603bc7a --- /dev/null +++ b/codec/L2/demos/jpegDec_sc/description.json @@ -0,0 +1,133 @@ +{ + "gui": false, + "name": "Xilinx JPEG Decoder (SC) Test", + "description": "This example is a SystemCompiler example of a decoder that supports the 'Sequential DCT-based mode' of ISO/IEC 10918-1 standard. It is a high-performance implementation based on Xilinx HLS design methodology. It can process 1 Huffman token and create up to 8 DCT coefficients within one cycle. 
It is also an easy-to-use decoder as it can direct parser the JPEG file header without help of software functions", + "flow": "vitis", + "platform_allowlist": [ + "u50", + "u280" + ], + "platform_blocklist": [ + "zc" + ], + "platform_properties": { + "u50": { + "host": { + "compiler": { + "symbols": [ + "USE_HBM" + ] + } + } + }, + "u280": { + "host": { + "compiler": { + "symbols": [ + "USE_HBM" + ] + } + } + } + }, + "launch": [ + { + "cmd_args": " -JPEGFile LIB_DIR/L2/demos/jpegDec/images/t0.jpg", + "name": "generic launch for all flows", + "ld_library_path": [ + "$(LD_LIBRARY_PATH)", + "$(XILINX_XRT)/lib" + ] + } + ], + "host": { + "host_exe": "host.exe", + "compiler": { + "sources": [ + "LIB_DIR/L2/demos/jpegDec_sc/host/test_decoder.cpp", + "LIB_DIR/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp" + ], + "includepaths": [ + "LIB_DIR/L2/include/hw/jpegDec", + "LIB_DIR/L2/demos/jpegDec_sc/host", + "LIB_DIR/L2/demos/jpegDec_sc/kernel", + "LIB_DIR/../utils/L1/include" + ], + "options": "-O3 -D KERNEL0 -B/usr/lib/x86_64-linux-gnu" + } + }, + "v++": { + "compiler": { + "includepaths": [ + "LIB_DIR/L2/include/hw/jpegDec", + "LIB_DIR/../utils/L1/include", + "LIB_DIR/L2/demos/jpegDec_sc/kernel" + ] + } + }, + "containers": [ + { + "accelerators": [ + { + "location": "LIB_DIR/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp", + "frequency": 300.0, + "clflags": " -D KERNEL0", + "name": "kernelJpegDecoder", + "num_compute_units": 1, + "compute_units": [ + { + "name": "JDK", + "slr": "SLR0", + "arguments": [ + { + "name": "jpeg_pointer", + "memory": "HBM[0]" + }, + { + "name": "yuv_mcu_pointer", + "memory": "HBM[1]" + }, + { + "name": "infos", + "memory": "HBM[2]" + } + ] + } + ] + } + ], + "frequency": 300.0, + "name": "JDK" + } + ], + "testinfo": { + "disable": false, + "jobs": [ + { + "index": 0, + "dependency": [], + "env": "", + "cmd": "", + "max_memory_MB": { + "vitis_hw_build": 40960, + "vitis_hw_emu": 40960, + "vitis_sw_emu": 10240, + "vitis_hw_run": 10240 + }, + 
"max_time_min": { + "vitis_hw_build": 1600, + "vitis_hw_emu": 400, + "vitis_sw_emu": 120, + "vitis_hw_run": 10 + } + } + ], + "targets": [ + "vitis_sw_emu", + "vitis_hw_emu", + "vitis_hw_build", + "vitis_hw_run" + ], + "category": "canary" + } +} diff --git a/codec/L2/demos/jpegDec_sc/host/test_decoder.cpp b/codec/L2/demos/jpegDec_sc/host/test_decoder.cpp new file mode 100644 index 0000000000..254f73e666 --- /dev/null +++ b/codec/L2/demos/jpegDec_sc/host/test_decoder.cpp @@ -0,0 +1,447 @@ +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#define _HLS_TEST_ 1 + +#ifndef _HLS_TEST_ +//#include "xcl2.hpp" +#endif + +#include "kernelJpegDecoder.hpp" +#include "utils_XAcc_jpeg.hpp" +#include "xf_utils_sw/logger.hpp" + +#include "utils.hpp" + +#ifndef __SYNTHESIS__ + +// ------------------------------------------------------------ +// for tmp application and reorder +int16_t* hls_block = (int16_t*)malloc(sizeof(int16_t) * MAX_NUM_COLOR * MAXCMP_BC * 64); +xf::codec::idct_out_t* yuv_row_pointer = (uint8_t*)malloc(sizeof(uint8_t) * MAX_NUM_COLOR * MAXCMP_BC * 64); + +// ------------------------------------------------------------ +// input strm_iDCT_x8[8] is the row of block yuv in mcu order of sample +// output image_height*image_width*Y ... image_height*image_width*U ... 
image_height*image_width*V 0a to form a file to +// show the picture +void rebuild_raw_yuv(std::string file_name, + xf::codec::bas_info* bas_info, + int hls_bc[MAX_NUM_COLOR], + // hls::stream strm_iDCT_x8[8], + ap_uint<64>* yuv_mcu_pointer) { + std::string file = file_name.substr(file_name.find_last_of('/') + 1); + std::string fn = file.substr(0, file.find_last_of(".")) + ".raw"; + FILE* f = fopen(fn.c_str(), "wb"); + std::cout << "WARNING: " << fn << " will be opened for binary write." << std::endl; + if (!f) { + std::cerr << "ERROR: " << fn << " cannot be opened for binary write." << std::endl; + } + + xf::codec::idct_out_t* yuv_mcu_pointer_pix = (uint8_t*)malloc(sizeof(uint8_t) * bas_info->all_blocks * 64); + + int cnt = 0; + int cnt_row = 0; + for (int b = 0; b < (int)(bas_info->all_blocks); b++) { + for (int i = 0; i < 8; i++) { // write one block of Y or U or V + for (int j = 0; j < 8; j++) { + yuv_mcu_pointer_pix[cnt] = yuv_mcu_pointer[cnt_row](8 * (j + 1) - 1, 8 * j); // strm_iDCT_x8[j].read(); + cnt++; + } + cnt_row++; + } + } + +write_mcu_raw_data: + fwrite(yuv_mcu_pointer, sizeof(char), bas_info->all_blocks * 64, f); + + // fwrite(&end_file, 1, 1, f);//write 0x0a + fclose(f); + + file = file_name.substr(file_name.find_last_of('/') + 1); + fn = file.substr(0, file.find_last_of(".")) + ".yuv"; + f = fopen(fn.c_str(), "wb"); + std::cout << "WARNING: " << fn << " will be opened for binary write." << std::endl; + if (!f) { + std::cerr << "ERROR: " << fn << " cannot be opened for binary write." 
<< std::endl; + } + + xf::codec::COLOR_FORMAT fmt = bas_info->format; + + int dpos[MAX_NUM_COLOR]; // the dc position of the pointer + for (int cmp = 0; cmp < MAX_NUM_COLOR; cmp++) { + dpos[cmp] = 0; + } + + uint16_t block_width = bas_info->axi_width[0]; + int n_mcu = 0; + + printf("INFO: fmt %d, bas_info->mcu_cmp = %d \n", fmt, (int)(bas_info->mcu_cmp)); + printf("INFO: bas_info->hls_mbs[cmp] %d, %d, %d \n", bas_info->hls_mbs[0], bas_info->hls_mbs[1], + bas_info->hls_mbs[2]); + +LOOP_write_yuv_buffer: + while (n_mcu < (int)(bas_info->hls_mcuc)) { + for (int cmp = 0; cmp < MAX_NUM_COLOR; cmp++) { // 0,1,2 + for (int mbs = 0; mbs < bas_info->hls_mbs[cmp]; mbs++) { // 0,1,2,3, 0, 0, + + for (int i = 0; i < 8; i++) { // write one block of Y or U or V + for (int j = 0; j < 8; j++) { + yuv_row_pointer[(cmp)*bas_info->axi_height[0] * bas_info->axi_width[0] * 64 + (dpos[cmp]) * 8 + + j * bas_info->axi_width[cmp] * 8 + i] = *yuv_mcu_pointer_pix; + yuv_mcu_pointer_pix++; + } + } // end block + + if (fmt == xf::codec::C420) { // 420 mbs= 0 1 2 3 0 0 + + if (mbs == 0) { + if (cmp != 0 && (dpos[cmp] % bas_info->axi_width[1] == bas_info->axi_width[1] - 1)) { + dpos[cmp] += 1 + bas_info->axi_width[1] * (8 - 1); + } else { + dpos[cmp] += 1; + } + } else if (mbs == 1) { + dpos[cmp] += block_width * 8 - 1; + } else if (mbs == 2) { + dpos[cmp] += 1; + } else { + if (dpos[cmp] % (block_width * (8) * 2) == (8 + 1) * block_width - 1) { + dpos[cmp] += 1 + block_width * (8 - 1); + } else { + dpos[cmp] -= block_width * 8 - 1; + } + } + } else if (fmt == xf::codec::C422) { // 422 mbs 0 1 0 0 + if (mbs == 0) { + if (cmp != 0 && (dpos[cmp] % bas_info->axi_width[1] == bas_info->axi_width[1] - 1)) { + dpos[cmp] += 1 + bas_info->axi_width[1] * (8 - 1); + } else { + dpos[cmp] += 1; + } + } else { // cmp=0, mbs=1 + if (dpos[cmp] % (block_width) == block_width - 1) { + dpos[cmp] += 1 + block_width * (8 - 1); + } else { + dpos[cmp] += 1; + } + } + } else { + if (dpos[cmp] % block_width == 
block_width - 1) { + dpos[cmp] += 1 + block_width * (8 - 1); + } else { + dpos[cmp] += 1; + } + } + } + } // end one mcu + n_mcu++; + } + + for (int i = 0; i < 16; i++) { + for (int j = 0; j < 8; j++) { + printf("%02X, ", (uint8_t)(yuv_row_pointer[8 * i + j])); + } + printf("\n"); + } + + for (int i = 0; i < 16; i++) { + for (int j = 0; j < 8; j++) { + printf("%d, ", (uint8_t)(yuv_row_pointer[8 * i + j])); + } + printf("\n"); + } + +LOOP_write_y: + fwrite(yuv_row_pointer, sizeof(char), bas_info->axi_height[0] * bas_info->axi_width[0] * 64, f); +LOOP_write_u: + fwrite(yuv_row_pointer + bas_info->axi_height[0] * bas_info->axi_width[0] * 64, sizeof(char), + bas_info->axi_height[1] * bas_info->axi_width[1] * 64, f); +LOOP_write_v: + fwrite(yuv_row_pointer + bas_info->axi_height[0] * bas_info->axi_width[0] * 128, sizeof(char), + bas_info->axi_height[2] * bas_info->axi_width[2] * 64, f); + + // fwrite(&end_file, 1, 1, f);//write 0x0a + fclose(f); + + printf("Please open the YUV file with fmt %d and (width, height) = (%d, %d) \n", fmt, bas_info->axi_width[0] * 8, + bas_info->axi_height[0] * 8); + + // write yuv info to a file + fn = file.substr(0, file.find_last_of(".")) + ".yuv.h"; + f = fopen(fn.c_str(), "aw"); + std::cout << "WARNING: " << fn << " will be opened for binary write." << std::endl; + if (!f) { + std::cerr << "ERROR: " << fn << " cannot be opened for binary write." 
<< std::endl; + } + fprintf(f, "INFO: fmt=%d, bas_info->mcu_cmp=%d\n", fmt, (int)(bas_info->mcu_cmp)); + fprintf(f, "INFO: bas_info->hls_mbs[cmp] %d, %d, %d \n", bas_info->hls_mbs[0], bas_info->hls_mbs[1], + bas_info->hls_mbs[2]); + fprintf(f, "Please open the YUV file with fmt %d and (width, height) = (%d, %d) \n", fmt, + bas_info->axi_width[0] * 8, bas_info->axi_height[0] * 8); + fclose(f); +} + +// ------------------------------------------------------------ +void rebuild_infos(xf::codec::img_info& img_info, + xf::codec::cmp_info cmp_info[MAX_NUM_COLOR], + xf::codec::bas_info& bas_info, + int& rtn, + int& rtn2, + ap_uint<32> infos[1024]) { + img_info.hls_cs_cmpc = *(infos + 0); + img_info.hls_mcuc = *(infos + 1); + img_info.hls_mcuh = *(infos + 2); + img_info.hls_mcuv = *(infos + 3); + rtn = *(infos + 4); + rtn2 = *(infos + 5); + + bas_info.all_blocks = *(infos + 10); + for (int i = 0; i < MAX_NUM_COLOR; i++) { + bas_info.axi_height[i] = *(infos + 11 + i); + } + for (int i = 0; i < 4; i++) { + bas_info.axi_map_row2cmp[i] = *(infos + 14 + i); + } + bas_info.axi_mcuv = *(infos + 18); + bas_info.axi_num_cmp = *(infos + 19); + bas_info.axi_num_cmp_mcu = *(infos + 20); + for (int i = 0; i < MAX_NUM_COLOR; i++) { + bas_info.axi_width[i] = *(infos + 21 + i); + } + int format = *(infos + 24); + bas_info.format = (xf::codec::COLOR_FORMAT)format; + for (int i = 0; i < MAX_NUM_COLOR; i++) { + bas_info.hls_mbs[i] = *(infos + 25 + i); + } + bas_info.hls_mcuc = *(infos + 28); + for (int c = 0; c < MAX_NUM_COLOR; c++) { + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + bas_info.idct_q_table_x[c][i][j] = *(infos + 29 + c * 64 + i * 8 + j); + } + } + } + for (int c = 0; c < MAX_NUM_COLOR; c++) { + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + bas_info.idct_q_table_y[c][i][j] = *(infos + 221 + c * 64 + i * 8 + j); + } + } + } + bas_info.mcu_cmp = *(infos + 413); + for (int c = 0; c < MAX_NUM_COLOR; c++) { + for (int i = 0; i < 64; i++) { + 
bas_info.min_nois_thld_x[c][i] = *(infos + 414 + c * 64 + i); + } + } + for (int c = 0; c < MAX_NUM_COLOR; c++) { + for (int i = 0; i < 64; i++) { + bas_info.min_nois_thld_y[c][i] = *(infos + 606 + c * 64 + i); + } + } + for (int c = 0; c < MAX_NUM_COLOR; c++) { + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + bas_info.q_tables[c][i][j] = *(infos + 798 + c * 64 + i * 8 + j); + } + } + } + for (int c = 0; c < MAX_NUM_COLOR; c++) { + cmp_info[c].bc = *(infos + 990 + c * 6); + cmp_info[c].bch = *(infos + 991 + c * 6); + cmp_info[c].bcv = *(infos + 992 + c * 6); + cmp_info[c].mbs = *(infos + 993 + c * 6); + cmp_info[c].sfh = *(infos + 994 + c * 6); + cmp_info[c].sfv = *(infos + 995 + c * 6); + } + + printf("test INFO: bas_info->mcu_cmp = %d \n", (int)(bas_info.mcu_cmp)); + printf("test INFO: bas_info->hls_mbs[cmp] %d, %d, %d \n", bas_info.hls_mbs[0], bas_info.hls_mbs[1], + bas_info.hls_mbs[2]); +} + +// ------------------------------------------------------------ + +int main(int argc, const char* argv[]) { + std::cout << "\n------------ Test for decode image.jpg -------------\n"; + std::string optValue; + std::string JPEGFile; + std::string xclbin_path; + + // cmd arg parser. + ArgParser parser(argc, argv); + + // Read In paths addresses + if (parser.getCmdOption("-JPEGFile", optValue)) { + JPEGFile = optValue; + std::cout << "COMMOND: host.exe -JPEGFile " << optValue << std::endl; + } else { + std::cout << "INFO: JPEG file not specified for this test. use " + "'-JPEGFile' to specified it. 
\n"; + } + + ///// declaration + + // load data to simulate the ddr data + // size of jpeg_pointer, output of yuv_mcu_pointer, and output image infos + int size; + uint8_t* jpeg_pointer; +#ifndef _HLS_TEST_ + ap_uint<64>* yuv_mcu_pointer = aligned_alloc >(sizeof(ap_uint<64>) * MAXCMP_BC * 8); + ap_uint<32>* infos = aligned_alloc >(sizeof(ap_uint<32>) * 1024); +#else + ap_uint<64>* yuv_mcu_pointer = (ap_uint<64>*)malloc(sizeof(ap_uint<64>) * MAXCMP_BC * 8); + ap_uint<32>* infos = (ap_uint<32>*)malloc(sizeof(ap_uint<32>) * 1024); +#endif + int err = load_dat(jpeg_pointer, JPEGFile, size); + if (err) { + printf("Alloc buf failed!, size:%d Bytes\n", size); + return err; + } else { + printf("Alloc buf successfully!, size:%d Bytes\n", size); + } + + // Variables to measure time + + // To test SYNTHESIS top + hls::stream > block_strm; + xf::codec::cmp_info cmp_info[MAX_NUM_COLOR]; + xf::codec::img_info img_info; + xf::codec::bas_info bas_info; + img_info.hls_cs_cmpc = 0; // init + + // 0: decode jfif successful + // 1: marker in jfif is not in expectation + int rtn = 0; + + // 0: decode huffman successful + // 1: huffman data is not in expectation + int rtn2 = false; + +#ifdef _HLS_TEST_ + uint32_t hls_mcuc; + uint16_t hls_mcuh; + uint16_t hls_mcuv; + uint8_t hls_cs_cmpc; + hls::stream > idx_coef; + hls::stream strm_iDCT_x8[8]; + + // L2 top + kernelJpegDecoder((ap_uint*)jpeg_pointer, (int)size, + //&img_info, cmp_info, &bas_info, + yuv_mcu_pointer, infos); + // strm_iDCT_x8);//idx_coef, + + rebuild_infos(img_info, cmp_info, bas_info, rtn, rtn2, infos); + // one shoot test for the IDCT + printf("INFO: bas_info.q_tables are : \n"); + for (int id = 0; id < 2; id++) { + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + printf("%d, ", (int)(bas_info.q_tables[id][i][j])); + } + printf("\n"); + } + } +#else + xf::common::utils_sw::Logger logger(std::cout, std::cerr); + + // send task requests + auto jpeg_pointer_pool = jpegDec_acc::create_bufpool(vpp::input); + 
auto yuv_mcu_pointer_pool = jpegDec_acc::create_bufpool(vpp::output); + auto infos_pool = jpegDec_acc::create_bufpool(vpp::output); + + jpegDec_acc::send_while([&]() -> bool { + uint8_t* acc_jpeg_pointer = (uint8_t*)jpegDec_acc::alloc_buf(jpeg_pointer_pool, sizeof(uint8_t) * size); + ap_uint<64>* acc_yuv_mcu_pointer = + (ap_uint<64>*)jpegDec_acc::alloc_buf(yuv_mcu_pointer_pool, sizeof(ap_uint<64>) * MAXCMP_BC * 8); + ap_uint<32>* acc_infos = (ap_uint<32>*)jpegDec_acc::alloc_buf(infos_pool, sizeof(ap_uint<32>) * 1024); + + memcpy(acc_jpeg_pointer, jpeg_pointer, sizeof(uint8_t) * size); + + jpegDec_acc::compute((ap_uint*)acc_jpeg_pointer, size, acc_yuv_mcu_pointer, acc_infos); + + return 0; + }); + + // send result receiving requests + jpegDec_acc::receive_all_in_order([&]() { + ap_uint<64>* acc_yuv_mcu_pointer = (ap_uint<64>*)jpegDec_acc::get_buf(yuv_mcu_pointer_pool); + ap_uint<32>* acc_infos = (ap_uint<32>*)jpegDec_acc::get_buf(infos_pool); + + memcpy(yuv_mcu_pointer, acc_yuv_mcu_pointer, sizeof(ap_uint<64>) * MAXCMP_BC * 8); + memcpy(infos, acc_infos, sizeof(ap_uint<32>) * 1024); + + rebuild_infos(img_info, cmp_info, bas_info, rtn, rtn2, acc_infos); + + }); + + struct timeval start_time, end_time; + gettimeofday(&start_time, 0); + jpegDec_acc::join(); + gettimeofday(&end_time, 0); + + std::cout << "INFO: Finish kernel execution" << std::endl; + std::cout << "INFO: Finish E2E execution" << std::endl; + std::cout << "-------------------------------------------------------" << std::endl; + unsigned long exec_timeE2E = diff(&end_time, &start_time); + std::cout << "INFO: Average E2E per run: " << exec_timeE2E << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + + rebuild_infos(img_info, cmp_info, bas_info, rtn, rtn2, infos); +#endif + // for image info + int hls_bc[MAX_NUM_COLOR]; + for (int i = 0; i < MAX_NUM_COLOR; i++) { + hls_bc[i] = cmp_info[i].bc; + } + + // todo merge to syn-code + + if (rtn || rtn2) { + 
printf("Warning: Decoding the bad case input file!\n"); + if (rtn == 1) { + printf("Warning: [code 1] marker in jfif is not in expectation!\n"); + } else if (rtn == 2) { + printf("ERROR: [code 2] huffman table is not in expectation!\n"); + } else { + if (rtn2) { + printf("Warning: [code 3] huffman data is not in expectation!\n"); + } + } + return 1; +#ifndef _HLS_TEST_ + logger.error(xf::common::utils_sw::Logger::Message::TEST_FAIL); + } else { + logger.info(xf::common::utils_sw::Logger::Message::TEST_PASS); +#endif + } + + printf("INFO: writing the YUV file!\n"); + rebuild_raw_yuv(JPEGFile, &bas_info, hls_bc, yuv_mcu_pointer); + + free(jpeg_pointer); + free(hls_block); + free(infos); + free(yuv_row_pointer); + + std::cout << "Ready for next image!\n "; + + return 0; +} +#endif + +// ************************************************************ diff --git a/codec/L2/demos/jpegDec_sc/host/utils.hpp b/codec/L2/demos/jpegDec_sc/host/utils.hpp new file mode 100644 index 0000000000..5c35306056 --- /dev/null +++ b/codec/L2/demos/jpegDec_sc/host/utils.hpp @@ -0,0 +1,105 @@ +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef UTILS_H +#define UTILS_H +#include +#include +#include +#include +#include +#include +// ------------------------------------------------------------ + +#if __linux +template +T* aligned_alloc(std::size_t num) { + void* ptr = nullptr; + if (posix_memalign(&ptr, 4096, num * sizeof(T))) { + throw std::bad_alloc(); + } + return reinterpret_cast(ptr); +} +#endif + +// ------------------------------------------------------------ +// Compute time difference +unsigned long diff(const struct timeval* newTime, const struct timeval* oldTime) { + return (newTime->tv_sec - oldTime->tv_sec) * 1000000 + (newTime->tv_usec - oldTime->tv_usec); +} + +// ------------------------------------------------------------ +// load the data file (.txt, .bin, .jpg ...)to ptr +template +int load_dat(T*& data, const std::string& name, int& size) { + uint64_t n; + std::string fn = name; + FILE* f = fopen(fn.c_str(), "rb"); + std::cout << "WARNING: " << fn << " will be opened for binary read." << std::endl; + if (!f) { + std::cerr << "ERROR: " << fn << " cannot be opened for binary read." 
<< std::endl; + return -1; + } + + fseek(f, 0, SEEK_END); + n = (uint64_t)ftell(f); + if (n > MAX_DEC_PIX) { + std::cout << " read n bytes > MAX_DEC_PIX, please set a larger MAX_DEC_PIX " << std::endl; + return 1; + } +#if __linux + data = aligned_alloc(n); +#else + data = (T*)malloc(MAX_DEC_PIX); +#endif + fseek(f, 0, SEEK_SET); + size = fread(data, sizeof(char), n, f); + fclose(f); + std::cout << n << " entries read from " << fn << std::endl; + + return 0; +} + +// ------------------------------------------------------------ +// get the arg +class ArgParser { + public: + ArgParser(int& argc, const char* argv[]) { + for (int i = 1; i < argc; ++i) mTokens.push_back(std::string(argv[i])); + } + bool getCmdOption(const std::string option, std::string& value) const { + std::vector::const_iterator itr; + itr = std::find(this->mTokens.begin(), this->mTokens.end(), option); + if (itr != this->mTokens.end() && ++itr != this->mTokens.end()) { + value = *itr; + return true; + } + return false; + } + bool getCmdOption(const std::string option) const { + std::vector::const_iterator itr; + itr = std::find(this->mTokens.begin(), this->mTokens.end(), option); + if (itr != this->mTokens.end()) + return true; + else + return false; + } + + private: + std::vector mTokens; +}; + +#endif \ No newline at end of file diff --git a/codec/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp b/codec/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp new file mode 100644 index 0000000000..6712266894 --- /dev/null +++ b/codec/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.cpp @@ -0,0 +1,66 @@ +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file kernelJpegDecoder.cpp + * @brief kernelJpegDecoder template function implementation and kernel_decoder wrapper. + * + * This file is part of HLS algorithm library. + */ + +#include "kernelJpegDecoder.hpp" + +void jpegDec_acc::compute(ap_uint* jpeg_pointer, + const int size, + ap_uint<64>* yuv_mcu_pointer, + ap_uint<32>* infos) { + JDK(jpeg_pointer, size, yuv_mcu_pointer, infos); +} + +// ------------------------------------------------------------ +// @brief Level 2 : kernel for jfif parser + huffman decoder + iQ_iDCT, kernelJpegDecoder abbreviated to JDK +// a.input the jpg 420/422/444 baseline file +// b.output the as the 8x8 's Column scan order YUV (0~255), like [Y*allpixels,U*0.5*allpixels, V*0.5*allpixels], and +// image infos +// c.Fault tolerance: If the picture's format is incorrect, error codes will directly end the kernel +// and wait for the input of the next image.
Error codes could help to locate the decoding stage at which the error +// occurs +// d.performance: input throughput: 150MB/s~300MB/s(1symbol/clk), output 1~1.6GB/s (max 8B/clk), +// frequency 250MHz for kernel, for only huffman core 286MHz by vivado 2018.3 + +void jpegDec_acc::JDK(ap_uint* jpeg_pointer, + const int size, + ap_uint<64>* yuv_mcu_pointer, + ap_uint<32>* infos) { + // clang-format off + //const uint64_t max_pix = MAX_NUM_PIX;//for 8K*8K + const uint64_t max_pix = MAX_DEC_PIX;//for 800*800 + const uint64_t max_yuv = MAXCMP_BC * 8;//blocknum * 8 rows + const uint64_t burst_lenth = BURST_LENTH; +#pragma HLS INTERFACE m_axi port = jpeg_pointer depth = 65000 offset = direct bundle = gmem_in0 \ + latency = 64 num_read_outstanding = 32 max_read_burst_length = 32 +#pragma HLS INTERFACE m_axi port = yuv_mcu_pointer depth = 230400 offset = direct bundle = gmem_in1 \ + latency = 64 num_write_outstanding = 32 max_write_burst_length = 32 +#pragma HLS INTERFACE m_axi port = infos depth = 1024 offset = direct bundle = gmem_in2 \ + latency = 64 num_write_outstanding = 32 max_write_burst_length = 32 +// #pragma HLS INTERFACE s_axilite port=jpeg_pointer bundle=control +// #pragma HLS INTERFACE s_axilite port=yuv_mcu_pointer bundle=control +// #pragma HLS INTERFACE s_axilite port=size bundle=control +// #pragma HLS INTERFACE s_axilite port=infos bundle=control +// #pragma HLS INTERFACE s_axilite port=return bundle=control + + xf::codec::kernelJpegDecoderTop(jpeg_pointer, size, yuv_mcu_pointer, infos); +} diff --git a/codec/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.hpp b/codec/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.hpp new file mode 100644 index 0000000000..06b5634549 --- /dev/null +++ b/codec/L2/demos/jpegDec_sc/kernel/kernelJpegDecoder.hpp @@ -0,0 +1,78 @@ +/* + * Copyright 2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file kernelJpegDecoder.hpp + * @brief kernelJpegDecoder template function implementation and kernel_decoder wrapper. + * + * This file is part of HLS algorithm library. + */ + +#ifndef _XF_CODEC_KERNEL_JPEG_DEC_SC_HPP_ +#define _XF_CODEC_KERNEL_JPEG_DEC_SC_HPP_ + +#include "XAcc_jpegdecoder.hpp" +#include "XAcc_jfifparser.hpp" +#include "XAcc_idct.hpp" + +#include "vpp_acc.hpp" + +// ------------------------------------------------------------ +/** + * @brief Level 2 : kernel for jfif parser + huffman decoder + iQ_iDCT + * + * @tparam CH_W size of data path in dataflow region, in bit. + * when CH_W is 16, the decoder could decode one symbol per cycle in about 99% cases. + * when CH_W is 8 , the decoder could decode one symbol per cycle in about 80% cases, but use less resource. + * + * @param jpeg_pointer the input jpeg to be read from DDR. + * @param size the total bytes to be read from DDR. + * @param yuv_mcu_pointer the output yuv to DDR in mcu order. 1 ap_uint<64> has 8 uint8_t pixels after idct. + * @param infos information of the image, may be used when recovering the image. + */ +// a.input the jpg 420/422/444 baseline file +// b.output the as the 8x8 's Column scan order YUV (0~255), like [Y*allpixels,U*0.5*allpixels, V*0.5*allpixels], and +// image infos +// c.Fault tolerance: If the picture's format is incorrect, error codes will directly end the kernel +// and wait for the input of the next image.
Error codes could help to locate the decoding stage at which the error +// occurs +// d.performance: input throughput: 150MB/s~300MB/s(1symbol/clk), output 1~1.6GB/s (max 8B/clk), +// frequency 250MHz for kernel, for only huffman core 286MHz by vivado 2018.3 + +class jpegDec_acc : public VPP_ACC { + // port bindings + ZERO_COPY(jpeg_pointer); + // ZERO_COPY(size); + ZERO_COPY(yuv_mcu_pointer); + ZERO_COPY(infos); + + SYS_PORT(jpeg_pointer, DDR[0]); + SYS_PORT(yuv_mcu_pointer, DDR[0]); + SYS_PORT(infos, DDR[0]); + + SYS_PORT_PFM(u50, jpeg_pointer, HBM[0]); + SYS_PORT_PFM(u50, yuv_mcu_pointer, HBM[1]); + SYS_PORT_PFM(u50, infos, HBM[2]); + + public: + static void compute(ap_uint* jpeg_pointer, + const int size, + ap_uint<64>* yuv_mcu_pointer, + ap_uint<32>* infos); + static void JDK(ap_uint* jpeg_pointer, const int size, ap_uint<64>* yuv_mcu_pointer, ap_uint<32>* infos); +}; + +#endif // _XF_CODEC_KERNEL_JPEG_DEC_SC_HPP_ \ No newline at end of file diff --git a/codec/L2/demos/jpegDec_sc/utils.mk b/codec/L2/demos/jpegDec_sc/utils.mk new file mode 100644 index 0000000000..1937b53d2b --- /dev/null +++ b/codec/L2/demos/jpegDec_sc/utils.mk @@ -0,0 +1,239 @@ +# +# Copyright 2019-2021 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# sc makefile-generator v1.0.0 +# +#+------------------------------------------------------------------------------- +# The following parameters are assigned with default values.
These parameters can +# be overridden through the make command line +#+------------------------------------------------------------------------------- + +REPORT := no +PROFILE := no +DEBUG := no + +#'estimate' for estimate report generation +#'system' for system report generation +ifneq ($(REPORT), no) +VPP_LDFLAGS += --report estimate +VPP_LDFLAGS += --report system +endif + +#Generates profile summary report +ifeq ($(PROFILE), yes) +VPP_LDFLAGS += --profile_kernel data:all:all:all +endif + +#Generates debug summary report +ifeq ($(DEBUG), yes) +VPP_LDFLAGS += --dk protocol:all:all:all +endif + +#Check environment setup +ifndef XILINX_VITIS + XILINX_VITIS = /opt/xilinx/Vitis/$(TOOL_VERSION) + export XILINX_VITIS +endif +ifndef XILINX_XRT + XILINX_XRT = /opt/xilinx/xrt + export XILINX_XRT +endif + +check_device: + @set -eu; \ + inallowlist=False; \ + inblocklist=False; \ + for dev in $(PLATFORM_ALLOWLIST); \ + do if [[ $$(echo $(PLATFORM_NAME) | grep $$dev) != "" ]]; \ + then inallowlist=True; fi; \ + done ;\ + for dev in $(PLATFORM_BLOCKLIST); \ + do if [[ $$(echo $(PLATFORM_NAME) | grep $$dev) != "" ]]; \ + then inblocklist=True; fi; \ + done ;\ + if [[ $$inallowlist == False ]]; \ + then echo "[Warning]: The device $(PLATFORM_NAME) not in allowlist."; \ + fi; \ + if [[ $$inblocklist == True ]]; \ + then echo "[ERROR]: The device $(PLATFORM_NAME) in blocklist."; exit 1;\ + fi; + +#get HOST_ARCH by PLATFORM +ifneq (,$(PLATFORM)) +HOST_ARCH_temp = $(shell platforminfo -p $(PLATFORM) | grep 'CPU Type' | sed 's/.*://' | sed '/ai_engine/d' | sed 's/^[[:space:]]*//') +ifeq ($(HOST_ARCH_temp), x86) +HOST_ARCH := x86 +else ifeq ($(HOST_ARCH_temp), cortex-a9) +HOST_ARCH := aarch32 +else ifneq (,$(findstring cortex-a, $(HOST_ARCH_temp))) +HOST_ARCH := aarch64 +endif +endif + +#Checks for Device Family +ifeq ($(HOST_ARCH), aarch32) + DEV_FAM = 7Series +else ifeq ($(HOST_ARCH), aarch64) + DEV_FAM = Ultrascale +endif + +#Checks for Correct architecture +ifneq ($(HOST_ARCH), 
$(filter $(HOST_ARCH),aarch64 aarch32 x86)) +$(error HOST_ARCH variable not set, please set correctly and rerun) +endif + +check_version: +ifneq (, $(shell which git)) +ifneq (,$(wildcard $(XFLIB_DIR)/.git)) + @cd $(XFLIB_DIR) && git log --graph --pretty=format:'%Cred%h%Creset -%C(yellow)%d%Creset %s %Cgreen(%cr) %C(bold blue)<%an>%Creset' --abbrev-commit -n 1 && cd - +endif +endif + +#Checks for SYSROOT +check_sysroot: +ifneq ($(HOST_ARCH), x86) +ifndef SYSROOT + $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif + +#Checks for g++ +CXX := g++ +CXX_REQ := $(shell echo $(GCC_INTOOL) | cut -f 1 -d ".") +ifeq ($(HOST_ARCH), x86) +ifneq ($(shell expr $(shell echo "__GNUG__" | g++ -E -x c++ - | tail -1) \>= $(CXX_REQ)), 1) +ifndef XILINX_VIVADO +$(error [ERROR]: g++ version too old. Please use $(CXX_REQ) or above) +else +CXX := $(XILINX_VIVADO)/tps/lnx64/gcc-$(GCC_INTOOL)/bin/g++ +ifeq ($(LD_LIBRARY_PATH),) +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(GCC_INTOOL)/lib64 +else +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(GCC_INTOOL)/lib64:$(LD_LIBRARY_PATH) +endif +$(warning [WARNING]: g++ version too old. 
Using g++ provided by the tool: $(CXX)) +endif +endif +else ifeq ($(HOST_ARCH), aarch64) +CXX := $(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++ +else ifeq ($(HOST_ARCH), aarch32) +CXX := $(XILINX_VITIS)/gnu/aarch32/lin/gcc-arm-linux-gnueabi/bin/arm-linux-gnueabihf-g++ +endif + +#check binutils +BINUTILS := $(shell ld -v | cut -f 4 -d " " | cut -f 1 -d "-") +BINUTILS_REQ := $(BINUTILS_INTOOL) +ifneq ($(shell expr $(BINUTILS) \>= $(BINUTILS_REQ)), 1) +export PATH := $(XILINX_VIVADO)/tps/lnx64/binutils-$(BINUTILS_INTOOL)/bin:$(PATH) +endif + +#Setting VPP +VPP := v++ + +#Cheks for aiecompiler +AIECXX := aiecompiler +AIESIMULATOR := aiesimulator +X86SIMULATOR := x86simulator + +.PHONY: check_vivado +check_vivado: +ifeq (,$(wildcard $(XILINX_VIVADO)/bin/vivado)) + @echo "Cannot locate Vivado installation. Please set XILINX_VIVADO variable." && false +endif + +.PHONY: check_vpp +check_vpp: +ifeq (,$(wildcard $(XILINX_VITIS)/bin/v++)) + @echo "Cannot locate Vitis installation. Please set XILINX_VITIS variable." && false +endif + +.PHONY: check_xrt +check_xrt: +ifeq (,$(wildcard $(XILINX_XRT)/lib/libxilinxopencl.so)) + @echo "Cannot locate XRT installation. Please set XILINX_XRT variable." && false +endif + +export PATH := $(XILINX_VITIS)/bin:$(XILINX_XRT)/bin:$(PATH) +ifeq ($(HOST_ARCH), x86) +ifeq (,$(LD_LIBRARY_PATH)) +LD_LIBRARY_PATH := $(XILINX_XRT)/lib +else +LD_LIBRARY_PATH := $(XILINX_XRT)/lib:$(LD_LIBRARY_PATH) +endif +endif + +ifneq (,$(wildcard $(PLATFORM))) +# Use PLATFORM as a file path +XPLATFORM := $(PLATFORM) +else +# Use PLATFORM as a file name pattern +# 1. 
search paths specified by variable +ifneq (,$(PLATFORM_REPO_PATHS)) +# 1.1 as exact name +XPLATFORM := $(strip $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/$(PLATFORM)/$(PLATFORM).xpfm))) +# 1.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/*/*.xpfm)) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 1.2 +endif # 1 +# 2. search Vitis installation +ifeq (,$(XPLATFORM)) +# 2.1 as exact name +XPLATFORM := $(strip $(wildcard $(XILINX_VITIS)/platforms/$(PLATFORM)/$(PLATFORM).xpfm)) +# 2.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(wildcard $(XILINX_VITIS)/platforms/*/*.xpfm) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 2.2 +endif # 2 +# 3. search default locations +ifeq (,$(XPLATFORM)) +# 3.1 as exact name +XPLATFORM := $(strip $(wildcard /opt/xilinx/platforms/$(PLATFORM)/$(PLATFORM).xpfm)) +# 3.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(wildcard /opt/xilinx/platforms/*/*.xpfm) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 3.2 +endif # 3 +endif + +define MSG_PLATFORM +No platform matched pattern '$(PLATFORM)'. +Available platforms are: $(XPLATFORMS) +To add more platform directories, set the PLATFORM_REPO_PATHS variable or point PLATFORM variable to the full path of platform .xpfm file. 
+endef +export MSG_PLATFORM + + +.PHONY: check_platform +check_platform: +ifeq (,$(XPLATFORM)) + @echo "$${MSG_PLATFORM}" && false +endif +#Check ends + +# PLATFORM_NAME - filesystem friendly name derived from the platform: +# the base name of $(PLATFORM) with any .xpfm suffix removed +PLATFORM_NAME = $(strip $(patsubst %.xpfm, % , $(shell basename $(PLATFORM)))) + + +# Cleaning stuff +RM = rm -f +RMDIR = rm -rf + +MV = mv -f +CP = cp -rf +ECHO:= @echo diff --git a/codec/L2/demos/jxlEnc/README.md b/codec/L2/demos/jxlEnc/README.md new file mode 100644 index 0000000000..74be710ecd --- /dev/null +++ b/codec/L2/demos/jxlEnc/README.md @@ -0,0 +1,125 @@ +JXL Encoder +=============== + +The JXL Encoder example resides in the ``L2/demos/jxlEnc`` directory. The tutorial provides a step-by-step guide that covers the commands for building and running the kernel. + +Executable Usage +---------------- + +* **Work Directory(Step 1)** + +The steps for library download and environment setup can be found [here](https://github.com/Xilinx/Vitis_Libraries/tree/master/codec/L2/demos#building). To get the design, + +``` + cd L2/demos/jxlEnc +``` + +* **Build kernel(Step 2)** + +Run the following make command to build your XCLBIN and host binary targeting a specific device. Please note that this process takes a long time, possibly a couple of hours. + +``` + make run TARGET=hw DEVICE=xilinx_u50_gen3x16_xdma_201920_3 +``` + +* **Run kernel(Step 3)** + +To get the benchmark results, please run the following command.
+ +``` + ./build_dir.hw.xilinx_u50_gen3x16_xdma_201920_3/host.exe --xclbin ./build_dir.hw.xilinx_u50_gen3x16_xdma_201920_3/jxlEnc.xclbin PNGFilePath JXLFilePath +``` + +JXL Encoder Input Arguments: + +``` + Usage: host.exe --xclbin XclbinFilePath PNGFilePath JXLFilePath + --xclbin: the path to the kernel *.xclbin file + PNGFilePath: the path to the input *.PNG + JXLFilePath: the path to the output *.jxl +``` + +Note: Default arguments are set in the Makefile; you can also use the other [pictures](https://github.com/Xilinx/Vitis_Libraries/tree/master/codec/L2/demos#pictures) listed in the table. + +* **Example output(Step 4)** + +``` + Found Platform + Platform Name: Xilinx + Info: Context created + Info: Command queue created + INFO: Found Device=xilinx_u50_gen3x16_xdma_201920_3 + INFO: Importing build_dir.sw_emu.xilinx_u50_gen3x16_xdma_201920_3/jxlEnc.xclbin + Loading: 'build_dir.sw_emu.xilinx_u50_gen3x16_xdma_201920_3/jxlEnc.xclbin' + Info: Program created + Info: Kernel created + INFO: kernel has been created + INFO: Kernel Start + INFO: Finish kernel execution + INFO: Finish E2E execution + ... + + INFO: Finish kernel execution + INFO: Finish E2E execution + INFO: Data transfer from host to device: 100 us + INFO: Data transfer from device to host: 20 us + INFO: kernel execution time: 600 ms +``` + +Profiling +--------- + +The hardware resource utilization is listed in the following table. +Different tool versions may result in slightly different resource usage.
+ + +##### Table 1 IP resources for JXL encoder + +| IP | BRAM | URAM | DSP | FF | LUT | +|------------------------|----------|----------|----------|----------|---------| +| lossy_enc_compute | 364 | 53 | 498 | 145111 | 121741 | +| cluster_histogram | 70 | 28 | 51 | 60744 | 38507 | +| tokInit_histogram | 150 | 41 | 95 | 64710 | 39289 | + + +##### Table 2 JXL Encoder Performance + +###### lossy_enc_compute +| Image | Size | Time(ms) | Throughput(MP/s) | +|-------------------|---------------|------------|--------------------| +| lena_c_512.png | 512x512 | 3.63 | 72.21 | +| hq_1024x1024.png | 1024x1024 | 13.06 | 80.29 | +| hq_2Kx2K.png | 2048x2048 | 50.33 | 83.34 | + +###### cluster_histogram +| Image | Size | Time(ms) | Throughput(MP/s) | +|-------------------|---------------|------------|--------------------| +| lena_c_512.png | 512x512 | 4.6 | 56.98 | +| hq_1024x1024.png | 1024x1024 | 14.6 | 71.82 | +| hq_2Kx2K.png | 2048x2048 | 41.13 | 101.97 | + +###### tokInit_histogram +| Image | Size | Time(ms) | Throughput(MP/s) | +|-------------------|---------------|-------------|--------------------| +| lena_c_512.png | 512x512 | 6.07 | 43.19 | +| hq_1024x1024.png | 1024x1024 | 18.03 | 58.16 | +| hq_2Kx2K.png | 2048x2048 | 79.30 | 52.89 | + +## License + +Licensed using the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). + + Copyright 2022 Xilinx, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + Copyright 2022 Xilinx, Inc. 
\ No newline at end of file diff --git a/codec/L2/demos/jxlEnc/acc_cluster_histogram/Makefile b/codec/L2/demos/jxlEnc/acc_cluster_histogram/Makefile new file mode 100644 index 0000000000..3d7f53ad6e --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_cluster_histogram/Makefile @@ -0,0 +1,331 @@ +# Copyright 2019-2022 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# vitis makefile-generator v2.0.6 + +############################## Help Section ############################## +.PHONY: help + +help:: + $(ECHO) "Makefile Usage:" + $(ECHO) " make all TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to generate the design for specified Target and Shell." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " make run TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to run application in emulation." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH required for SoC shells" + $(ECHO) "" + $(ECHO) " make xclbin TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to build xclbin application." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " make host TARGET=" + $(ECHO) " Command to build host application." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. 
For example," + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) "" + $(ECHO) " make clean " + $(ECHO) " Command to remove the generated non-hardware files." + $(ECHO) "" + $(ECHO) " make cleanall" + $(ECHO) " Command to remove all the generated files." + $(ECHO) "" + +############################## Setting up Project Variables ############################## + +MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +XF_PROJ_ROOT ?= $(shell bash -c 'export MK_PATH=$(MK_PATH); echo $${MK_PATH%/L2/*}') +CUR_DIR := $(patsubst %/,%,$(dir $(MK_PATH))) +XFLIB_DIR = $(XF_PROJ_ROOT) + +# setting devault value +TARGET ?= sw_emu +HOST_ARCH ?= x86 + +#setting PLATFORM +ifeq ($(PLATFORM),) +PLATFORM := $(DEVICE) +endif +ifeq ($(PLATFORM),) +PLATFORM := xilinx_u50_gen3x16_xdma_5_202210_1 +endif + +# #################### Checking if PLATFORM in whitelist ############################ +PLATFORM_ALLOWLIST += u50 +PLATFORM_BLOCKLIST += zc + +include ./utils.mk +TEMP_DIR := _x_temp.$(TARGET).$(PLATFORM_NAME) +TEMP_REPORT_DIR := $(CUR_DIR)/reports/_x.$(TARGET).$(PLATFORM_NAME) +BUILD_DIR := build_dir.$(TARGET).$(PLATFORM_NAME) +ifneq ($(RESULT_DIR),) +BUILD_DIR = $(RESULT_DIR) +endif +BUILD_REPORT_DIR := $(CUR_DIR)/reports/_build.$(TARGET).$(PLATFORM_NAME) +EMCONFIG := $(BUILD_DIR)/emconfig.json +XCLBIN_DIR := $(CUR_DIR)/$(BUILD_DIR) +export XCL_BINDIR = $(XCLBIN_DIR) + +EXE_FILE_DEPS := +BINARY_CONTAINERS_DEPS := +RUN_DEPS := + +# get global setting +ifeq ($(HOST_ARCH), x86) +CXXFLAGS += -fmessage-length=0 -I$(CUR_DIR)/src/ -I$(XILINX_XRT)/include -I$(XILINX_HLS)/include -std=c++14 -O3 -Wall -Wno-unknown-pragmas -Wno-unused-label +LDFLAGS += -pthread -L$(XILINX_XRT)/lib -L$(XILINX_HLS)/lnx64/tools/fpo_v7_0 -Wl,--as-needed -lOpenCL -lxrt_coreutil -lgmp -lmpfr -lIp_floating_point_v7_0_bitacc_cmodel +VPP_FLAGS += -t $(TARGET) --platform $(XPLATFORM) --save-temps +VPP_LDFLAGS += --optimize 2 -R 2 +else ifeq 
($(HOST_ARCH), aarch64) +CXXFLAGS += -I$(CUR_DIR)/src/ -fmessage-length=0 --sysroot=$(SYSROOT) -I$(SYSROOT)/usr/include/xrt -I$(XILINX_HLS)/include -std=c++14 -O3 -Wall -Wno-unknown-pragmas -Wno-unused-label +LDFLAGS += -pthread -L$(SYSROOT)/usr/lib -L$(XILINX_VITIS_AIETOOLS)/lib/aarch64.o -Wl,--as-needed -lxilinxopencl -lxrt_coreutil +VPP_FLAGS += -t $(TARGET) --platform $(XPLATFORM) --save-temps +VPP_LDFLAGS += --optimize 2 -R 2 +endif +CXXFLAGS += $(EXTRA_CXXFLAGS) +VPP_FLAGS += $(EXTRA_VPP_FLAGS) + +########################## Setting up Host Variables ########################## +ifeq ($(TARGET),sw_emu) +CXXFLAGS += -D SW_EMU_TEST +endif +ifeq ($(TARGET),hw_emu) +CXXFLAGS += -D HW_EMU_TEST +endif + +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + +#Include Required Host Source Files +HOST_SRCS += $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cjxl.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cjxl_main.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cmdline.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/codec_config.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/speed_stats.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cpu/cpu.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cpu/os_specific.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/box/box.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/time.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/memory.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references_hq.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/brotli_bit_stream.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/block_splitter.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/metablock.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment.c 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment_two_pass.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encoder_dict.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/utf8_util.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/decode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/static_dict.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/literal_cost.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/entropy_encode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/bit_cost.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/cluster.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/dictionary_hash.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/histogram.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/bit_reader.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/huffman.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/state.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/transform.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmslut.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsnamed.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspack.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscnvrt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio1.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgmt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsopt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsalpha.cpp 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmstypes.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsintrp.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgamma.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscam02.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscgats.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmshalf.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsmtrx.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsps2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssamp.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssm.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsxform.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio0.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsplugin.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmserr.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspcs.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmswtpnt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsvirt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lodepng/lodepng.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/aligned_allocator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/targets.cc $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_ac_strategy.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_adaptive_quantization.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_cache.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_group.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_chroma_from_luma.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_init_histogram.cpp 
$(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_host.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase1.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase3.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/acc_cluster_histogram/host/host_cluster_histogram.cpp $(XFLIB_DIR)/ext/xcl2/xcl2.cpp +CXXFLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/ext/xcl2 -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/ -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/build/lib/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lodepng -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_cluster_histogram/kernel -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_cluster_histogram/host -I $(XFLIB_DIR)/L2/demos/jxlEnc/others/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram +CXXFLAGS += -O3 + +EXE_NAME := host.exe +EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) +EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) + +HOST_ARGS := --xclbin $(BUILD_DIR)/jxlEnc.xclbin $(XFLIB_DIR)/L2/demos/jxlEnc/images/t0.png t0.jxl +ifneq ($(HOST_ARCH), x86) +PKG_HOST_ARGS = $(foreach args,$(HOST_ARGS),$(subst $(dir $(patsubst %/,%,$(args))),,$(args))) +endif + +########################## Kernel compiler global settings ########################## +ifneq (,$(shell echo $(XPLATFORM) | awk '/u50/')) +VPP_FLAGS += --config $(CUR_DIR)/conn_u50.cfg +VPP_FLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/L2/include/hw/jxlEnc + +else +VPP_FLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/L2/include/hw/jxlEnc + +endif + +######################### binary container global settings 
########################## +VPP_FLAGS_JxlEnc_ans_clusterHistogram += -D KERNEL_NAME=JxlEnc_ans_clusterHistogram +VPP_FLAGS_JxlEnc_ans_clusterHistogram += --hls.clock 300000000:JxlEnc_ans_clusterHistogram +ifneq ($(HOST_ARCH), x86) +VPP_LDFLAGS_jxlEnc += --clock.defaultFreqHz 300000000 +else +VPP_LDFLAGS_jxlEnc += --kernel_frequency 300 +endif + +ifeq ($(HOST_ARCH), x86) +BINARY_CONTAINERS += $(BUILD_DIR)/jxlEnc.xclbin +else +BINARY_CONTAINERS += $(BUILD_DIR)/jxlEnc_pkg.$(LINK_TARGET_FMT) +BINARY_CONTAINERS_PKG += $(BUILD_DIR)/jxlEnc.xclbin +endif + +# ################ Setting Rules for Binary Containers (Building Kernels) ################ +$(TEMP_DIR)/JxlEnc_ans_clusterHistogram.xo: $(XFLIB_DIR)/L2/demos/jxlEnc/acc_cluster_histogram/kernel/hls_cluster_histogram.cpp + $(ECHO) "Compiling Kernel: JxlEnc_ans_clusterHistogram" + mkdir -p $(TEMP_DIR) + $(VPP) -c $(VPP_FLAGS_JxlEnc_ans_clusterHistogram) $(VPP_FLAGS) -k JxlEnc_ans_clusterHistogram -I'$(> $(RUN_SCRIPT) +ifneq ($(filter sw_emu hw_emu, $(TARGET)),) + @echo 'export XCL_EMULATION_MODE=$(TARGET)' >> $(RUN_SCRIPT) +endif + @echo 'export XILINX_VITIS=/mnt' >> $(RUN_SCRIPT) + @echo 'export XILINX_XRT=/usr' >> $(RUN_SCRIPT) + @echo 'if [ -f platform_desc.txt ]; then' >> $(RUN_SCRIPT) + @echo ' cp platform_desc.txt /etc/xocl.txt' >> $(RUN_SCRIPT) + @echo 'fi' >> $(RUN_SCRIPT) + @echo './$(EXE_NAME) $(PKG_HOST_ARGS)' >> $(RUN_SCRIPT) + @echo 'return_code=$$?' 
>> $(RUN_SCRIPT) + @echo 'if [ $$return_code -ne 0 ]; then' >> $(RUN_SCRIPT) + @echo ' echo "ERROR: Embedded host run failed, RC=$$return_code"' >> $(RUN_SCRIPT) + @echo 'else' >> $(RUN_SCRIPT) + @echo ' echo "INFO: TEST PASSED, RC=0"' >> $(RUN_SCRIPT) + @echo 'fi' >> $(RUN_SCRIPT) + @echo 'echo "INFO: Embedded host run completed."' >> $(RUN_SCRIPT) + @echo 'exit $$return_code' >> $(RUN_SCRIPT) +DATA_FILE := +DATA_DIR := +SD_FILES += $(RUN_SCRIPT) +SD_FILES += $(EXE_FILE) +SD_FILES += $(EMCONFIG) +SD_FILES += xrt.ini +SD_FILES += $(DATA_FILE)# where define DATAFILE in json +SD_FILES_WITH_PREFIX = $(foreach sd_file,$(SD_FILES), $(if $(filter $(sd_file),$(wildcard $(sd_file))), --package.sd_file $(sd_file))) +SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) +PACKAGE_FILES := $(BINARY_CONTAINERS) +PACKAGE_FILES += $(AIE_CONTAINER) +SD_CARD := $(CUR_DIR)/package_$(TARGET) +vck190_dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) + @echo "Generating sd_card folder...." + mkdir -p $(SD_CARD) + chmod a+rx $(BUILD_DIR)/run_script.sh +ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(TARGET),hw) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + @echo "### ***** sd_card generation done! ***** ###" +vck190_dfx_hw := true +endif +endif +ifeq ($(vck190_dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) + @echo "### ***** sd_card generation done! 
***** ###" +endif + +.PHONY: sd_card +sd_card: $(SD_CARD) +endif +############################## Setting Essential Checks and Building Rules ############################## +RUN_DEPS += $(EXE_FILE) $(BINARY_CONTAINERS) $(EMCONFIG) +RUN_DEPS += $(SD_CARD) + +.PHONY: mkflag all run +mkflag: + mkdir -p $(BUILD_DIR) + rm -rf $(BUILD_DIR)/makefile_args.txt + @for var in $(MAKEFLAGS); do echo $$var >> $(BUILD_DIR)/makefile_args.txt; done +all: check_device check_vpp check_platform mkflag $(RUN_DEPS) +run: all +#hw_emu +ifneq (,$(filter hw_emu, $(TARGET))) +ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + XCL_EMULATION_MODE=$(TARGET) $(EXE_FILE) $(HOST_ARGS) + ./check.sh +else + @echo $(RUN_DEPS) + $(SD_CARD)/launch_$(TARGET).sh -no-reboot -run-app $(notdir $(RUN_SCRIPT)) + grep "TEST PASSED, RC=0" $(SD_CARD)/qemu_output.log || exit 1 + ./check.sh +endif +endif +#sw_emu +ifneq (,$(filter sw_emu, $(TARGET))) +ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + XCL_EMULATION_MODE=$(TARGET) $(EXE_FILE) $(HOST_ARGS) + ./check.sh +else + @echo $(RUN_DEPS) + $(SD_CARD)/launch_$(TARGET).sh -no-reboot -run-app $(notdir $(RUN_SCRIPT)) + grep "TEST PASSED, RC=0" $(SD_CARD)/qemu_output.log || exit 1 + ./check.sh +endif +endif +#hw +ifeq ($(TARGET), hw) +ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) +ifneq ($(JENKINS_INTERNAL_BUILD), 1) + $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" +else + $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(HOST_ARGS) + ./check.sh +endif +else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(HOST_ARGS) + ./check.sh +else + $(ECHO) "Please copy the content of sd_card folder and data to an SD Card and run on the board" 
+endif +endif + +############################## Setting Targets ############################## + +.PHONY: clean cleanall emconfig +emconfig: $(EMCONFIG) + +.PHONY: host +ifeq ($(HOST_ARCH), x86) +host: check_xrt $(EXE_FILE) +else +host: check_sysroot $(EXE_FILE) +endif + +.PHONY: xclbin +ifeq ($(HOST_ARCH), x86) +xclbin: check_vpp check_xrt $(BINARY_CONTAINERS) +else +xclbin: check_vpp check_sysroot $(BINARY_CONTAINERS) +endif + +############################## Cleaning Rules ############################## +cleanh: + -$(RMDIR) $(EXE_FILE) vitis_* TempConfig system_estimate.xtxt *.rpt .run/ + -$(RMDIR) src/*.ll _xocc_* .Xil dltmp* xmltmp* *.log *.jou *.wcfg *.wdb sample_link.ini sample_compile.ini obj* bin* *.csv *.jpg *.jpeg *.png + +cleank: + -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation. sh *.xclbin + -$(RMDIR) _x_temp.* + +cleanall: cleanh cleank + -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str + -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) + +clean: cleanh \ No newline at end of file diff --git a/codec/L2/demos/jxlEnc/acc_cluster_histogram/check.sh b/codec/L2/demos/jxlEnc/acc_cluster_histogram/check.sh new file mode 100755 index 0000000000..2328296af1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_cluster_histogram/check.sh @@ -0,0 +1 @@ +echo "3ba213afa0ed5f639877f96990ebd51a t0.jxl" | md5sum -c - diff --git a/codec/L2/demos/jxlEnc/acc_cluster_histogram/conn_u50.cfg b/codec/L2/demos/jxlEnc/acc_cluster_histogram/conn_u50.cfg new file mode 100644 index 0000000000..72c25fe004 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_cluster_histogram/conn_u50.cfg @@ -0,0 +1,14 @@ +[hls] +#pre_tcl=hls_pre.tcl + +[connectivity] +sp=JxlEnc_ans_clusterHistogram_1.m_axi_histogram_gmem:HBM[0] 
+sp=JxlEnc_ans_clusterHistogram_1.m_axi_histocnt_gmem:HBM[1] +sp=JxlEnc_ans_clusterHistogram_1.m_axi_histosize_gmem:HBM[2] +sp=JxlEnc_ans_clusterHistogram_1.m_axi_nonempty_gmem:HBM[3] +sp=JxlEnc_ans_clusterHistogram_1.m_axi_ctx_gmem:HBM[4] +sp=JxlEnc_ans_clusterHistogram_1.m_axi_histo_clusd_gmem:HBM[5] +sp=JxlEnc_ans_clusterHistogram_1.m_axi_histosize_clusd_gmem:HBM[6] +sp=JxlEnc_ans_clusterHistogram_1.m_axi_histo_clusdin_gmem:HBM[7] +#slr=hls_ANSclusterHistogram_1:SLR1 + diff --git a/codec/L2/demos/jxlEnc/acc_cluster_histogram/description.json b/codec/L2/demos/jxlEnc/acc_cluster_histogram/description.json new file mode 100644 index 0000000000..88ca52a561 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_cluster_histogram/description.json @@ -0,0 +1,330 @@ +{ + "gui": false, + "name": "JXL ACC_CLUSTER Demo", + "description": "This example is based on Google's PIK, which was chosen as the base framework for JPEG XL. The pikEnc is based on the 'fast mode' of PIK which can provide better encoding efficiency than most of other still image encoding methods. The pikEnc is based on Xilinx HLS design methodology and optimized for FPGA architecture. 
It can provide higher throughput and lower latency compared to software-based solutions", + "flow": "vitis", + "platform_allowlist": [ + "u50" + ], + "platform_blocklist": [ + "zc" + ], + "platform_properties": { + "u50": { + "v++": { + "compiler": { + "clflags": [ + "--config PROJECT/conn_u50.cfg" + ] + } + } + } + }, + "data": [ + "./data" + ], + "launch": [ + { + "cmd_args": " --xclbin BUILD/jxlEnc.xclbin LIB_DIR/L2/demos/jxlEnc/images/t0.png t0.jxl", + "name": "generic launch for all flows" + } + ], + "post_launch": [ + { + "launch_cmd": [ + "./check.sh" + ] + } + ], + "host": { + "host_exe": "host.exe", + "compiler": { + "sources": [ + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cjxl.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cjxl_main.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cmdline.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/codec_config.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/speed_stats.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cpu/cpu.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cpu/os_specific.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/box/box.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/time.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc", +
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/memory.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references_hq.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/brotli_bit_stream.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/block_splitter.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/metablock.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment_two_pass.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references.c", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encoder_dict.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/utf8_util.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/decode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/static_dict.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/literal_cost.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/entropy_encode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/bit_cost.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/cluster.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/dictionary_hash.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/histogram.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/bit_reader.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/huffman.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/state.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/transform.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmslut.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsnamed.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspack.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscnvrt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio1.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgmt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsopt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsalpha.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmstypes.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsintrp.cpp", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgamma.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscam02.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscgats.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmshalf.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsmtrx.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsps2.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssamp.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssm.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsxform.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio0.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsplugin.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmserr.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspcs.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmswtpnt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsvirt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lodepng/lodepng.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/aligned_allocator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/targets.cc", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_ac_strategy.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_adaptive_quantization.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_cache.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_group.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_chroma_from_luma.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_init_histogram.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_host.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase1.cpp", + 
"LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase2.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase3.cpp", + "LIB_DIR/L2/demos/jxlEnc/acc_cluster_histogram/host/host_cluster_histogram.cpp", + "LIB_DIR/ext/xcl2/xcl2.cpp" + ], + "includepaths": [ + "LIB_DIR/../utils/L1/include/", + "LIB_DIR/ext/xcl2", + "LIB_DIR/L2/demos/jxlEnc/third_partys/", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/build/lib/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lodepng", + "LIB_DIR/L2/demos/jxlEnc/acc_cluster_histogram/kernel", + "LIB_DIR/L2/demos/jxlEnc/acc_cluster_histogram/host", + "LIB_DIR/L2/demos/jxlEnc/others/include", + "LIB_DIR/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram" + ], + "options": "-O3 " + } + }, + "v++": { + "compiler": { + "includepaths": [ + "LIB_DIR/../utils/L1/include/", + "LIB_DIR/L2/include/hw/jxlEnc" + ] + } + }, + "containers": [ + { + "name": "jxlEnc", + "accelerators": [ + { + "location": "LIB_DIR/L2/demos/jxlEnc/acc_cluster_histogram/kernel/hls_cluster_histogram.cpp", + "frequency": 300.0, + "clflags": " -D KERNEL_NAME=JxlEnc_ans_clusterHistogram", + "name": "JxlEnc_ans_clusterHistogram", + "num_compute_units": 1, + "compute_units": [ + { + "name": "JxlEnc_ans_clusterHistogram", + "arguments": [ + { + "name": "gmem0_0", + "memory": "DDR[0]" + }, + { + "name": "gmem0_1", + "memory": "DDR[0]" + }, + { + "name": "gmem1_0", + "memory": "DDR[1]" + }, + { + "name": "gmem1_1", + "memory": "DDR[1]" + }, + { + "name": "gmem1_2", + "memory": "DDR[1]" + } + ] + } + ] + } + ], + "frequency": 300 + } + ], + "testinfo": { + "disable": false, + "jobs": [ + { + "index": 0, + "dependency": [], + "env": "", + "cmd": "", + "max_memory_MB": { + 
"vitis_hw_build": 81920, + "vitis_hw_emu": 40960, + "vitis_sw_emu": 10240, + "vitis_hw_run": 10240 + }, + "max_time_min": { + "vitis_hw_build": 3200, + "vitis_hw_emu": 1600, + "vitis_sw_emu": 120, + "vitis_hw_run": 10 + } + } + ], + "targets": [ + "vitis_sw_emu", + "vitis_hw_emu", + "vitis_hw" + ], + "category": "canary" + } +} diff --git a/codec/L2/demos/jxlEnc/acc_cluster_histogram/host/host_cluster_histogram.cpp b/codec/L2/demos/jxlEnc/acc_cluster_histogram/host/host_cluster_histogram.cpp new file mode 100644 index 0000000000..22f6dc963a --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_cluster_histogram/host/host_cluster_histogram.cpp @@ -0,0 +1,689 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HOST_CLUSTER_HISTOGRAM_CPP +#define HOST_CLUSTER_HISTOGRAM_CPP + +#include +#include + +#include "xcl2.hpp" +#include "xf_utils_sw/logger.hpp" + +#define XCL_BANK(n) (((unsigned int)(n)) | XCL_MEM_TOPOLOGY) + +#define XCL_BANK0 XCL_BANK(0) +#define XCL_BANK1 XCL_BANK(1) +#define XCL_BANK2 XCL_BANK(2) +#define XCL_BANK3 XCL_BANK(3) +#define XCL_BANK4 XCL_BANK(4) +#define XCL_BANK5 XCL_BANK(5) +#define XCL_BANK6 XCL_BANK(6) +#define XCL_BANK7 XCL_BANK(7) +#define XCL_BANK8 XCL_BANK(8) +#define XCL_BANK9 XCL_BANK(9) +#define XCL_BANK10 XCL_BANK(10) +#define XCL_BANK11 XCL_BANK(11) +#define XCL_BANK12 XCL_BANK(12) +#define XCL_BANK13 XCL_BANK(13) +#define XCL_BANK14 XCL_BANK(14) +#define XCL_BANK15 XCL_BANK(15) +#define XCL_BANK16 XCL_BANK(16) +#define XCL_BANK17 XCL_BANK(17) +#define XCL_BANK18 XCL_BANK(18) +#define XCL_BANK19 XCL_BANK(19) +#define XCL_BANK20 XCL_BANK(20) +#define XCL_BANK21 XCL_BANK(21) +#define XCL_BANK22 XCL_BANK(22) +#define XCL_BANK23 XCL_BANK(23) +#define XCL_BANK24 XCL_BANK(24) +#define XCL_BANK25 XCL_BANK(25) +#define XCL_BANK26 XCL_BANK(26) +#define XCL_BANK27 XCL_BANK(27) +#define XCL_BANK28 XCL_BANK(28) +#define XCL_BANK29 XCL_BANK(29) +#define XCL_BANK30 XCL_BANK(30) +#define XCL_BANK31 XCL_BANK(31) +#define XCL_BANK32 XCL_BANK(32) +#define XCL_BANK33 XCL_BANK(33) + +unsigned long diff(const struct timeval* newTime, const struct timeval* oldTime) { + return (newTime->tv_sec - oldTime->tv_sec) * 1000000 + (newTime->tv_usec - oldTime->tv_usec); +} + +template +T* aligned_alloc(std::size_t num) { + void* ptr = NULL; + if (posix_memalign(&ptr, 4096, num * sizeof(T))) throw std::bad_alloc(); + return reinterpret_cast(ptr); +} + +void hls_ANSclusterHistogram_wrapper(std::string xclbinPath, + uint32_t* config, + //==================== + int32_t* histograms0_ptr, + uint32_t* histo_totalcnt0_ptr, + uint32_t* histo_size0_ptr, + uint32_t* nonempty_histo0_ptr, + uint8_t* ctx_map0_ptr, + int32_t* histograms_clusd0_ptr, + uint32_t* 
histo_size_clusd0_ptr, + int32_t* histograms_clusdin0_ptr, + //==================== + int32_t* histograms1_ptr, + uint32_t* histo_totalcnt1_ptr, + uint32_t* histo_size1_ptr, + uint32_t* nonempty_histo1_ptr, + uint8_t* ctx_map1_ptr, + int32_t* histograms_clusd1_ptr, + uint32_t* histo_size_clusd1_ptr, + int32_t* histograms_clusdin1_ptr, + //====================== + int32_t* histograms2_ptr, + uint32_t* histo_totalcnt2_ptr, + uint32_t* histo_size2_ptr, + uint32_t* nonempty_histo2_ptr, + uint8_t* ctx_map2_ptr, + int32_t* histograms_clusd2_ptr, + uint32_t* histo_size_clusd2_ptr, + int32_t* histograms_clusdin2_ptr, + //====================== + int32_t* histograms3_ptr, + uint32_t* histo_totalcnt3_ptr, + uint32_t* histo_size3_ptr, + uint32_t* nonempty_histo3_ptr, + uint8_t* ctx_map3_ptr, + int32_t* histograms_clusd3_ptr, + uint32_t* histo_size_clusd3_ptr, + int32_t* histograms_clusdin3_ptr, + //====================== + int32_t* histograms4_ptr, + uint32_t* histo_totalcnt4_ptr, + uint32_t* histo_size4_ptr, + uint32_t* nonempty_histo4_ptr, + uint8_t* ctx_map4_ptr, + int32_t* histograms_clusd4_ptr, + uint32_t* histo_size_clusd4_ptr, + int32_t* histograms_clusdin4_ptr) { + printf("[HOST] size= %d\n", config[6]); + + xf::common::utils_sw::Logger logger(std::cout, std::cerr); + cl_int fail; + + struct timeval start_time; // End to end time clock start + gettimeofday(&start_time, 0); + + // platform related operations + std::vector devices = xcl::get_xil_devices(); + cl::Device device = devices[0]; + + // Creating Context and Command Queue for selected Device + cl::Context context(device, NULL, NULL, NULL, &fail); + logger.logCreateContext(fail); + cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &fail); + logger.logCreateCommandQueue(fail); + std::string devName = device.getInfo(); + printf("INFO: Found Device=%s\n", devName.c_str()); + cl::Program::Binaries xclBins = xcl::import_binary_file(xclbinPath); + + 
devices.resize(1); + cl::Program program(context, devices, xclBins, NULL, &fail); + logger.logCreateProgram(fail); + + int repInt = 1; + // create kernels + std::vector cluster_kernel(repInt); + for (int i = 0; i < repInt; i++) { + cluster_kernel[i] = cl::Kernel(program, "JxlEnc_ans_clusterHistogram", &fail); + logger.logCreateKernel(fail); + } + std::cout << "INFO: kernel has been created" << std::endl; + + // declare map of host buffers + std::cout << "kernel config size:" << 30 << std::endl; + std::cout << "histogram size: " << config[0] << "," << config[1] << "," << config[2] << "," << config[3] << "," + << config[4] << std::endl; + std::cout << "non-empty histogram size: " + << "," << config[5] << "," << config[6] << "," << config[7] << "," << config[8] << "," << config[9] + << std::endl; + std::cout << "largest idx: " << config[10] << "," << config[11] << "," << config[12] << "," << config[13] << "," + << config[14] << std::endl; + std::cout << "num cluster: " << config[15] << "," << config[16] << "," << config[17] << "," << config[18] << "," + << config[19] << std::endl; + std::cout << "histo_size_clusdin: " << config[20] << "," << config[21] << "," << config[22] << "," << config[23] + << "," << config[24] << std::endl; + std::cout << "do_once: " << config[25] << "," << config[26] << "," << config[27] << "," << config[28] << "," + << config[29] << std::endl; + +#define MAX_NUM_CONFIG 30 + uint32_t* hb_config = aligned_alloc(MAX_NUM_CONFIG); + + int32_t* hb_histograms0_ptr = aligned_alloc(163840); + int32_t* hb_histograms1_ptr = aligned_alloc(163840); + int32_t* hb_histograms2_ptr = aligned_alloc(163840); + int32_t* hb_histograms3_ptr = aligned_alloc(163840); + int32_t* hb_histograms4_ptr = aligned_alloc(163840); + + uint32_t* hb_histo_totalcnt0_ptr = aligned_alloc(4096); + uint32_t* hb_histo_totalcnt1_ptr = aligned_alloc(4096); + uint32_t* hb_histo_totalcnt2_ptr = aligned_alloc(4096); + uint32_t* hb_histo_totalcnt3_ptr = aligned_alloc(4096); + uint32_t* 
hb_histo_totalcnt4_ptr = aligned_alloc(4096); + + uint32_t* hb_histo_size0_ptr = aligned_alloc(4096); + uint32_t* hb_histo_size1_ptr = aligned_alloc(4096); + uint32_t* hb_histo_size2_ptr = aligned_alloc(4096); + uint32_t* hb_histo_size3_ptr = aligned_alloc(4096); + uint32_t* hb_histo_size4_ptr = aligned_alloc(4096); + + uint32_t* hb_nonempty_histo0_ptr = aligned_alloc(4096); + uint32_t* hb_nonempty_histo1_ptr = aligned_alloc(4096); + uint32_t* hb_nonempty_histo2_ptr = aligned_alloc(4096); + uint32_t* hb_nonempty_histo3_ptr = aligned_alloc(4096); + uint32_t* hb_nonempty_histo4_ptr = aligned_alloc(4096); + + uint8_t* hb_ctx_map0_ptr = aligned_alloc(4096); + uint8_t* hb_ctx_map1_ptr = aligned_alloc(4096); + uint8_t* hb_ctx_map2_ptr = aligned_alloc(4096); + uint8_t* hb_ctx_map3_ptr = aligned_alloc(4096); + uint8_t* hb_ctx_map4_ptr = aligned_alloc(4096); + + int32_t* hb_histograms_clusd0_ptr = aligned_alloc(5120); + int32_t* hb_histograms_clusd1_ptr = aligned_alloc(5120); + int32_t* hb_histograms_clusd2_ptr = aligned_alloc(5120); + int32_t* hb_histograms_clusd3_ptr = aligned_alloc(5120); + int32_t* hb_histograms_clusd4_ptr = aligned_alloc(5120); + + uint32_t* hb_histo_size_clusd0_ptr = aligned_alloc(128); + uint32_t* hb_histo_size_clusd1_ptr = aligned_alloc(128); + uint32_t* hb_histo_size_clusd2_ptr = aligned_alloc(128); + uint32_t* hb_histo_size_clusd3_ptr = aligned_alloc(128); + uint32_t* hb_histo_size_clusd4_ptr = aligned_alloc(128); + + int32_t* hb_histograms_clusdin0_ptr = aligned_alloc(4096); + int32_t* hb_histograms_clusdin1_ptr = aligned_alloc(4096); + int32_t* hb_histograms_clusdin2_ptr = aligned_alloc(4096); + int32_t* hb_histograms_clusdin3_ptr = aligned_alloc(4096); + int32_t* hb_histograms_clusdin4_ptr = aligned_alloc(4096); + + for (int j = 0; j < MAX_NUM_CONFIG; j++) { + hb_config[j] = config[j]; + } + + for (int j = 0; j < 163840; j++) { + hb_histograms0_ptr[j] = histograms0_ptr[j]; + hb_histograms1_ptr[j] = histograms1_ptr[j]; + hb_histograms2_ptr[j] = 
histograms2_ptr[j]; + hb_histograms3_ptr[j] = histograms3_ptr[j]; + hb_histograms4_ptr[j] = histograms4_ptr[j]; + } + + for (int j = 0; j < 4096; j++) { + hb_histo_totalcnt0_ptr[j] = histo_totalcnt0_ptr[j]; + hb_histo_totalcnt1_ptr[j] = histo_totalcnt1_ptr[j]; + hb_histo_totalcnt2_ptr[j] = histo_totalcnt2_ptr[j]; + hb_histo_totalcnt3_ptr[j] = histo_totalcnt3_ptr[j]; + hb_histo_totalcnt4_ptr[j] = histo_totalcnt4_ptr[j]; + } + + for (int j = 0; j < 4096; j++) { + hb_histo_size0_ptr[j] = histo_size0_ptr[j]; + hb_histo_size1_ptr[j] = histo_size1_ptr[j]; + hb_histo_size2_ptr[j] = histo_size2_ptr[j]; + hb_histo_size3_ptr[j] = histo_size3_ptr[j]; + hb_histo_size4_ptr[j] = histo_size4_ptr[j]; + } + + for (int j = 0; j < 4096; j++) { + hb_nonempty_histo0_ptr[j] = nonempty_histo0_ptr[j]; + hb_nonempty_histo1_ptr[j] = nonempty_histo1_ptr[j]; + hb_nonempty_histo2_ptr[j] = nonempty_histo2_ptr[j]; + hb_nonempty_histo3_ptr[j] = nonempty_histo3_ptr[j]; + hb_nonempty_histo4_ptr[j] = nonempty_histo4_ptr[j]; + } + + std::vector mext_o(41); + mext_o[0] = {XCL_BANK(7), hb_config, 0}; + + mext_o[1] = {XCL_BANK(0), hb_histograms0_ptr, 0}; + mext_o[2] = {XCL_BANK(0), hb_histograms1_ptr, 0}; + mext_o[3] = {XCL_BANK(0), hb_histograms2_ptr, 0}; + mext_o[4] = {XCL_BANK(0), hb_histograms3_ptr, 0}; + mext_o[5] = {XCL_BANK(0), hb_histograms4_ptr, 0}; + + mext_o[6] = {XCL_BANK(1), hb_histo_totalcnt0_ptr, 0}; + mext_o[7] = {XCL_BANK(1), hb_histo_totalcnt1_ptr, 0}; + mext_o[8] = {XCL_BANK(1), hb_histo_totalcnt2_ptr, 0}; + mext_o[9] = {XCL_BANK(1), hb_histo_totalcnt3_ptr, 0}; + mext_o[10] = {XCL_BANK(1), hb_histo_totalcnt4_ptr, 0}; + + mext_o[11] = {XCL_BANK(2), hb_histo_size0_ptr, 0}; + mext_o[12] = {XCL_BANK(2), hb_histo_size1_ptr, 0}; + mext_o[13] = {XCL_BANK(2), hb_histo_size2_ptr, 0}; + mext_o[14] = {XCL_BANK(2), hb_histo_size3_ptr, 0}; + mext_o[15] = {XCL_BANK(2), hb_histo_size4_ptr, 0}; + + mext_o[16] = {XCL_BANK(3), hb_nonempty_histo0_ptr, 0}; + mext_o[17] = {XCL_BANK(3), 
hb_nonempty_histo1_ptr, 0}; + mext_o[18] = {XCL_BANK(3), hb_nonempty_histo2_ptr, 0}; + mext_o[19] = {XCL_BANK(3), hb_nonempty_histo3_ptr, 0}; + mext_o[20] = {XCL_BANK(3), hb_nonempty_histo4_ptr, 0}; + + mext_o[21] = {XCL_BANK(4), hb_ctx_map0_ptr, 0}; + mext_o[22] = {XCL_BANK(4), hb_ctx_map1_ptr, 0}; + mext_o[23] = {XCL_BANK(4), hb_ctx_map2_ptr, 0}; + mext_o[24] = {XCL_BANK(4), hb_ctx_map3_ptr, 0}; + mext_o[25] = {XCL_BANK(4), hb_ctx_map4_ptr, 0}; + + mext_o[26] = {XCL_BANK(5), hb_histograms_clusd0_ptr, 0}; + mext_o[27] = {XCL_BANK(5), hb_histograms_clusd1_ptr, 0}; + mext_o[28] = {XCL_BANK(5), hb_histograms_clusd2_ptr, 0}; + mext_o[29] = {XCL_BANK(5), hb_histograms_clusd3_ptr, 0}; + mext_o[30] = {XCL_BANK(5), hb_histograms_clusd4_ptr, 0}; + + mext_o[31] = {XCL_BANK(6), hb_histo_size_clusd0_ptr, 0}; + mext_o[32] = {XCL_BANK(6), hb_histo_size_clusd1_ptr, 0}; + mext_o[33] = {XCL_BANK(6), hb_histo_size_clusd2_ptr, 0}; + mext_o[34] = {XCL_BANK(6), hb_histo_size_clusd3_ptr, 0}; + mext_o[35] = {XCL_BANK(6), hb_histo_size_clusd4_ptr, 0}; + + mext_o[36] = {XCL_BANK(7), hb_histograms_clusdin0_ptr, 0}; + mext_o[37] = {XCL_BANK(7), hb_histograms_clusdin1_ptr, 0}; + mext_o[38] = {XCL_BANK(7), hb_histograms_clusdin2_ptr, 0}; + mext_o[39] = {XCL_BANK(7), hb_histograms_clusdin3_ptr, 0}; + mext_o[40] = {XCL_BANK(7), hb_histograms_clusdin4_ptr, 0}; + + // create device buffer and map dev buf to host buf + cl::Buffer db_config; + cl::Buffer db_histograms0_ptr; + cl::Buffer db_histograms1_ptr; + cl::Buffer db_histograms2_ptr; + cl::Buffer db_histograms3_ptr; + cl::Buffer db_histograms4_ptr; + cl::Buffer db_histo_totalcnt0_ptr; + cl::Buffer db_histo_totalcnt1_ptr; + cl::Buffer db_histo_totalcnt2_ptr; + cl::Buffer db_histo_totalcnt3_ptr; + cl::Buffer db_histo_totalcnt4_ptr; + cl::Buffer db_histo_size0_ptr; + cl::Buffer db_histo_size1_ptr; + cl::Buffer db_histo_size2_ptr; + cl::Buffer db_histo_size3_ptr; + cl::Buffer db_histo_size4_ptr; + cl::Buffer db_nonempty_histo0_ptr; + cl::Buffer 
db_nonempty_histo1_ptr; + cl::Buffer db_nonempty_histo2_ptr; + cl::Buffer db_nonempty_histo3_ptr; + cl::Buffer db_nonempty_histo4_ptr; + cl::Buffer db_ctx_map0_ptr; + cl::Buffer db_ctx_map1_ptr; + cl::Buffer db_ctx_map2_ptr; + cl::Buffer db_ctx_map3_ptr; + cl::Buffer db_ctx_map4_ptr; + cl::Buffer db_histograms_clusd0_ptr; + cl::Buffer db_histograms_clusd1_ptr; + cl::Buffer db_histograms_clusd2_ptr; + cl::Buffer db_histograms_clusd3_ptr; + cl::Buffer db_histograms_clusd4_ptr; + cl::Buffer db_histo_size_clusd0_ptr; + cl::Buffer db_histo_size_clusd1_ptr; + cl::Buffer db_histo_size_clusd2_ptr; + cl::Buffer db_histo_size_clusd3_ptr; + cl::Buffer db_histo_size_clusd4_ptr; + cl::Buffer db_histograms_clusdin0_ptr; + cl::Buffer db_histograms_clusdin1_ptr; + cl::Buffer db_histograms_clusdin2_ptr; + cl::Buffer db_histograms_clusdin3_ptr; + cl::Buffer db_histograms_clusdin4_ptr; + + db_config = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 30, &mext_o[0]); + + db_histograms0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[1]); + db_histograms1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[2]); + db_histograms2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[3]); + db_histograms3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[4]); + db_histograms4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[5]); + + db_histo_totalcnt0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[6]); + db_histo_totalcnt1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | 
CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[7]); + db_histo_totalcnt2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[8]); + db_histo_totalcnt3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[9]); + db_histo_totalcnt4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[10]); + + db_histo_size0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[11]); + db_histo_size1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[12]); + db_histo_size2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[13]); + db_histo_size3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[14]); + db_histo_size4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[15]); + + db_nonempty_histo0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[16]); + db_nonempty_histo1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[17]); + db_nonempty_histo2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[18]); + db_nonempty_histo3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[19]); + db_nonempty_histo4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | 
CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[20]); + + db_ctx_map0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint8_t) * 4096, &mext_o[21]); + db_ctx_map1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint8_t) * 4096, &mext_o[22]); + db_ctx_map2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint8_t) * 4096, &mext_o[23]); + db_ctx_map3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint8_t) * 4096, &mext_o[24]); + db_ctx_map4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint8_t) * 4096, &mext_o[25]); + + db_histograms_clusd0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 5120, &mext_o[26]); + db_histograms_clusd1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 5120, &mext_o[27]); + db_histograms_clusd2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 5120, &mext_o[28]); + db_histograms_clusd3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 5120, &mext_o[29]); + db_histograms_clusd4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 5120, &mext_o[30]); + + db_histo_size_clusd0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 128, &mext_o[31]); + db_histo_size_clusd1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 128, &mext_o[32]); + db_histo_size_clusd2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + 
sizeof(uint32_t) * 128, &mext_o[33]); + db_histo_size_clusd3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 128, &mext_o[34]); + db_histo_size_clusd4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 128, &mext_o[35]); + + db_histograms_clusdin0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 4096, &mext_o[36]); + db_histograms_clusdin1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 4096, &mext_o[37]); + db_histograms_clusdin2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 4096, &mext_o[38]); + db_histograms_clusdin3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 4096, &mext_o[39]); + db_histograms_clusdin4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 4096, &mext_o[40]); + + // add buffers to migrate + std::vector ob_in; + std::vector ob_out; + + ob_in.push_back(db_config); + ob_in.push_back(db_histograms0_ptr); + ob_in.push_back(db_histograms1_ptr); + ob_in.push_back(db_histograms2_ptr); + ob_in.push_back(db_histograms3_ptr); + ob_in.push_back(db_histograms4_ptr); + ob_in.push_back(db_histo_totalcnt0_ptr); + ob_in.push_back(db_histo_totalcnt1_ptr); + ob_in.push_back(db_histo_totalcnt2_ptr); + ob_in.push_back(db_histo_totalcnt3_ptr); + ob_in.push_back(db_histo_totalcnt4_ptr); + ob_in.push_back(db_histo_size0_ptr); + ob_in.push_back(db_histo_size1_ptr); + ob_in.push_back(db_histo_size2_ptr); + ob_in.push_back(db_histo_size3_ptr); + ob_in.push_back(db_histo_size4_ptr); + ob_in.push_back(db_nonempty_histo0_ptr); + ob_in.push_back(db_nonempty_histo1_ptr); + ob_in.push_back(db_nonempty_histo2_ptr); + 
ob_in.push_back(db_nonempty_histo3_ptr); + ob_in.push_back(db_nonempty_histo4_ptr); + + ob_out.push_back(db_config); + ob_out.push_back(db_ctx_map0_ptr); + ob_out.push_back(db_ctx_map1_ptr); + ob_out.push_back(db_ctx_map2_ptr); + ob_out.push_back(db_ctx_map3_ptr); + ob_out.push_back(db_ctx_map4_ptr); + ob_out.push_back(db_histograms_clusd0_ptr); + ob_out.push_back(db_histograms_clusd1_ptr); + ob_out.push_back(db_histograms_clusd2_ptr); + ob_out.push_back(db_histograms_clusd3_ptr); + ob_out.push_back(db_histograms_clusd4_ptr); + ob_out.push_back(db_histo_size_clusd0_ptr); + ob_out.push_back(db_histo_size_clusd1_ptr); + ob_out.push_back(db_histo_size_clusd2_ptr); + ob_out.push_back(db_histo_size_clusd3_ptr); + ob_out.push_back(db_histo_size_clusd4_ptr); + ob_out.push_back(db_histograms_clusdin0_ptr); + ob_out.push_back(db_histograms_clusdin1_ptr); + ob_out.push_back(db_histograms_clusdin2_ptr); + ob_out.push_back(db_histograms_clusdin3_ptr); + ob_out.push_back(db_histograms_clusdin4_ptr); + + // set kernel args + for (int i = 0; i < repInt; i++) { + cluster_kernel[i].setArg(0, db_config); + cluster_kernel[i].setArg(1, db_histograms0_ptr); + cluster_kernel[i].setArg(2, db_histo_totalcnt0_ptr); + cluster_kernel[i].setArg(3, db_histo_size0_ptr); + cluster_kernel[i].setArg(4, db_nonempty_histo0_ptr); + cluster_kernel[i].setArg(5, db_ctx_map0_ptr); + cluster_kernel[i].setArg(6, db_histograms_clusd0_ptr); + cluster_kernel[i].setArg(7, db_histo_size_clusd0_ptr); + cluster_kernel[i].setArg(8, db_histograms_clusdin0_ptr); + cluster_kernel[i].setArg(9, db_histograms1_ptr); + cluster_kernel[i].setArg(10, db_histo_totalcnt1_ptr); + cluster_kernel[i].setArg(11, db_histo_size1_ptr); + cluster_kernel[i].setArg(12, db_nonempty_histo1_ptr); + cluster_kernel[i].setArg(13, db_ctx_map1_ptr); + cluster_kernel[i].setArg(14, db_histograms_clusd1_ptr); + cluster_kernel[i].setArg(15, db_histo_size_clusd1_ptr); + cluster_kernel[i].setArg(16, db_histograms_clusdin1_ptr); + 
cluster_kernel[i].setArg(17, db_histograms2_ptr); + cluster_kernel[i].setArg(18, db_histo_totalcnt2_ptr); + cluster_kernel[i].setArg(19, db_histo_size2_ptr); + cluster_kernel[i].setArg(20, db_nonempty_histo2_ptr); + cluster_kernel[i].setArg(21, db_ctx_map2_ptr); + cluster_kernel[i].setArg(22, db_histograms_clusd2_ptr); + cluster_kernel[i].setArg(23, db_histo_size_clusd2_ptr); + cluster_kernel[i].setArg(24, db_histograms_clusdin2_ptr); + cluster_kernel[i].setArg(25, db_histograms3_ptr); + cluster_kernel[i].setArg(26, db_histo_totalcnt3_ptr); + cluster_kernel[i].setArg(27, db_histo_size3_ptr); + cluster_kernel[i].setArg(28, db_nonempty_histo3_ptr); + cluster_kernel[i].setArg(29, db_ctx_map3_ptr); + cluster_kernel[i].setArg(30, db_histograms_clusd3_ptr); + cluster_kernel[i].setArg(31, db_histo_size_clusd3_ptr); + cluster_kernel[i].setArg(32, db_histograms_clusdin3_ptr); + cluster_kernel[i].setArg(33, db_histograms4_ptr); + cluster_kernel[i].setArg(34, db_histo_totalcnt4_ptr); + cluster_kernel[i].setArg(35, db_histo_size4_ptr); + cluster_kernel[i].setArg(36, db_nonempty_histo4_ptr); + cluster_kernel[i].setArg(37, db_ctx_map4_ptr); + cluster_kernel[i].setArg(38, db_histograms_clusd4_ptr); + cluster_kernel[i].setArg(39, db_histo_size_clusd4_ptr); + cluster_kernel[i].setArg(40, db_histograms_clusdin4_ptr); + } + + // launch kernel and calculate kernel execution time + std::cout << "INFO: Kernel Start" << std::endl; + // declare events + std::vector events_write(1); + std::vector events_kernel(1); + std::vector events_read(1); + + // migrate + q.enqueueMigrateMemObjects(ob_in, 0, nullptr, &events_write[0]); + q.enqueueTask(cluster_kernel[0], &events_write, &events_kernel[0]); + q.enqueueMigrateMemObjects(ob_out, 1, &events_kernel, &events_read[0]); + q.finish(); + + struct timeval end_time; + gettimeofday(&end_time, 0); + std::cout << "INFO: Finish kernel execution" << std::endl; + std::cout << "INFO: Finish E2E execution" << std::endl; + + // print related times + 
unsigned long timeStart, timeEnd, exec_time0; + std::cout << "-------------------------------------------------------" << std::endl; + events_write[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + events_write[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 = (timeEnd - timeStart) / 1000.0; + std::cout << "INFO: Data transfer from host to device: " << exec_time0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + events_read[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + events_read[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 = (timeEnd - timeStart) / 1000.0; + std::cout << "INFO: Kernel1 Data transfer from device to host: " << exec_time0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + exec_time0 = 0; + for (int i = 0; i < 1; ++i) { + events_kernel[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + events_kernel[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 += (timeEnd - timeStart) / 1000.0; + + std::cout << "INFO: Kernel" << i + 1 << " execution: " << (timeEnd - timeStart) / 1000.0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + } + std::cout << "INFO: kernel total execution: " << exec_time0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + unsigned long exec_timeE2E = diff(&end_time, &start_time); + std::cout << "INFO: FPGA execution time:" << exec_timeE2E << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + + for (int j = 0; j < MAX_NUM_CONFIG; j++) { + config[j] = hb_config[j]; + } + // std::cout << "out kernel config size:" << 30 << std::endl; + // std::cout << "histogram size: " << config[0] << "," << config[1] << "," << config[2] + // << "," << config[3] << "," << config[4] << std::endl; + // 
std::cout << "non-empty histogram size: " << "," << config[5] << "," << config[6] + // << "," << config[7] << "," << config[8] << "," << config[9] << std::endl; + // std::cout << "largest idx: " << config[10] << "," << config[11] << "," << config[12] + // << "," << config[13] << "," << config[14] << std::endl; + // std::cout << "num cluster: " << config[15] << "," << config[16] << "," << config[17] + // << "," << config[18] << "," << config[19] << std::endl; + // std::cout << "histo_size_clusdin: " << config[20] << "," << config[21] << "," << config[22] + // << "," << config[23] << "," << config[24] << std::endl; + // std::cout << "do_once: " << config[25] << "," << config[26] << "," << config[27] + // << "," << config[28] << "," << config[29] << std::endl; + + // output + std::cout << "ctx_map_ptr:" << std::endl; + for (int j = 0; j < 4096; j++) { + ctx_map0_ptr[j] = hb_ctx_map0_ptr[j]; + ctx_map1_ptr[j] = hb_ctx_map1_ptr[j]; + ctx_map2_ptr[j] = hb_ctx_map2_ptr[j]; + ctx_map3_ptr[j] = hb_ctx_map3_ptr[j]; + ctx_map4_ptr[j] = hb_ctx_map4_ptr[j]; + } + + std::cout << "histograms_clusd_ptr:" << std::endl; + for (int j = 0; j < 5120; j++) { + histograms_clusd0_ptr[j] = hb_histograms_clusd0_ptr[j]; + histograms_clusd1_ptr[j] = hb_histograms_clusd1_ptr[j]; + histograms_clusd2_ptr[j] = hb_histograms_clusd2_ptr[j]; + histograms_clusd3_ptr[j] = hb_histograms_clusd3_ptr[j]; + histograms_clusd4_ptr[j] = hb_histograms_clusd4_ptr[j]; + } + + std::cout << "histo_size_clusd_ptr:" << std::endl; + for (int j = 0; j < 128; j++) { + histo_size_clusd0_ptr[j] = hb_histo_size_clusd0_ptr[j]; + histo_size_clusd1_ptr[j] = hb_histo_size_clusd1_ptr[j]; + histo_size_clusd2_ptr[j] = hb_histo_size_clusd2_ptr[j]; + histo_size_clusd3_ptr[j] = hb_histo_size_clusd3_ptr[j]; + histo_size_clusd4_ptr[j] = hb_histo_size_clusd4_ptr[j]; + } + + std::cout << "histograms_clusdin_ptr:" << std::endl; + for (int j = 0; j < 4096; j++) { + histograms_clusdin0_ptr[j] = hb_histograms_clusdin0_ptr[j]; + 
histograms_clusdin1_ptr[j] = hb_histograms_clusdin1_ptr[j]; + histograms_clusdin2_ptr[j] = hb_histograms_clusdin2_ptr[j]; + histograms_clusdin3_ptr[j] = hb_histograms_clusdin3_ptr[j]; + histograms_clusdin4_ptr[j] = hb_histograms_clusdin4_ptr[j]; + } + + // for(int i=0; i +#include +#include "xcl2.hpp" +#include "xf_utils_sw/logger.hpp" + +void hls_ANSclusterHistogram_wrapper(std::string xclbinPath, + uint32_t* config, + //==================== + int32_t* histograms0_ptr, + uint32_t* histo_totalcnt0_ptr, + uint32_t* histo_size0_ptr, + uint32_t* nonempty_histo0_ptr, + uint8_t* ctx_map0_ptr, + int32_t* histograms_clusd0_ptr, + uint32_t* histo_size_clusd0_ptr, + int32_t* histograms_clusdin0_ptr, + //==================== + int32_t* histograms1_ptr, + uint32_t* histo_totalcnt1_ptr, + uint32_t* histo_size1_ptr, + uint32_t* nonempty_histo1_ptr, + uint8_t* ctx_map1_ptr, + int32_t* histograms_clusd1_ptr, + uint32_t* histo_size_clusd1_ptr, + int32_t* histograms_clusdin1_ptr, + //====================== + int32_t* histograms2_ptr, + uint32_t* histo_totalcnt2_ptr, + uint32_t* histo_size2_ptr, + uint32_t* nonempty_histo2_ptr, + uint8_t* ctx_map2_ptr, + int32_t* histograms_clusd2_ptr, + uint32_t* histo_size_clusd2_ptr, + int32_t* histograms_clusdin2_ptr, + //====================== + int32_t* histograms3_ptr, + uint32_t* histo_totalcnt3_ptr, + uint32_t* histo_size3_ptr, + uint32_t* nonempty_histo3_ptr, + uint8_t* ctx_map3_ptr, + int32_t* histograms_clusd3_ptr, + uint32_t* histo_size_clusd3_ptr, + int32_t* histograms_clusdin3_ptr, + //====================== + int32_t* histograms4_ptr, + uint32_t* histo_totalcnt4_ptr, + uint32_t* histo_size4_ptr, + uint32_t* nonempty_histo4_ptr, + uint8_t* ctx_map4_ptr, + int32_t* histograms_clusd4_ptr, + uint32_t* histo_size_clusd4_ptr, + int32_t* histograms_clusdin4_ptr); + +#endif diff --git a/codec/L2/demos/jxlEnc/acc_cluster_histogram/kernel/hls_cluster_histogram.cpp b/codec/L2/demos/jxlEnc/acc_cluster_histogram/kernel/hls_cluster_histogram.cpp 
new file mode 100644 index 0000000000..b298fd9e73 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_cluster_histogram/kernel/hls_cluster_histogram.cpp @@ -0,0 +1,1321 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HLS_CLUSTER_HISTOGRAM_CPP +#define HLS_CLUSTER_HISTOGRAM_CPP + +#include "stdio.h" +#include "hls_cluster_histogram.hpp" + +#define FLOAT_MAX 3.402823466e+38F +/* get_uram: returns 32-bit word idx1 of row idx0; two words share each 64-bit entry (even idx1 in bits 31..0, odd idx1 in bits 63..32). */ +unsigned get_uram(unsigned idx0, + unsigned idx1, +#ifndef __SYNTHESIS__ + std::vector > >& histograms +#else + ap_uint<64> histograms[4096][20] +#endif + ) { + ap_uint<64> uram_tmp = histograms[idx0][idx1 / 2]; + return idx1 % 2 == 0 ? 
uram_tmp.range(31, 0) : uram_tmp.range(63, 32); +} +/* compute_8: returns the sum of the eight floats in `in`, added pairwise in three levels. */ +inline float compute_8(float in[8]) { + float tmp_x0 = in[0] + in[1]; + float tmp_x1 = in[2] + in[3]; + float tmp_x2 = in[4] + in[5]; + float tmp_x3 = in[6] + in[7]; + float tmp_x4 = tmp_x0 + tmp_x1; + float tmp_x5 = tmp_x2 + tmp_x3; + return tmp_x4 + tmp_x5; +} +/* GetIdx: for i in [0, numNonempty) writes nonempty_histo[i] to stream_idx and histo_size[nonempty_histo[i]] to stream_a_size0. */ +void GetIdx(unsigned int numNonempty, + unsigned int nonempty_histo[4096], + unsigned int histo_size[4096], + hls::stream& stream_idx, + hls::stream& stream_a_size0) { +GETIDX: + for (unsigned int i = 0; i < numNonempty; i++) { +#pragma HLS LOOP_TRIPCOUNT min = 10000 max = 10000 +#pragma HLS pipeline + unsigned int idx = nonempty_histo[i]; + stream_idx.write(idx); + unsigned int tmp = histo_size[idx]; + stream_a_size0.write(tmp); + } +} +/* GetA: per context read from stream_idx, forwards a_size, the working size (max of a_size and b_size unless isEntropy) and histo_totalcnt[idx], then streams a_size words of histograms[idx] on stream_a. */ +void GetA(bool isEntropy, + unsigned int numNonempty, + unsigned int histo_size[4096], +#ifndef __SYNTHESIS__ + std::vector > >& histograms, +#else + ap_uint<64> histograms[4096][20], +#endif + unsigned int histo_totalcnt[4096], + unsigned int nonempty_histo[4096], + hls::stream& stream_idx, + hls::stream& stream_b_size0, + hls::stream& stream_b_size1, + hls::stream& stream_size0, + hls::stream& stream_a, + hls::stream& stream_a_size0, + hls::stream& stream_a_size1, + hls::stream& stream_a_count) { + unsigned char count_a = 0; + unsigned int count_context = 0; + unsigned int idx; + unsigned char a_size; + unsigned int a_total_count; + unsigned char size; +GETA_OUT: + while (count_context < numNonempty) { +#pragma HLS LOOP_TRIPCOUNT min = 10000 max = 10000 +#pragma HLS pipeline + if (count_a == 0) { + idx = stream_idx.read(); + a_size = stream_a_size0.read(); + stream_a_size1.write(a_size); + if (!isEntropy) { + unsigned char b_size = stream_b_size0.read(); + stream_b_size1.write(b_size); + size = hls::max(a_size, b_size); + } else { + size = a_size; + } + // printf("GetA isEntropy=%d, idx=%d, count_context=%d, count_a=%d, + // size=%d, a_size=%d\n", + // isEntropy, idx, count_context, count_a, size, a_size); + 
stream_size0.write(size); + a_total_count = histo_totalcnt[idx]; + stream_a_count.write(a_total_count); + } + unsigned int tmp = get_uram(idx, count_a, histograms); // histograms[idx][count_a]; + stream_a.write(tmp); + count_a++; + if (count_a == a_size) { + count_a = 0; + count_context++; + } + } +} +/* GetB: when !isEntropy, repeats the reference histogram once per context (refSize on stream_b_size0, ref_totalcount on stream_b_count, refSize words of ref_histo on stream_b); writes nothing when isEntropy. */ +void GetB(bool isEntropy, + unsigned int numNonempty, + unsigned int refSize, + unsigned int ref_histo[40], + unsigned int ref_totalcount, + hls::stream& stream_b, + hls::stream& stream_b_size0, + hls::stream& stream_b_count) { + unsigned int count_context = 0; + unsigned char count_b = 0; + unsigned char b_size; + unsigned int b_total_count; + if (!isEntropy) { + GETB_OUT: + while (count_context < numNonempty) { +#pragma HLS LOOP_TRIPCOUNT min = 10000 max = 10000 + if (count_b == 0) { + b_size = refSize; + stream_b_size0.write(b_size); + b_total_count = ref_totalcount; + stream_b_count.write(b_total_count); + } + stream_b.write(ref_histo[count_b]); + count_b++; + if (count_b == b_size) { + count_b = 0; + count_context++; + } + } + } +} +/* DoHistogramDistanceEntropy: per context, writes counts * log2(counts) for each of `size` bins to stream_dist (0 when counts is 0 or equals total) and sum_count * log2(total) to stream_dist_total; counts are a + b unless isEntropy. Also forwards size to stream_size1 and stream_size2. */ +void DoHistogramDistanceEntropy(bool isEntropy, + unsigned int numNonempty, + hls::stream& stream_size0, + hls::stream& stream_size1, + hls::stream& stream_size2, + hls::stream& stream_a, + hls::stream& stream_a_size1, + hls::stream& stream_a_count, + hls::stream& stream_b, + hls::stream& stream_b_size1, + hls::stream& stream_b_count, + hls::stream& stream_dist_total, + hls::stream& stream_dist) { + int count_debug = 0; + unsigned int count_context = 0; + unsigned char count_s = 0; + unsigned char a_size; + unsigned int a_total_count; + unsigned char b_size; + unsigned int b_total_count; + unsigned int sum_count = 0; + unsigned char size = 0; + float total; + float totallog2; +DISTANCE_OUT: + while (count_context < numNonempty) { +#pragma HLS LOOP_TRIPCOUNT min = 10000 max = 10000 + if (count_s == 0) { + sum_count = 0; + a_size = stream_a_size1.read(); + a_total_count = stream_a_count.read(); + if (!isEntropy) { + b_size = 
stream_b_size1.read(); + b_total_count = stream_b_count.read(); + total = a_total_count + b_total_count; + } else { + total = a_total_count; + } + totallog2 = total == 0 ? 0 : hls::log2(total); + size = stream_size0.read(); + stream_size1.write(size); + stream_size2.write(size); + // printf("DoHist count_context=%d, count_s=%d, size=%d\n", count_context, + // count_s, size); + } + unsigned int counts; + if (!isEntropy) { + unsigned int a_counts = a_size > count_s ? stream_a.read() : 0; + unsigned int b_counts = b_size > count_s ? stream_b.read() : 0; + counts = a_counts + b_counts; + } else { + unsigned int tmp = stream_a.read(); + counts = tmp; + } + float countlog2 = counts == 0 ? 0 : hls::log2((float)counts); + bool flag = counts == total; + sum_count += flag ? 0 : counts; + float tmp = flag ? 0 : counts * countlog2; + stream_dist.write(tmp); + count_s++; + if (count_s == size) { + // printf("DoHist Write stream_dist_total %d %d %d\n", count_context, + // count_s, size); + count_debug++; + stream_dist_total.write(sum_count * totallog2); + count_s = 0; + count_context++; + } + } + // printf("stream_dist_total in=%d\n", count_debug); +} +/* GroupSum: reads `size` values per context from stream_dist and writes the compute_8 sum of every full group of eight to stream_sum. */ +void GroupSum(unsigned int numNonempty, + hls::stream& stream_size1, + hls::stream& stream_dist, + hls::stream& stream_sum) { + int count_debug = 0; + unsigned int count_context = 0; + unsigned char count_s = 0; + float sum_array[8]; + unsigned char size; +GROUPSUM_OUT: + while (count_context < numNonempty) { +#pragma HLS LOOP_TRIPCOUNT min = 10000 max = 10000 + if (count_s == 0) { + size = stream_size1.read(); + } + unsigned char idx = count_s % 8; + sum_array[idx] = stream_dist.read(); + if (idx == 7) { + float sum_part = compute_8(sum_array); + stream_sum.write(sum_part); + // printf("GroupSum Write stream_sum %d %d %d\n", count_context, count_s, + // size); + count_debug++; + } + count_s++; + if (count_s == size) { + count_s = 0; + count_context++; + } + } + // printf("stream_sum in=%d\n", count_debug); +} +/* GetDist: per context, dist = stream_dist_total value minus the sum of its size/8 stream_sum values; with isEntropy it is stored in dists[], otherwise dists[]/best[] are updated (best = j) when dist - histo_entropy - ref_entropy is smaller. largest_idx is moved to a context whose new value exceeds dists[largest_idx]. */ +void 
GetDist(bool isEntropy, + unsigned int numNonempty, + unsigned int j, + float histo_entropy[4096], + float ref_entropy, + float dists[4096], + unsigned int best[4096], + unsigned int& largest_idx, + hls::stream& stream_size2, + hls::stream& stream_dist_total, + hls::stream& stream_sum) { + int count_debug = 0; + float dist_std; + unsigned int count_context = 0; + unsigned char count_s = 0; + largest_idx = 0; + unsigned char size; + float sum_dist = 0; + float reg_curr; + float reg0; + float reg1; + float reg2; + unsigned short addr_curr = 0; + unsigned short addr0 = 0xffff; + unsigned short addr1 = 0xffff; + unsigned short addr2 = 0xffff; +GET_DIST_OUT: + while (count_context < numNonempty) { +#pragma HLS LOOP_TRIPCOUNT min = 1250 max = 1250 + if (count_s == 0) { + size = stream_size2.read() / 8; + sum_dist = 0; + // printf("GetDist count_context=%d, count_s=%d, size=%d\n", + // count_context, count_s, size); + } + // printf("GetDist count_context=%d, count_s=%d, size=%d\n", count_context, + // count_s, size); + sum_dist += stream_sum.read(); + // printf("GetDist Read stream_sum %d %d %d\n", count_context, count_s, + // size); + if (count_s == size - 1) { + count_debug++; + // printf("GetDist Read stream_dist_total %d %d %d\n", count_context, + // count_s, size); + float tmp = stream_dist_total.read(); + dist_std = tmp - sum_dist; + + // update dist, may update the same address: reuse one of the last three written values instead of re-reading dists[] + addr_curr = count_context; + if (addr_curr == addr0) { + reg_curr = reg0; + } else if (addr_curr == addr1) { + reg_curr = reg1; + } else if (addr_curr == addr2) { + reg_curr = reg2; + } else { + reg_curr = dists[addr_curr]; + } + + float tmp_largest = dists[largest_idx]; + if (!isEntropy) { + if (dist_std - histo_entropy[addr_curr] - ref_entropy < reg_curr) { + best[addr_curr] = j; + reg_curr = dist_std - histo_entropy[addr_curr] - ref_entropy; + } + } else { + reg_curr = dist_std; + } + if (reg_curr > tmp_largest) { + largest_idx = addr_curr; + } + + dists[addr_curr] = reg_curr; + reg2 = reg1; 
+ reg1 = reg0; + reg0 = reg_curr; + addr2 = addr1; + addr1 = addr0; + addr0 = addr_curr; + } + count_s++; + if (count_s == size) { + count_s = 0; + count_context++; + } + } + // printf("stream_sum out=%d\n", count_debug); +} +/* hls_HistogramDistance: dataflow region that connects GetIdx, GetB, GetA, DoHistogramDistanceEntropy, GroupSum and GetDist through depth-64 streams; results land in dists, best and largest_idx. */ +void hls_HistogramDistance(bool isEntropy, + unsigned int numNonempty, + unsigned int j, + unsigned int histo_size[4096], +#ifndef __SYNTHESIS__ + std::vector > >& histograms, +#else + ap_uint<64> histograms[4096][20], +#endif + unsigned int histo_totalcnt[4096], + float histo_entropy[4096], + unsigned int nonempty_histo[4096], + unsigned int refSize, + unsigned int ref_histo[40], + unsigned int ref_totalcount, + float ref_entropy, + float dists[4096], + unsigned int best[4096], + unsigned int& largest_idx) { + hls::stream stream_size0("stream_size0"); +#pragma HLS STREAM variable = stream_size0 depth = 64 + hls::stream stream_size1("stream_size1"); +#pragma HLS STREAM variable = stream_size1 depth = 64 + hls::stream stream_size2("stream_size2"); +#pragma HLS STREAM variable = stream_size2 depth = 64 + + hls::stream stream_a("stream_a"); +#pragma HLS STREAM variable = stream_a depth = 64 + hls::stream stream_a_size0("stream_a_size0"); +#pragma HLS STREAM variable = stream_a_size0 depth = 64 + hls::stream stream_a_size1("stream_a_size1"); +#pragma HLS STREAM variable = stream_a_size1 depth = 64 + hls::stream stream_a_count("stream_a_count"); +#pragma HLS STREAM variable = stream_a_count depth = 64 + + hls::stream stream_b("stream_b"); +#pragma HLS STREAM variable = stream_b depth = 64 + hls::stream stream_b_size0("stream_b_size0"); +#pragma HLS STREAM variable = stream_b_size0 depth = 64 + hls::stream stream_b_size1("stream_b_size1"); +#pragma HLS STREAM variable = stream_b_size1 depth = 64 + hls::stream stream_b_count("stream_b_count"); +#pragma HLS STREAM variable = stream_b_count depth = 64 + + hls::stream stream_dist_total("stream_dist_total"); +#pragma HLS STREAM variable = stream_dist_total depth = 64 + hls::stream 
stream_dist("stream_dist"); +#pragma HLS STREAM variable = stream_dist depth = 64 + hls::stream stream_sum("stream_sum"); +#pragma HLS STREAM variable = stream_sum depth = 64 + hls::stream stream_idx("stream_idx"); +#pragma HLS STREAM variable = stream_idx depth = 64 + +// clang-format on +#pragma HLS dataflow + + GetIdx(numNonempty, nonempty_histo, histo_size, stream_idx, stream_a_size0); + + GetB(isEntropy, numNonempty, refSize, ref_histo, ref_totalcount, stream_b, stream_b_size0, stream_b_count); + + GetA(isEntropy, numNonempty, histo_size, histograms, histo_totalcnt, nonempty_histo, stream_idx, stream_b_size0, + stream_b_size1, stream_size0, stream_a, stream_a_size0, stream_a_size1, stream_a_count); + + DoHistogramDistanceEntropy(isEntropy, numNonempty, stream_size0, stream_size1, stream_size2, stream_a, + stream_a_size1, stream_a_count, stream_b, stream_b_size1, stream_b_count, + stream_dist_total, stream_dist); + + GroupSum(numNonempty, stream_size1, stream_dist, stream_sum); + + GetDist(isEntropy, numNonempty, j, histo_entropy, ref_entropy, dists, best, largest_idx, stream_size2, + stream_dist_total, stream_sum); +} +/* hls_ClusterHisgtogram: clusters the numNonempty histograms. An entropy pass fills entropy[]; FIRST_SCAN picks up to 128 reference histograms while dists[largest_idx] >= 64.0; SECOND_SCAN records the chosen reference per input in best[]; OUTPUT1 accumulates inputs into histograms_clusd and fills histogram_symbols. Returns the number of references picked. */ +int hls_ClusterHisgtogram(unsigned int largest_idx, + unsigned int numNonempty, + unsigned int nonempty_histo[4096], + unsigned int histo_totalcnt[4096], + unsigned int histo_size[4096], +#ifndef __SYNTHESIS__ + std::vector > >& histograms, +#else + ap_uint<64> histograms[4096][20], +#endif + unsigned int histo_size_clusd[128], +#ifndef __SYNTHESIS__ + std::vector >& histograms_clusd, +#else + unsigned int histograms_clusd[128][40], +#endif + unsigned char histogram_symbols[4096]) { + unsigned char max_histograms = 128; + float min_distance = 64.0; + unsigned int best[4096]; + float dists[4096]; + float entropy[4096]; + unsigned int total_count[4096]; + float out_entropy[4096]; + float histo_entropy[4096]; + + unsigned int size_b = 0; + unsigned int total_count_b = 0; + float hls_entropy_b = 0; + unsigned int data_b[40]; + unsigned int 
tmp_largest_idx; + + hls_HistogramDistance(true, numNonempty, 0, histo_size, histograms, histo_totalcnt, histo_entropy, nonempty_histo, + size_b, data_b, total_count_b, hls_entropy_b, entropy, best, tmp_largest_idx); + +INIT_1: + for (unsigned int i = 0; i < numNonempty; i++) { +#pragma HLS LOOP_TRIPCOUNT min = 4096 max = 4096 +#pragma HLS pipeline + unsigned int idx = nonempty_histo[i]; + histo_entropy[idx] = entropy[i]; + best[i] = 0; + dists[i] = FLOAT_MAX; + } + + unsigned int numHisto_clusd = 0; + unsigned int max_count = hls::min((int)max_histograms, (int)numNonempty); + largest_idx = nonempty_histo[largest_idx]; + dists[largest_idx] = FLOAT_MAX; + unsigned int idx = largest_idx; +FIRST_SCAN: + while (numHisto_clusd < max_count && dists[largest_idx] >= min_distance) { +#pragma HLS LOOP_TRIPCOUNT min = 128 max = 128 + histogram_symbols[idx] = numHisto_clusd; + unsigned char data_size = histo_size[idx]; + GEN_REF1: + for (unsigned char k = 0; k < data_size; k++) { +#pragma HLS LOOP_TRIPCOUNT min = 40 max = 40 + histograms_clusd[numHisto_clusd][k] = get_uram(idx, k, histograms); // histograms[idx][k]; + data_b[k] = get_uram(idx, k, histograms); // histograms[idx][k]; + } + histo_size_clusd[numHisto_clusd] = data_size; + size_b = data_size; + total_count[numHisto_clusd] = histo_totalcnt[idx]; + total_count_b = histo_totalcnt[idx]; + out_entropy[numHisto_clusd] = histo_entropy[idx]; + hls_entropy_b = histo_entropy[idx]; +// printf("push idx=%d, size_b=%d, total_count_b=%d, hls_entropy_b=%f\n", +// idx, size_b, total_count_b, hls_entropy_b); +#pragma HLS ALLOCATION function instances = hls_HistogramDistance limit = 1 + hls_HistogramDistance(false, numNonempty, 0, histo_size, histograms, histo_totalcnt, entropy, nonempty_histo, + size_b, data_b, total_count_b, hls_entropy_b, dists, best, largest_idx); + idx = nonempty_histo[largest_idx]; + numHisto_clusd++; + } + +INIT_2: + for (unsigned int j = 0; j < numNonempty; j++) { +#pragma HLS LOOP_TRIPCOUNT min = 4096 max = 
4096 +#pragma HLS pipeline + best[j] = 0; + dists[j] = FLOAT_MAX; + } + +SECOND_SCAN: + for (unsigned int j = 0; j < numHisto_clusd; j++) { +#pragma HLS LOOP_TRIPCOUNT min = 128 max = 128 + size_b = histo_size_clusd[j]; + total_count_b = total_count[j]; + hls_entropy_b = out_entropy[j]; + GEN_REF2: + for (unsigned char k = 0; k < size_b; k++) { +#pragma HLS LOOP_TRIPCOUNT min = 40 max = 40 +#pragma HLS pipeline + data_b[k] = histograms_clusd[j][k]; + } + hls_HistogramDistance(false, numNonempty, j, histo_size, histograms, histo_totalcnt, entropy, nonempty_histo, + size_b, data_b, total_count_b, hls_entropy_b, dists, best, largest_idx); + } + +OUTPUT1: + for (unsigned int i = 0; i < numNonempty; i++) { +#pragma HLS LOOP_TRIPCOUNT min = 4096 max = 4096 + unsigned int idx_in = nonempty_histo[i]; + unsigned int idx_out = best[i]; + unsigned int other_data_size = histo_size[idx_in]; + unsigned int total_count = histo_totalcnt[idx_in]; + if (other_data_size > histo_size_clusd[idx_out]) { + histo_size_clusd[idx_out] = other_data_size; + } + OUTPUT2: + for (unsigned char k = 0; k < other_data_size; ++k) { +#pragma HLS LOOP_TRIPCOUNT min = 40 max = 40 + unsigned int data_tmp = get_uram(idx_in, k, histograms); // histograms[idx_in][k]; + histograms_clusd[idx_out][k] += data_tmp * numHisto_clusd; /* NOTE(review): each input count is scaled by the cluster count before being accumulated - presumably a plain add was intended; confirm against the reference encoder. */ + } + histogram_symbols[idx_in] = idx_out; + } + return numHisto_clusd; +} +/* hls_fastclusterHistogram_wrapper: when numHisto > 1, sets numHisto_clusd to 1 if numNonempty == 0, otherwise to the value returned by hls_ClusterHisgtogram. */ +// clang-format off +void hls_fastclusterHistogram_wrapper( + unsigned int largest_idx, + unsigned int numNonempty, + unsigned int nonempty_histo[4096], + unsigned int numHisto, + unsigned int histo_totalcnt[4096], + unsigned int histo_size[4096], +#ifndef __SYNTHESIS__ + std::vector > >& histograms, +#else + ap_uint<64> histograms[4096][20], +#endif + unsigned int& numHisto_clusd, + unsigned int histo_size_clusd[128], +#ifndef __SYNTHESIS__ + std::vector >& histograms_clusd, +#else + unsigned int histograms_clusd[128][40], +#endif + unsigned char histogram_symbols[4096] +) { + //printf("[KERNEL] 
hls_fastclusterHistogram_wrapper in %d %d %d %d\n", largest_idx, + // numNonempty, numHisto, numHisto_clusd); + // clang-format on + if (numHisto > 1) { + if (numNonempty == 0) { + numHisto_clusd = 1; + } else { + numHisto_clusd = hls_ClusterHisgtogram(largest_idx, numNonempty, nonempty_histo, histo_totalcnt, histo_size, + histograms, histo_size_clusd, histograms_clusd, histogram_symbols); + // printf("[KERNEL]size= %d\n", numNonempty); + // for(int i=0; i num_histograms = numHisto_clusd_ptr; + uint32_t entry_bits; + uint32_t floor_log2 = 32 - num_histograms.countLeadingZeros() - 1; + if ((num_histograms & (num_histograms - 1)) == 0) { + entry_bits = floor_log2; // power of two + } else { + entry_bits = floor_log2 + 1; + } + if (numHisto_ptr > 1 && entry_bits >= 4) { + uint32_t max_tok = 0; + for (uint32_t k = 0; k < numHisto_ptr; ++k) { +#pragma HLS PIPELINE II = 1 + ap_uint<32> value = ctx_map_ptr[k]; + uint32_t tok; + if (value < 16) { + tok = value; + } else { + uint32_t n = 32 - value.countLeadingZeros() - 1; + uint32_t m = value - (1 << n); + tok = 16 + ((n - 4) << 2) + (m >> (n - 2)); + } + max_tok = tok > max_tok ? 
tok : max_tok; + ++histograms_clusdin_ptr[tok]; + } + histo_size_clusdin_ptr = (max_tok + 8) / 8 * 8; + } +} +/* load_histo: zeroes histograms_tmp, then for each of the first numNonempty_ptr entries copies the index into nonempty_histo_tmp and packs that histogram's 40 words from histograms_ptr, two per 64-bit entry. */ +void load_histo(uint32_t numNonempty_ptr, + uint32_t* nonempty_histo_ptr, + int32_t* histograms_ptr, + + uint32_t nonempty_histo_tmp[4096], +#ifndef __SYNTHESIS__ + std::vector > >& histograms_tmp +#else + ap_uint<64> histograms_tmp[4096][20] +#endif + ) { + for (int i = 0; i < 4096; i++) { +#pragma HLS PIPELINE II = 1 + for (int j = 0; j < 20; j++) { +#pragma HLS UNROLL + histograms_tmp[i][j] = 0; + } + } + + for (int i = 0; i < numNonempty_ptr; i++) { + uint32_t reg = nonempty_histo_ptr[i]; + nonempty_histo_tmp[i] = reg; + for (ap_uint<8> j = 0; j < 20; j++) { +#pragma HLS PIPELINE II = 1 + ap_uint<64> val; + val.range(31, 0) = histograms_ptr[reg * 40 + j * 2]; + val.range(63, 32) = histograms_ptr[reg * 40 + j * 2 + 1]; + histograms_tmp[reg][j] = val; + } + } +} +/* load_nonempty: copies 4096 entries from nonempty_histo_ptr to nonempty_histo_tmp. */ +void load_nonempty(uint32_t* nonempty_histo_ptr, uint32_t nonempty_histo_tmp[4096]) { + for (int i = 0; i < 4096; i++) { +#pragma HLS PIPELINE II = 1 + nonempty_histo_tmp[i] = nonempty_histo_ptr[i]; + } +} +/* load_total_cnt: copies 4096 entries from histo_totalcnt_ptr to histo_totalcnt_tmp. */ +void load_total_cnt(uint32_t* histo_totalcnt_ptr, uint32_t histo_totalcnt_tmp[4096]) { + for (int i = 0; i < 4096; i++) { +#pragma HLS PIPELINE II = 1 + histo_totalcnt_tmp[i] = histo_totalcnt_ptr[i]; + } +} +/* load_size: copies 4096 entries from histo_size_ptr to histo_size_tmp. */ +void load_size(uint32_t* histo_size_ptr, uint32_t histo_size_tmp[4096]) { + for (int i = 0; i < 4096; i++) { +#pragma HLS PIPELINE II = 1 + histo_size_tmp[i] = histo_size_ptr[i]; + } +} +/* memset_histo_clusdin: zeroes the 40 entries of histograms_clusdin_tmp. */ +void memset_histo_clusdin(int32_t histograms_clusdin_tmp[40]) { + for (int i = 0; i < 40; i++) { +#pragma HLS UNROLL + histograms_clusdin_tmp[i] = 0; + } +} +/* memset_histo_clusd: zeroes the 128 x 40 entries of histograms_clusd_tmp. */ +void memset_histo_clusd( +#ifndef __SYNTHESIS__ + std::vector >& histograms_clusd_tmp +#else + unsigned histograms_clusd_tmp[128][40] +#endif + ) { + for (int i = 0; i < 128; i++) { + for (int j = 0; j < 40; j++) { +#pragma HLS PIPELINE II = 1 + histograms_clusd_tmp[i][j] = 0; + } + } +} +/* memset_ctx_map: zeroes the 4096 entries of ctx_map_tmp. */ +void memset_ctx_map(uint8_t ctx_map_tmp[4096]) { 
+ for (int i = 0; i < 4096; i++) { +#pragma HLS PIPELINE II = 1 + ctx_map_tmp[i] = 0; + } +} +/* load_data: dataflow region that loads the histograms, total counts and sizes into the local buffers and zeroes histograms_clusdin_tmp, histograms_clusd_tmp and ctx_map_tmp. */ +void load_data(uint32_t numNonempty_ptr, + int32_t* histograms_ptr, + uint32_t* nonempty_histo_ptr, + uint32_t* histo_totalcnt_ptr, + uint32_t* histo_size_ptr, +#ifndef __SYNTHESIS__ + std::vector > >& histograms_tmp, +#else + ap_uint<64> histograms_tmp[4096][20], +#endif + uint32_t nonempty_histo_tmp[4096], + uint32_t histo_totalcnt_tmp[4096], + uint32_t histo_size_tmp[4096], + int32_t histograms_clusdin_tmp[40], +#ifndef __SYNTHESIS__ + std::vector >& histograms_clusd_tmp, +#else + unsigned histograms_clusd_tmp[128][40], +#endif + uint8_t ctx_map_tmp[4096]) { +#pragma HLS DATAFLOW + load_histo(numNonempty_ptr, nonempty_histo_ptr, histograms_ptr, nonempty_histo_tmp, histograms_tmp); + + load_total_cnt(histo_totalcnt_ptr, histo_totalcnt_tmp); + + load_size(histo_size_ptr, histo_size_tmp); + + memset_histo_clusdin(histograms_clusdin_tmp); + + memset_histo_clusd(histograms_clusd_tmp); + + memset_ctx_map(ctx_map_tmp); +} +/* write_histo_clusd: copies the 128 x 40 table row by row to histograms_clusd_ptr (row i starts at offset i * 40). */ +void write_histo_clusd( +#ifndef __SYNTHESIS__ + std::vector >& histograms_clusd_tmp, +#else + unsigned histograms_clusd_tmp[128][40], +#endif + int32_t* histograms_clusd_ptr) { + for (int i = 0; i < 128; i++) { + for (int j = 0; j < 40; j++) { +#pragma HLS PIPELINE II = 1 + histograms_clusd_ptr[i * 40 + j] = histograms_clusd_tmp[i][j]; + } + } +} +/* write_size_clusd: copies 128 entries from histo_size_clusd_tmp to histo_size_clusd_ptr. */ +void write_size_clusd(uint32_t histo_size_clusd_tmp[128], uint32_t* histo_size_clusd_ptr) { + for (int i = 0; i < 128; i++) { +#pragma HLS PIPELINE II = 1 + histo_size_clusd_ptr[i] = histo_size_clusd_tmp[i]; + } +} +/* write_ctx_map: copies 4096 entries from ctx_map_tmp to ctx_map_ptr. */ +void write_ctx_map(uint8_t ctx_map_tmp[4096], uint8_t* ctx_map_ptr) { + for (int i = 0; i < 4096; i++) { +#pragma HLS PIPELINE II = 1 + ctx_map_ptr[i] = ctx_map_tmp[i]; + } +} +/* write_histo_clusdin: copies 40 entries from histograms_clusdin_tmp to histograms_clusdin_ptr. */ +void write_histo_clusdin(int32_t histograms_clusdin_tmp[40], int32_t* histograms_clusdin_ptr) { + for (int i = 0; i < 40; i++) { +#pragma HLS PIPELINE II = 1 + histograms_clusdin_ptr[i] = histograms_clusdin_tmp[i]; + } 
+} +/* write_data: dataflow region that copies the four result buffers to their output pointers. */ +void write_data( +#ifndef __SYNTHESIS__ + std::vector >& histograms_clusd_tmp, +#else + unsigned histograms_clusd_tmp[128][40], +#endif + uint32_t histo_size_clusd_tmp[128], + uint8_t ctx_map_tmp[4096], + int32_t histograms_clusdin_tmp[40], + int32_t* histograms_clusd_ptr, + uint32_t* histo_size_clusd_ptr, + uint8_t* ctx_map_ptr, + int32_t* histograms_clusdin_ptr) { +#pragma HLS DATAFLOW + write_histo_clusd(histograms_clusd_tmp, histograms_clusd_ptr); + + write_size_clusd(histo_size_clusd_tmp, histo_size_clusd_ptr); + + write_ctx_map(ctx_map_tmp, ctx_map_ptr); + + write_histo_clusdin(histograms_clusdin_tmp, histograms_clusdin_ptr); +} + +/** + * @brief JXL ANS cluster Histogram kernel + * + * @param config configuration for the kernel. + * @param histograms0_ptr histograms for Block Context Map. + * @param histo_totalcnt0_ptr Count of context for histograms for Block Context Map. + * @param histo_size0_ptr size for each context + * @param nonempty_histo0_ptr indicate which context is empty + * @param ctx_map0_ptr the input context map + * @param histograms_clusd0_ptr the clustered histogram + * @param histograms_clusdin0_ptr the context for the clustered histogram + * @param histograms1_ptr histograms for Modular frame tree. + * @param histo_totalcnt1_ptr Count of context for histograms for Modular frame tree. + * @param histo_size1_ptr size for each context + * @param nonempty_histo1_ptr indicate which context is empty + * @param ctx_map1_ptr the input context map + * @param histograms_clusd1_ptr the clustered histogram + * @param histograms_clusdin1_ptr the context for the clustered histogram + * @param histograms2_ptr histograms for code from Modular frame. + * @param histo_totalcnt2_ptr Count of context for histograms for Modular frame. 
+ * @param histo_size2_ptr size for each context + * @param nonempty_histo2_ptr indicate which context is empty + * @param ctx_map2_ptr the input context map + * @param histograms_clusd2_ptr the clustered histogram + * @param histograms_clusdin2_ptr the context for the clustered histogram + * @param histograms3_ptr histograms for coef orders. + * @param histo_totalcnt3_ptr Count of context for histograms for coef orders. + * @param histo_size3_ptr size for each context + * @param nonempty_histo3_ptr indicate which context is empty + * @param ctx_map3_ptr the input context map + * @param histograms_clusd3_ptr the clustered histogram + * @param histograms_clusdin3_ptr the context for the clustered histogram + * @param histograms4_ptr histograms for ac coefficients. + * @param histo_totalcnt4_ptr Count of context for histograms for ac coefficients. + * @param histo_size4_ptr size for each context + * @param nonempty_histo4_ptr indicate which context is empty + * @param ctx_map4_ptr the input context map + * @param histograms_clusd4_ptr the clustered histogram + * @param histograms_clusdin4_ptr the context for the clustered histogram + */ + +/* + * hls_ANSclusterHistogram_core: clusters ONE histogram group, step by step (no dataflow): + * 1. load_data copies the inputs into local buffers and zeroes the output buffers; + * 2. hls_fastclusterHistogram_wrapper clusters them and sets numHisto_clusd_ptr; + * 3. buildCTXHistogram fills histograms_clusdin_tmp and histo_size_clusdin_ptr from the context map; + * 4. write_data copies the clustered histograms, their sizes, the context map and the + * context-map histogram to the *_ptr outputs. + * In C simulation the scratch buffers are heap-allocated and released before returning; + * in synthesis they are local arrays bound to URAM/LUTRAM. + * NOTE(review): the Doxygen block above lists the top-level kernel arguments, not this function's parameters. + */ +// clang-format off +void hls_ANSclusterHistogram_core( + uint32_t numNonempty_ptr, + uint32_t* nonempty_histo_ptr, + + uint32_t lidx_ptr, + uint32_t numHisto_ptr, + uint32_t* histo_totalcnt_ptr, + uint32_t* histo_size_ptr, + int32_t* histograms_ptr, + + uint8_t* ctx_map_ptr, + uint32_t* histo_size_clusd_ptr, + int32_t* histograms_clusd_ptr, + + int32_t* histograms_clusdin_ptr, + uint32_t& numHisto_clusd_ptr, + uint32_t& histo_size_clusdin_ptr) { +// clang-format on +// No dataflow, sequentially run +#ifndef __SYNTHESIS__ + std::vector > > histograms_tmp(4096, std::vector >(20)); + uint32_t* nonempty_histo_tmp = (uint32_t*)malloc(4096 * sizeof(uint32_t)); + uint32_t* histo_totalcnt_tmp = (uint32_t*)malloc(4096 * sizeof(uint32_t)); + uint32_t* histo_size_tmp = (uint32_t*)malloc(4096 * sizeof(uint32_t)); + + uint32_t* 
histo_size_clusd_tmp = (uint32_t*)malloc(128 * sizeof(uint32_t)); + std::vector > histograms_clusd_tmp(128, std::vector(40)); + uint8_t* ctx_map_tmp = (uint8_t*)malloc(4096 * sizeof(uint8_t)); + + int32_t* histograms_clusdin_tmp = (int32_t*)malloc(40 * sizeof(int32_t)); +#else + ap_uint<64> histograms_tmp[4096][20]; +#pragma HLS BIND_STORAGE impl = URAM variable = histograms_tmp +#pragma HLS ARRAY_PARTITION variable = histograms_tmp complete dim = 2 + uint32_t nonempty_histo_tmp[4096]; +#pragma HLS BIND_STORAGE impl = URAM variable = nonempty_histo_tmp + uint32_t histo_totalcnt_tmp[4096]; +#pragma HLS BIND_STORAGE impl = URAM variable = histo_totalcnt_tmp + uint32_t histo_size_tmp[4096]; +#pragma HLS BIND_STORAGE impl = URAM variable = histo_size_tmp + unsigned histograms_clusd_tmp[128][40]; +#pragma HLS BIND_STORAGE impl = LUTRAM variable = histograms_clusd_tmp + uint32_t histo_size_clusd_tmp[128]; +#pragma HLS BIND_STORAGE impl = LUTRAM variable = histo_size_clusd_tmp + uint8_t ctx_map_tmp[4096]; +#pragma HLS BIND_STORAGE impl = URAM variable = ctx_map_tmp + int32_t histograms_clusdin_tmp[40]; +#pragma HLS ARRAY_PARTITION variable = histograms_clusdin_tmp complete dim = 0 +#endif + + load_data(numNonempty_ptr, histograms_ptr, nonempty_histo_ptr, histo_totalcnt_ptr, histo_size_ptr, histograms_tmp, + nonempty_histo_tmp, histo_totalcnt_tmp, histo_size_tmp, histograms_clusdin_tmp, histograms_clusd_tmp, + ctx_map_tmp); + + hls_fastclusterHistogram_wrapper(lidx_ptr, numNonempty_ptr, nonempty_histo_tmp, numHisto_ptr, histo_totalcnt_tmp, + histo_size_tmp, histograms_tmp, numHisto_clusd_ptr, histo_size_clusd_tmp, + histograms_clusd_tmp, ctx_map_tmp); + + buildCTXHistogram(numHisto_ptr, ctx_map_tmp, numHisto_clusd_ptr, histograms_clusdin_tmp, histo_size_clusdin_ptr); + + write_data(histograms_clusd_tmp, histo_size_clusd_tmp, ctx_map_tmp, histograms_clusdin_tmp, histograms_clusd_ptr, + histo_size_clusd_ptr, ctx_map_ptr, histograms_clusdin_ptr); + +#ifndef __SYNTHESIS__ + /* Release the C-simulation scratch buffers obtained from malloc() above; they were previously leaked on every call. */ + free(nonempty_histo_tmp); + free(histo_totalcnt_tmp); + free(histo_size_tmp); + free(histo_size_clusd_tmp); + free(ctx_map_tmp); + free(histograms_clusdin_tmp); +#endif +} + +namespace xf { 
+namespace codec { + +/** +* @brief JXL ANS cluster Histogram kernel +* +* @param config configuration for the kernel. +* @param histograms0_ptr histograms for Block Context Map. +* @param histo_totalcnt0_ptr Count of context for histograms for Block Context Map. +* @param histo_size0_ptr size for each context +* @param nonempty_histo0_ptr indicate which context is empty +* @param ctx_map0_ptr the input context map +* @param histograms_clusd0_ptr the clustered histogram +* @param histograms_clusdin0_ptr the context for the clustered histogram +* @param histograms1_ptr histograms for Modular frame tree. +* @param histo_totalcnt1_ptr Count of context for histograms for Modular frame tree. +* @param histo_size1_ptr size for each context +* @param nonempty_histo1_ptr indicate which context is empty +* @param ctx_map1_ptr the input context map +* @param histograms_clusd1_ptr the clustered histogram +* @param histograms_clusdin1_ptr the context for the clustered histogram +* @param histograms2_ptr histograms for code from Modular frame. +* @param histo_totalcnt2_ptr Count of context for histograms for Modular frame. +* @param histo_size2_ptr size for each context +* @param nonempty_histo2_ptr indicate which context is empty +* @param ctx_map2_ptr the input context map +* @param histograms_clusd2_ptr the clustered histogram +* @param histograms_clusdin2_ptr the context for the clustered histogram +* @param histograms3_ptr histograms for coef orders. +* @param histo_totalcnt3_ptr Count of context for histograms for coef orders. +* @param histo_size3_ptr size for each context +* @param nonempty_histo3_ptr indicate which context is empty +* @param ctx_map3_ptr the input context map +* @param histograms_clusd3_ptr the clustered histogram +* @param histograms_clusdin3_ptr the context for the clustered histogram +* @param histograms4_ptr histograms for ac coefficients. +* @param histo_totalcnt4_ptr Count of context for histograms for ac coefficients. 
+* @param histo_size4_ptr size for each context +* @param nonempty_histo4_ptr indicate which context is empty +* @param ctx_map4_ptr the input context map +* @param histograms_clusd4_ptr the clustered histogram +* @param histograms_clusdin4_ptr the context for the clustered histogram +*/ + +// clang-format off + extern "C" void JxlEnc_ans_clusterHistogram( + uint32_t* config, + + int32_t* histograms0_ptr, + uint32_t* histo_totalcnt0_ptr, + uint32_t* histo_size0_ptr, + + uint32_t* nonempty_histo0_ptr, + + uint8_t* ctx_map0_ptr, + + int32_t* histograms_clusd0_ptr, + uint32_t* histo_size_clusd0_ptr, + + int32_t* histograms_clusdin0_ptr, + //==================== + int32_t* histograms1_ptr, + uint32_t* histo_totalcnt1_ptr, + uint32_t* histo_size1_ptr, + + uint32_t* nonempty_histo1_ptr, + + uint8_t* ctx_map1_ptr, + + int32_t* histograms_clusd1_ptr, + uint32_t* histo_size_clusd1_ptr, + + int32_t* histograms_clusdin1_ptr, + //====================== + int32_t* histograms2_ptr, + uint32_t* histo_totalcnt2_ptr, + uint32_t* histo_size2_ptr, + + uint32_t* nonempty_histo2_ptr, + + uint8_t* ctx_map2_ptr, + + int32_t* histograms_clusd2_ptr, + uint32_t* histo_size_clusd2_ptr, + + int32_t* histograms_clusdin2_ptr, + //====================== + int32_t* histograms3_ptr, + uint32_t* histo_totalcnt3_ptr, + uint32_t* histo_size3_ptr, + + uint32_t* nonempty_histo3_ptr, + + uint8_t* ctx_map3_ptr, + + int32_t* histograms_clusd3_ptr, + uint32_t* histo_size_clusd3_ptr, + + int32_t* histograms_clusdin3_ptr, + //====================== + int32_t* histograms4_ptr, + uint32_t* histo_totalcnt4_ptr, + uint32_t* histo_size4_ptr, + + uint32_t* nonempty_histo4_ptr, + + uint8_t* ctx_map4_ptr, + + int32_t* histograms_clusd4_ptr, + uint32_t* histo_size_clusd4_ptr, + + int32_t* histograms_clusdin4_ptr +) { +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histogram_gmem port = 
histograms0_ptr depth = 163840 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histogram_gmem port = histograms1_ptr depth = 163840 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histogram_gmem port = histograms2_ptr depth = 163840 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histogram_gmem port = histograms3_ptr depth = 163840 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histogram_gmem port = histograms4_ptr depth = 163840 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histocnt_gmem port = histo_totalcnt0_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histocnt_gmem port = histo_totalcnt1_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histocnt_gmem port = histo_totalcnt2_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histocnt_gmem port = histo_totalcnt3_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 
max_write_burst_length = 2 max_read_burst_length = 64 bundle = histocnt_gmem port = histo_totalcnt4_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histosize_gmem port = histo_size0_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histosize_gmem port = histo_size1_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histosize_gmem port = histo_size2_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histosize_gmem port = histo_size3_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = histosize_gmem port = histo_size4_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = nonempty_gmem port = nonempty_histo0_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = nonempty_gmem port = nonempty_histo1_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = nonempty_gmem port = nonempty_histo2_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave 
latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = nonempty_gmem port = nonempty_histo3_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \ + 8 max_write_burst_length = 2 max_read_burst_length = 64 bundle = nonempty_gmem port = nonempty_histo4_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = ctx_gmem port = ctx_map0_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = ctx_gmem port = ctx_map1_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = ctx_gmem port = ctx_map2_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = ctx_gmem port = ctx_map3_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = ctx_gmem port = ctx_map4_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusd_gmem port = histograms_clusd0_ptr depth = 5120 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusd_gmem port = histograms_clusd1_ptr depth = 5120 + +#pragma HLS 
INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusd_gmem port = histograms_clusd2_ptr depth = 5120 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusd_gmem port = histograms_clusd3_ptr depth = 5120 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusd_gmem port = histograms_clusd4_ptr depth = 5120 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histosize_clusd_gmem port = histo_size_clusd0_ptr depth = 128 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histosize_clusd_gmem port = histo_size_clusd1_ptr depth = 128 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histosize_clusd_gmem port = histo_size_clusd2_ptr depth = 128 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histosize_clusd_gmem port = histo_size_clusd3_ptr depth = 128 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histosize_clusd_gmem port = histo_size_clusd4_ptr depth = 128 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 
1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusdin_gmem port = histograms_clusdin0_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusdin_gmem port = histograms_clusdin1_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusdin_gmem port = histograms_clusdin2_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusdin_gmem port = histograms_clusdin3_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusdin_gmem port = histograms_clusdin4_ptr depth = 4096 + +#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 8 num_read_outstanding = \ + 1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = histo_clusdin_gmem port = config depth = 35 + // clang-format on + + // No dataflow, run sequentially + + uint32_t do_once[5]; + do_once[0] = config[25]; + do_once[1] = config[26]; + do_once[2] = config[27]; + do_once[3] = config[28]; + do_once[4] = config[29]; + + uint32_t numHisto0_ptr = config[0]; + uint32_t numNonempty0_ptr = config[5]; + uint32_t lidx0_ptr = config[10]; + uint32_t numHisto_clusd0_ptr; + uint32_t histo_size_clusdin0_ptr; + + uint32_t numHisto1_ptr = config[1]; + uint32_t numNonempty1_ptr = config[6]; + uint32_t lidx1_ptr = config[11]; + uint32_t numHisto_clusd1_ptr; + uint32_t histo_size_clusdin1_ptr; + + uint32_t numHisto2_ptr = config[2]; + uint32_t numNonempty2_ptr = config[7]; + uint32_t lidx2_ptr = 
config[12]; + + uint32_t numHisto_clusd2_ptr; + uint32_t histo_size_clusdin2_ptr; + + uint32_t numHisto3_ptr = config[3]; + uint32_t numNonempty3_ptr = config[8]; + uint32_t lidx3_ptr = config[13]; + + uint32_t numHisto_clusd3_ptr; + uint32_t histo_size_clusdin3_ptr; + + uint32_t numHisto4_ptr = config[4]; + uint32_t numNonempty4_ptr = config[9]; + uint32_t lidx4_ptr = config[14]; + + uint32_t numHisto_clusd4_ptr; + uint32_t histo_size_clusdin4_ptr; + + if (do_once[0] != 0) { + // clang-format off + hls_ANSclusterHistogram_core( + numNonempty0_ptr, + nonempty_histo0_ptr, + + lidx0_ptr, + numHisto0_ptr, + histo_totalcnt0_ptr, + histo_size0_ptr, + histograms0_ptr, + + ctx_map0_ptr, + histo_size_clusd0_ptr, + histograms_clusd0_ptr, + + histograms_clusdin0_ptr, + numHisto_clusd0_ptr, + histo_size_clusdin0_ptr); + // clang-format on + } + + if (do_once[1] != 0) { + // clang-format off + hls_ANSclusterHistogram_core( + numNonempty1_ptr, + nonempty_histo1_ptr, + + lidx1_ptr, + numHisto1_ptr, + histo_totalcnt1_ptr, + histo_size1_ptr, + histograms1_ptr, + + ctx_map1_ptr, + histo_size_clusd1_ptr, + histograms_clusd1_ptr, + + histograms_clusdin1_ptr, + numHisto_clusd1_ptr, + histo_size_clusdin1_ptr); + // clang-format on + } + + if (do_once[2] != 0) { + // clang-format off + hls_ANSclusterHistogram_core( + numNonempty2_ptr, + nonempty_histo2_ptr, + + lidx2_ptr, + numHisto2_ptr, + histo_totalcnt2_ptr, + histo_size2_ptr, + histograms2_ptr, + + ctx_map2_ptr, + histo_size_clusd2_ptr, + histograms_clusd2_ptr, + + histograms_clusdin2_ptr, + numHisto_clusd2_ptr, + histo_size_clusdin2_ptr); + // clang-format on + } + + if (do_once[3] != 0) { + // clang-format off + hls_ANSclusterHistogram_core( + numNonempty3_ptr, + nonempty_histo3_ptr, + + lidx3_ptr, + numHisto3_ptr, + histo_totalcnt3_ptr, + histo_size3_ptr, + histograms3_ptr, + + ctx_map3_ptr, + histo_size_clusd3_ptr, + histograms_clusd3_ptr, + + histograms_clusdin3_ptr, + numHisto_clusd3_ptr, + histo_size_clusdin3_ptr); + // 
clang-format on + } + + if (do_once[4] != 0) { +// clang-format off + #pragma HLS ALLOCATION function instances = hls_ANSclusterHistogram_core limit = 1 + hls_ANSclusterHistogram_core( + numNonempty4_ptr, + nonempty_histo4_ptr, + + lidx4_ptr, + numHisto4_ptr, + histo_totalcnt4_ptr, + histo_size4_ptr, + histograms4_ptr, + + ctx_map4_ptr, + histo_size_clusd4_ptr, + histograms_clusd4_ptr, + + histograms_clusdin4_ptr, + numHisto_clusd4_ptr, + histo_size_clusdin4_ptr); + // clang-format on + } + + config[19] = numHisto_clusd4_ptr; + config[24] = histo_size_clusdin4_ptr; + config[18] = numHisto_clusd3_ptr; + config[23] = histo_size_clusdin3_ptr; + config[17] = numHisto_clusd2_ptr; + config[22] = histo_size_clusdin2_ptr; + config[16] = numHisto_clusd1_ptr; + config[21] = histo_size_clusdin1_ptr; + config[15] = numHisto_clusd0_ptr; + config[20] = histo_size_clusdin0_ptr; + // printf("[KERNEL] cluster size = (%d, %d, %d, %d, %d)\n", numHisto_clusd0_ptr, numHisto_clusd1_ptr, + // numHisto_clusd2_ptr, numHisto_clusd3_ptr, numHisto_clusd4_ptr); + // printf("[KERNEL] cluster in size = (%d, %d, %d, %d, %d)\n", histo_size_clusdin0_ptr, histo_size_clusdin1_ptr, + // histo_size_clusdin2_ptr, histo_size_clusdin3_ptr, histo_size_clusdin4_ptr); + // for(int i=0; i= 2022.1), 1) +LINK_TARGET_FMT := xsa +else +LINK_TARGET_FMT := xclbin +endif +else +LINK_TARGET_FMT := xclbin +endif + +#Checks for Device Family +ifeq ($(HOST_ARCH), aarch32) + DEV_FAM = 7Series +else ifeq ($(HOST_ARCH), aarch64) + DEV_FAM = Ultrascale +endif + +#Checks for Correct architecture +ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) +$(error HOST_ARCH variable not set, please set correctly and rerun) +endif + +check_version: +ifneq (, $(shell which git)) +ifneq (,$(wildcard $(XFLIB_DIR)/.git)) + @cd $(XFLIB_DIR) && git log --graph --pretty=format:'%Cred%h%Creset -%C(yellow)%d%Creset %s %Cgreen(%cr) %C(bold blue)<%an>%Creset' --abbrev-commit -n 1 && cd - +endif +endif + +#Checks for SYSROOT 
+check_sysroot: +ifneq ($(HOST_ARCH), x86) +ifndef SYSROOT + $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif + +#Checks for g++ +CXX := g++ +ifeq ($(HOST_ARCH), x86) +ifeq ($(shell expr $(VITIS_VER) \>= 2022.1), 1) +CXX_VER := 8.3.0 +else +CXX_VER := 6.2.0 +endif +CXX_V := $(shell echo $(CXX_VER) | awk -F. '{print tolower($$1)}') +ifneq ($(shell expr $(shell echo "__GNUG__" | g++ -E -x c++ - | tail -1) \>= $(CXX_V)), 1) +ifndef XILINX_VIVADO +$(error [ERROR]: g++ version too old. Please use $(CXX_VER) or above) +else +CXX := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/bin/g++ +ifeq ($(LD_LIBRARY_PATH),) +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/lib64 +else +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/lib64:$(LD_LIBRARY_PATH) +endif +$(warning [WARNING]: g++ version too old. Using g++ provided by the tool: $(CXX)) +endif +endif +else ifeq ($(HOST_ARCH), aarch64) +CXX := $(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++ +else ifeq ($(HOST_ARCH), aarch32) +CXX := $(XILINX_VITIS)/gnu/aarch32/lin/gcc-arm-linux-gnueabi/bin/arm-linux-gnueabihf-g++ +endif + +#Check OS and setting env for xrt c++ api +OSDIST = $(shell lsb_release -i |awk -F: '{print tolower($$2)}' | tr -d ' \t' ) +OSREL = $(shell lsb_release -r |awk -F: '{print tolower($$2)}' |tr -d ' \t') + +# for centos and redhat +ifneq ($(findstring centos,$(OSDIST)),) +ifeq (7,$(shell echo $(OSREL) | awk -F. '{print tolower($$1)}' )) +ifeq ($(HOST_ARCH), x86) +XRT_CXXFLAGS += -D_GLIBCXX_USE_CXX11_ABI=0 +endif +endif +else ifneq ($(findstring redhat,$(OSDIST)),) +ifeq (7,$(shell echo $(OSREL) | awk -F. 
'{print tolower($$1)}' )) +ifeq ($(HOST_ARCH), x86) +XRT_CXXFLAGS += -D_GLIBCXX_USE_CXX11_ABI=0 +endif +endif +endif + +#Setting VPP +VPP := v++ + +#Checks for aiecompiler +AIECXX := aiecompiler +AIESIMULATOR := aiesimulator +X86SIMULATOR := x86simulator + +.PHONY: check_vivado +check_vivado: +ifeq (,$(wildcard $(XILINX_VIVADO)/bin/vivado)) + @echo "Cannot locate Vivado installation. Please set XILINX_VIVADO variable." && false +endif + +.PHONY: check_vpp +check_vpp: +ifeq (,$(wildcard $(XILINX_VITIS)/bin/v++)) + @echo "Cannot locate Vitis installation. Please set XILINX_VITIS variable." && false +endif + +.PHONY: check_xrt +check_xrt: +ifeq (,$(wildcard $(XILINX_XRT)/lib/libxilinxopencl.so)) + @echo "Cannot locate XRT installation. Please set XILINX_XRT variable." && false +endif + +export PATH := $(XILINX_VITIS)/bin:$(XILINX_XRT)/bin:$(PATH) +ifeq ($(HOST_ARCH), x86) +ifeq (,$(LD_LIBRARY_PATH)) +LD_LIBRARY_PATH := $(XILINX_XRT)/lib +else +LD_LIBRARY_PATH := $(XILINX_XRT)/lib:$(LD_LIBRARY_PATH) +endif +endif + +ifneq (,$(wildcard $(PLATFORM))) +# Use PLATFORM as a file path +XPLATFORM := $(PLATFORM) +else +# Use PLATFORM as a file name pattern +# 1. search paths specified by variable +ifneq (,$(PLATFORM_REPO_PATHS)) +# 1.1 as exact name +XPLATFORM := $(strip $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/$(PLATFORM)/$(PLATFORM).xpfm))) +# 1.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/*/*.xpfm)) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 1.2 +endif # 1 +# 2. 
search Vitis installation +ifeq (,$(XPLATFORM)) +# 2.1 as exact name +XPLATFORM := $(strip $(wildcard $(XILINX_VITIS)/platforms/$(PLATFORM)/$(PLATFORM).xpfm)) +# 2.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(wildcard $(XILINX_VITIS)/platforms/*/*.xpfm) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 2.2 +endif # 2 +# 3. search default locations +ifeq (,$(XPLATFORM)) +# 3.1 as exact name +XPLATFORM := $(strip $(wildcard /opt/xilinx/platforms/$(PLATFORM)/$(PLATFORM).xpfm)) +# 3.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(wildcard /opt/xilinx/platforms/*/*.xpfm) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 3.2 +endif # 3 +endif + +define MSG_PLATFORM +No platform matched pattern '$(PLATFORM)'. +Available platforms are: $(XPLATFORMS) +To add more platform directories, set the PLATFORM_REPO_PATHS variable or point PLATFORM variable to the full path of platform .xpfm file. +endef +export MSG_PLATFORM + + +.PHONY: check_platform +check_platform: +ifeq (,$(XPLATFORM)) + @echo "$${MSG_PLATFORM}" && false +endif +#Check ends + +# device2xsa - create a filesystem friendly name from device name +# $(1) - full name of device +PLATFORM_NAME = $(strip $(patsubst %.xpfm, % , $(shell basename $(PLATFORM)))) + + +# Cleaning stuff +RM = rm -f +RMDIR = rm -rf + +MV = mv -f +CP = cp -rf +ECHO:= @echo diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/Makefile b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/Makefile new file mode 100644 index 0000000000..2fb67f323b --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/Makefile @@ -0,0 +1,333 @@ +# Copyright 2019-2022 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# vitis makefile-generator v2.0.6 + +############################## Help Section ############################## +.PHONY: help + +help:: + $(ECHO) "Makefile Usage:" + $(ECHO) " make all TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to generate the design for specified Target and Shell." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " make run TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to run application in emulation." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH required for SoC shells" + $(ECHO) "" + $(ECHO) " make xclbin TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to build xclbin application." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " make host TARGET=" + $(ECHO) " Command to build host application." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. For example," + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) "" + $(ECHO) " make clean " + $(ECHO) " Command to remove the generated non-hardware files." + $(ECHO) "" + $(ECHO) " make cleanall" + $(ECHO) " Command to remove all the generated files." 
+ $(ECHO) "" + +############################## Setting up Project Variables ############################## + +MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +XF_PROJ_ROOT ?= $(shell bash -c 'export MK_PATH=$(MK_PATH); echo $${MK_PATH%/L2/*}') +CUR_DIR := $(patsubst %/,%,$(dir $(MK_PATH))) +XFLIB_DIR = $(XF_PROJ_ROOT) + +# setting default value +TARGET ?= sw_emu +HOST_ARCH ?= x86 + +#setting PLATFORM +ifeq ($(PLATFORM),) +PLATFORM := $(DEVICE) +endif +ifeq ($(PLATFORM),) +PLATFORM := xilinx_u50_gen3x16_xdma_5_202210_1 +endif + +# #################### Checking if PLATFORM in whitelist ############################ +PLATFORM_ALLOWLIST += u50 +PLATFORM_BLOCKLIST += zc + +include ./utils.mk +TEMP_DIR := _x_temp.$(TARGET).$(PLATFORM_NAME) +TEMP_REPORT_DIR := $(CUR_DIR)/reports/_x.$(TARGET).$(PLATFORM_NAME) +BUILD_DIR := build_dir.$(TARGET).$(PLATFORM_NAME) +ifneq ($(RESULT_DIR),) +BUILD_DIR = $(RESULT_DIR) +endif +BUILD_REPORT_DIR := $(CUR_DIR)/reports/_build.$(TARGET).$(PLATFORM_NAME) +EMCONFIG := $(BUILD_DIR)/emconfig.json +XCLBIN_DIR := $(CUR_DIR)/$(BUILD_DIR) +export XCL_BINDIR = $(XCLBIN_DIR) + +EXE_FILE_DEPS := +BINARY_CONTAINERS_DEPS := +RUN_DEPS := + +# get global setting +ifeq ($(HOST_ARCH), x86) +CXXFLAGS += -fmessage-length=0 -I$(CUR_DIR)/src/ -I$(XILINX_XRT)/include -I$(XILINX_HLS)/include -std=c++14 -O3 -Wall -Wno-unknown-pragmas -Wno-unused-label +LDFLAGS += -pthread -L$(XILINX_XRT)/lib -L$(XILINX_HLS)/lnx64/tools/fpo_v7_0 -Wl,--as-needed -lOpenCL -lxrt_coreutil -lgmp -lmpfr -lIp_floating_point_v7_0_bitacc_cmodel +VPP_FLAGS += -t $(TARGET) --platform $(XPLATFORM) --save-temps +VPP_LDFLAGS += --optimize 2 -R 2 +else ifeq ($(HOST_ARCH), aarch64) +CXXFLAGS += -I$(CUR_DIR)/src/ -fmessage-length=0 --sysroot=$(SYSROOT) -I$(SYSROOT)/usr/include/xrt -I$(XILINX_HLS)/include -std=c++14 -O3 -Wall -Wno-unknown-pragmas -Wno-unused-label +LDFLAGS += -pthread -L$(SYSROOT)/usr/lib -L$(XILINX_VITIS_AIETOOLS)/lib/aarch64.o -Wl,--as-needed -lxilinxopencl -lxrt_coreutil 
+VPP_FLAGS += -t $(TARGET) --platform $(XPLATFORM) --save-temps +VPP_LDFLAGS += --optimize 2 -R 2 +endif +CXXFLAGS += $(EXTRA_CXXFLAGS) +VPP_FLAGS += $(EXTRA_VPP_FLAGS) + +########################## Setting up Host Variables ########################## +ifeq ($(TARGET),sw_emu) +CXXFLAGS += -D SW_EMU_TEST +endif +ifeq ($(TARGET),hw_emu) +CXXFLAGS += -D HW_EMU_TEST +endif + +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + +#Include Required Host Source Files +HOST_SRCS += $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cjxl.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cjxl_main.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cmdline.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/codec_config.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/speed_stats.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cpu/cpu.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cpu/os_specific.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/box/box.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/time.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/memory.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references_hq.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/brotli_bit_stream.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/block_splitter.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/metablock.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment_two_pass.c 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encoder_dict.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/utf8_util.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/decode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/static_dict.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/literal_cost.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/entropy_encode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/bit_cost.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/cluster.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/dictionary_hash.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/histogram.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/bit_reader.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/huffman.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/state.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/transform.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmslut.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsnamed.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspack.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscnvrt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio1.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgmt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsopt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsalpha.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmstypes.cpp 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsintrp.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgamma.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscam02.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscgats.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmshalf.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsmtrx.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsps2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssamp.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssm.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsxform.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio0.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsplugin.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmserr.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspcs.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmswtpnt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsvirt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lodepng/lodepng.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/aligned_allocator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/targets.cc $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_host.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase1.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase3.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute/host/host_lossy_enc_compute.cpp $(XFLIB_DIR)/ext/xcl2/xcl2.cpp +CXXFLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I 
$(XFLIB_DIR)/ext/xcl2 -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/ -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/build/lib/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lodepng -I $(XFLIB_DIR)/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute -I $(XFLIB_DIR)/L2/demos/jxlEnc/others/include/ -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute/kernel -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute/host
+CXXFLAGS += -O3
+
+# Host executable name/location; its prerequisites are the host sources
+# (HOST_SRCS is expanded here, at assignment time).
+EXE_NAME := host.exe
+EXE_FILE := $(BUILD_DIR)/$(EXE_NAME)
+EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS)
+
+# Command-line arguments passed to the host executable by the run recipes.
+HOST_ARGS := --xclbin $(BUILD_DIR)/jxlEnc.xclbin $(XFLIB_DIR)/L2/demos/jxlEnc/images/small32x32.png small32x32.jxl
+ifneq ($(HOST_ARCH), x86)
+# Same arguments with the directory part of every word removed.
+PKG_HOST_ARGS = $(foreach args,$(HOST_ARGS),$(subst $(dir $(patsubst %/,%,$(args))),,$(args)))
+endif
+
+########################## Kernel compiler global settings ##########################
+# Platforms whose name contains "u50" additionally get the connectivity
+# config file.  The substring test uses the built-in findstring, so no shell
+# is forked while the makefile is parsed.
+ifneq (,$(findstring u50,$(XPLATFORM)))
+VPP_FLAGS += --config $(CUR_DIR)/conn_u50.cfg
+endif
+# Kernel include paths are the same for every platform.
+VPP_FLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/L2/include/hw/jxlEnc
+
+######################### binary container global settings ##########################
+VPP_FLAGS_JxlEnc_lossy_enc_compute += -D KERNEL_NAME=JxlEnc_lossy_enc_compute
+VPP_FLAGS_JxlEnc_lossy_enc_compute += --hls.clock 300000000:JxlEnc_lossy_enc_compute
+# The link-time frequency option is spelled differently for x86 and non-x86 hosts.
+ifeq ($(HOST_ARCH), x86)
+VPP_LDFLAGS_jxlEnc += --kernel_frequency 300
+else
+VPP_LDFLAGS_jxlEnc += --clock.defaultFreqHz 300000000
+endif
+VPP_LDFLAGS_jxlEnc_temp := -g --advanced.param compiler.userPostSysLinkOverlayTcl=postSysLink.tcl
+VPP_LDFLAGS_jxlEnc +=
$(VPP_LDFLAGS_jxlEnc_temp) + +ifeq ($(HOST_ARCH), x86) +BINARY_CONTAINERS += $(BUILD_DIR)/jxlEnc.xclbin +else +BINARY_CONTAINERS += $(BUILD_DIR)/jxlEnc_pkg.$(LINK_TARGET_FMT) +BINARY_CONTAINERS_PKG += $(BUILD_DIR)/jxlEnc.xclbin +endif + +# ################ Setting Rules for Binary Containers (Building Kernels) ################ +$(TEMP_DIR)/JxlEnc_lossy_enc_compute.xo: $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute/kernel/hls_lossy_enc_compute.cpp + $(ECHO) "Compiling Kernel: JxlEnc_lossy_enc_compute" + mkdir -p $(TEMP_DIR) + $(VPP) -c $(VPP_FLAGS_JxlEnc_lossy_enc_compute) $(VPP_FLAGS) -k JxlEnc_lossy_enc_compute -I'$(> $(RUN_SCRIPT) +ifneq ($(filter sw_emu hw_emu, $(TARGET)),) + @echo 'export XCL_EMULATION_MODE=$(TARGET)' >> $(RUN_SCRIPT) +endif + @echo 'export XILINX_VITIS=/mnt' >> $(RUN_SCRIPT) + @echo 'export XILINX_XRT=/usr' >> $(RUN_SCRIPT) + @echo 'if [ -f platform_desc.txt ]; then' >> $(RUN_SCRIPT) + @echo ' cp platform_desc.txt /etc/xocl.txt' >> $(RUN_SCRIPT) + @echo 'fi' >> $(RUN_SCRIPT) + @echo './$(EXE_NAME) $(PKG_HOST_ARGS)' >> $(RUN_SCRIPT) + @echo 'return_code=$$?' 
>> $(RUN_SCRIPT)
+	@echo 'if [ $$return_code -ne 0 ]; then' >> $(RUN_SCRIPT)
+	@echo ' echo "ERROR: Embedded host run failed, RC=$$return_code"' >> $(RUN_SCRIPT)
+	@echo 'else' >> $(RUN_SCRIPT)
+	@echo ' echo "INFO: TEST PASSED, RC=0"' >> $(RUN_SCRIPT)
+	@echo 'fi' >> $(RUN_SCRIPT)
+	@echo 'echo "INFO: Embedded host run completed."' >> $(RUN_SCRIPT)
+	@echo 'exit $$return_code' >> $(RUN_SCRIPT)
+# No extra data file or data directory is packaged by default.
+DATA_FILE :=
+DATA_DIR :=
+# Candidate files for the SD card image (DATA_FILE: where define DATAFILE in json).
+SD_FILES += $(RUN_SCRIPT) $(EXE_FILE) $(EMCONFIG) xrt.ini $(DATA_FILE)
+# Emit "--package.sd_file <f>" only for candidates that exist when expanded
+# (wildcard returns the name only for an existing file).
+SD_FILES_WITH_PREFIX = $(foreach f,$(SD_FILES), $(if $(filter $(f),$(wildcard $(f))), --package.sd_file $(f)))
+SD_DIRS_WITH_PREFIX = $(foreach d,$(DATA_DIR),--package.sd_dir $(d))
+PACKAGE_FILES := $(BINARY_CONTAINERS) $(AIE_CONTAINER)
+SD_CARD := $(CUR_DIR)/package_$(TARGET)
+# Parse-time flag: switched to true below when the vck190_base_dfx + hw
+# packaging commands were selected, which disables the generic command.
+vck190_dfx_hw := false
+$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG)
+	@echo "Generating sd_card folder...."
+	mkdir -p $(SD_CARD)
+	chmod a+rx $(BUILD_DIR)/run_script.sh
+ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME)))
+ifeq ($(TARGET),hw)
+	$(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG)
+	$(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG)
+	@echo "### ***** sd_card generation done! ***** ###"
+vck190_dfx_hw := true
+endif
+endif
+ifeq ($(vck190_dfx_hw), false)
+	$(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX)
+	@echo "### ***** sd_card generation done!
***** ###"
+endif
+
+.PHONY: sd_card
+sd_card: $(SD_CARD)
+endif
+############################## Setting Essential Checks and Building Rules ##############################
+# Prerequisites shared by `all` and, through it, `run`.
+RUN_DEPS += $(EXE_FILE) $(BINARY_CONTAINERS) $(EMCONFIG) $(SD_CARD)
+
+.PHONY: mkflag all run
+# Rewrite $(BUILD_DIR)/makefile_args.txt with one word of MAKEFLAGS per line.
+mkflag:
+	mkdir -p $(BUILD_DIR)
+	rm -rf $(BUILD_DIR)/makefile_args.txt
+	@for flag in $(MAKEFLAGS); do echo $$flag >> $(BUILD_DIR)/makefile_args.txt; done
+all: check_device check_vpp check_platform mkflag $(RUN_DEPS)
+# The recipe of `run` is chosen at parse time from TARGET and HOST_ARCH.
+run: all
+#sw_emu and hw_emu share the same recipe: run the host directly on x86,
+#otherwise launch the packaged emulation script and check its log.
+ifneq (,$(filter sw_emu hw_emu, $(TARGET)))
+ifeq ($(HOST_ARCH), x86)
+	LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \
+	XCL_EMULATION_MODE=$(TARGET) $(EXE_FILE) $(HOST_ARGS)
+	./check.sh
+else
+	@echo $(RUN_DEPS)
+	$(SD_CARD)/launch_$(TARGET).sh -no-reboot -run-app $(notdir $(RUN_SCRIPT))
+	grep "TEST PASSED, RC=0" $(SD_CARD)/qemu_output.log || exit 1
+	./check.sh
+endif
+endif
+#hw
+ifeq ($(TARGET), hw)
+ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME)))
+ifneq ($(JENKINS_INTERNAL_BUILD), 1)
+	$(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations"
+else
+	$(ECHO) "Running inside Xilinx regression without converting to .awsxclbin"
+	LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \
+	$(EXE_FILE) $(HOST_ARGS)
+	./check.sh
+endif
+else ifeq ($(HOST_ARCH), x86)
+	LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \
+	$(EXE_FILE) $(HOST_ARGS)
+	./check.sh
+else
+	$(ECHO) "Please copy the content of sd_card folder and data to an SD Card and run on the board"
+endif
+endif
+
+############################## Setting Targets ##############################
+
+# None of these names is a file produced by the build, so all of them,
+# including the cleanh/cleank helpers, are declared phony.
+.PHONY: clean cleanall cleanh cleank emconfig
+emconfig: $(EMCONFIG)
+
+# `host` builds only the host executable; the environment check it depends
+# on is selected by HOST_ARCH.
+.PHONY: host
+ifeq ($(HOST_ARCH), x86)
+host: check_xrt $(EXE_FILE)
+else
+host: check_sysroot $(EXE_FILE)
+endif
+
+# `xclbin` builds only the binary containers.
+.PHONY: xclbin
+ifeq ($(HOST_ARCH), x86)
+xclbin: check_vpp check_xrt $(BINARY_CONTAINERS)
+else
+xclbin: check_vpp check_sysroot $(BINARY_CONTAINERS)
+endif
+
+############################## Cleaning Rules ##############################
+# Every removal is prefixed with '-' so a failing removal does not stop the
+# remaining clean-up steps.
+
+# cleanh: remove the host executable plus logs, reports and scratch files.
+cleanh:
+	-$(RMDIR) $(EXE_FILE) vitis_* TempConfig system_estimate.xtxt *.rpt .run/
+	-$(RMDIR) src/*.ll _xocc_* .Xil dltmp* xmltmp* *.log *.jou *.wcfg *.wdb sample_link.ini sample_compile.ini obj* bin* *.csv *.jpg *.jpeg *.png
+
+# cleank: remove kernel binaries and emulation leftovers.
+# The simulation-script glob is written as one word (pl*start_simulation.sh);
+# split into "pl*start_simulation." and "sh" it never matched the script and
+# instead removed any path literally named "sh".
+cleank:
+	-$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation.sh *.xclbin
+	-$(RMDIR) _x_temp.*
+
+# cleanall: cleanh + cleank, then the build, temp, report and package outputs.
+cleanall: cleanh cleank
+	-$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str
+	-$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut
+
+clean: cleanh
\ No newline at end of file
diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/check.sh b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/check.sh
new file mode 100755
index 0000000000..d9450ab8d2
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/check.sh
@@ -0,0 +1 @@
+echo "bcf0915760ea2ffbfd33a1bb2abe028a small32x32.jxl" | md5sum -c -
diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/conn_u50.cfg b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/conn_u50.cfg
new file mode 100644
index 0000000000..70beb22bc7
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/conn_u50.cfg
@@ -0,0 +1,21 @@
+[hls]
+#pre_tcl=hls_pre.tcl
+
+[connectivity]
+sp=JxlEnc_lossy_enc_compute_1.config:HBM[14]
+sp=JxlEnc_lossy_enc_compute_1.config_fl:HBM[15]
+sp=JxlEnc_lossy_enc_compute_1.hls_opsin_1:HBM[0] +sp=JxlEnc_lossy_enc_compute_1.hls_opsin_2:HBM[1] +sp=JxlEnc_lossy_enc_compute_1.hls_opsin_3:HBM[2] +sp=JxlEnc_lossy_enc_compute_1.quant_field_row:HBM[3] +sp=JxlEnc_lossy_enc_compute_1.masking_field_row:HBM[4] +sp=JxlEnc_lossy_enc_compute_1.aq_map_f:HBM[5] +sp=JxlEnc_lossy_enc_compute_1.cmap_axi:HBM[6] +sp=JxlEnc_lossy_enc_compute_1.ac_coef_axiout:HBM[7] +sp=JxlEnc_lossy_enc_compute_1.strategy_all:HBM[8] +sp=JxlEnc_lossy_enc_compute_1.raw_quant_field_i:HBM[9] +sp=JxlEnc_lossy_enc_compute_1.hls_order:HBM[10] +sp=JxlEnc_lossy_enc_compute_1.hls_dc8x8:HBM[11] +sp=JxlEnc_lossy_enc_compute_1.hls_dc16x16:HBM[12] +sp=JxlEnc_lossy_enc_compute_1.hls_dc32x32:HBM[13] + diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/description.json b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/description.json new file mode 100644 index 0000000000..3e66da981d --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/description.json @@ -0,0 +1,328 @@ +{ + "gui": false, + "name": "JXL ACC_LOSSY_ENC Demo", + "description": "This example is based on Google's PIK, which was chosen as the base framework for JPEG XL. The pikEnc is based on the 'fast mode' of PIK which can provide better encoding efficiency than most other still image encoding methods. The pikEnc is based on Xilinx HLS design methodology and optimized for FPGA architecture.
It can provide higher throughput and lower latency compared to software-based solutions", + "flow": "vitis", + "platform_allowlist": [ + "u50" + ], + "platform_blocklist": [ + "zc" + ], + "platform_properties": { + "u50": { + "v++": { + "compiler": { + "clflags": [ + "--config PROJECT/conn_u50.cfg" + ] + } + } + } + }, + "data": [ + "./data" + ], + "launch": [ + { + "cmd_args": " --xclbin BUILD/jxlEnc.xclbin LIB_DIR/L2/demos/jxlEnc/images/small32x32.png small32x32.jxl", + "name": "generic launch for all flows" + } + ], + "post_launch": [ + { + "launch_cmd": [ + "./check.sh" + ] + } + ], + "host": { + "host_exe": "host.exe", + "compiler": { + "sources": [ + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cjxl.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cjxl_main.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cmdline.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/codec_config.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/speed_stats.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cpu/cpu.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cpu/os_specific.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/box/box.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/time.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc", +
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/memory.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references_hq.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/brotli_bit_stream.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/block_splitter.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/metablock.c", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment_two_pass.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encoder_dict.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/utf8_util.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/decode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/static_dict.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/literal_cost.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/entropy_encode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/bit_cost.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/cluster.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/dictionary_hash.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/histogram.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/bit_reader.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/huffman.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/state.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/transform.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmslut.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsnamed.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspack.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscnvrt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio1.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgmt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsopt.cpp", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsalpha.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmstypes.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsintrp.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgamma.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscam02.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscgats.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmshalf.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsmtrx.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsps2.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssamp.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssm.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsxform.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio0.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsplugin.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmserr.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspcs.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmswtpnt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsvirt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lodepng/lodepng.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/aligned_allocator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/targets.cc", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_host.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase1.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase2.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase3.cpp", + 
"LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute/host/host_lossy_enc_compute.cpp", + "LIB_DIR/ext/xcl2/xcl2.cpp" + ], + "includepaths": [ + "LIB_DIR/../utils/L1/include/", + "LIB_DIR/ext/xcl2", + "LIB_DIR/L2/demos/jxlEnc/third_partys/", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/build/lib/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lodepng", + "LIB_DIR/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute", + "LIB_DIR/L2/demos/jxlEnc/others/include/", + "LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute/kernel", + "LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute/host" + ], + "options": "-O3 " + } + }, + "v++": { + "compiler": { + "includepaths": [ + "LIB_DIR/../utils/L1/include/", + "LIB_DIR/L2/include/hw/jxlEnc" + ] + } + }, + "containers": [ + { + "name": "jxlEnc", + "accelerators": [ + { + "location": "LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute/kernel/hls_lossy_enc_compute.cpp", + "frequency": 300.0, + "clflags": " -D KERNEL_NAME=JxlEnc_lossy_enc_compute", + "name": "JxlEnc_lossy_enc_compute", + "num_compute_units": 1, + "compute_units": [ + { + "name": "JxlEnc_lossy_enc_compute", + "arguments": [ + { + "name": "gmem0_0", + "memory": "DDR[0]" + }, + { + "name": "gmem0_1", + "memory": "DDR[0]" + }, + { + "name": "gmem1_0", + "memory": "DDR[1]" + }, + { + "name": "gmem1_1", + "memory": "DDR[1]" + }, + { + "name": "gmem1_2", + "memory": "DDR[1]" + } + ] + } + ] + } + ], + "ldclflags": "-g --advanced.param compiler.userPostSysLinkOverlayTcl=postSysLink.tcl", + "frequency": 300 + } + ], + "testinfo": { + "disable": false, + "jobs": [ + { + "index": 0, + "dependency": [], + "env": "", + "cmd": "", + "max_memory_MB": { + "vitis_hw_build": 81920, + "vitis_hw_emu": 40960, + "vitis_sw_emu": 10240, + 
"vitis_hw_run": 10240 + }, + "max_time_min": { + "vitis_hw_build": 3200, + "vitis_hw_emu": 1600, + "vitis_sw_emu": 120, + "vitis_hw_run": 10 + } + } + ], + "targets": [ + "vitis_sw_emu", + "vitis_hw_emu", + "vitis_hw" + ], + "category": "canary" + } +} diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/host/host_lossy_enc_compute.cpp b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/host/host_lossy_enc_compute.cpp new file mode 100644 index 0000000000..4e2fac99cb --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/host/host_lossy_enc_compute.cpp @@ -0,0 +1,366 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HOST_LOSSY_ENC_COMPUTE_CPP +#define HOST_LOSSY_ENC_COMPUTE_CPP + +#include +#include + +#include "host_lossy_enc_compute.hpp" + +#ifndef HLS_TEST +#include "xf_utils_sw/logger.hpp" +#include "xcl2.hpp" +#endif + +unsigned long diff(const struct timeval* newTime, const struct timeval* oldTime) { + return (newTime->tv_sec - oldTime->tv_sec) * 1000000 + (newTime->tv_usec - oldTime->tv_usec); +} + +template +T* aligned_alloc(std::size_t num) { + void* ptr = NULL; + if (posix_memalign(&ptr, 4096, num * sizeof(T))) throw std::bad_alloc(); + return reinterpret_cast(ptr); +} + +void hls_lossy_enc_compute_wrapper(std::string xclbinPath, // xclbin + int config[MAX_NUM_CONFIG], // mm15, input + float config_fl[MAX_NUM_CONFIG], // mm16, input + float* hls_opsin_1, // mm1, input + float* hls_opsin_2, // mm2, input + float* hls_opsin_3, // mm3, input + float* hls_quant_field, // mm4, input + float* hls_masking_field, // mm5, input + float* aq_map_f, // mm6, input + int8_t* cmap_axi, // mm7, output + int* ac_coef_axiout, // mm8, output + uint8_t* strategy_all, // mm9, output + int* raw_quant_field_i, // mm10, output + uint32_t* hls_order, // mm11, output + float* hls_dc8x8, // mm12, output + float* hls_dc16x16, // mm13, output + float* hls_dc32x32 // mm14, output + ) { +#ifndef HLS_TEST + + xf::common::utils_sw::Logger logger(std::cout, std::cerr); + cl_int fail; + + struct timeval start_time; // End to end time clock start + gettimeofday(&start_time, 0); + + // platform related operations + std::vector devices = xcl::get_xil_devices(); + cl::Device device = devices[0]; + + // Creating Context and Command Queue for selected Device + cl::Context context(device, NULL, NULL, NULL, &fail); + logger.logCreateContext(fail); + cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &fail); + logger.logCreateCommandQueue(fail); + std::string devName = device.getInfo(); + printf("INFO: Found Device=%s\n", devName.c_str()); + 
cl::Program::Binaries xclBins = xcl::import_binary_file(xclbinPath); + + devices.resize(1); + cl::Program program(context, devices, xclBins, NULL, &fail); + logger.logCreateProgram(fail); + + int repInt = 1; + // create kernels + std::vector hls_lossy_enc_compute(repInt); + for (int i = 0; i < repInt; i++) { + hls_lossy_enc_compute[i] = cl::Kernel(program, "JxlEnc_lossy_enc_compute", &fail); + logger.logCreateKernel(fail); + } + std::cout << "INFO: kernel has been created" << std::endl; + + // 1. create all I/O Buffer + int32_t* hb_config = aligned_alloc(MAX_NUM_CONFIG); + float* hb_config_fl = aligned_alloc(MAX_NUM_CONFIG); + float* hb_hls_opsin_1 = aligned_alloc(ALL_PIXEL); + float* hb_hls_opsin_2 = aligned_alloc(ALL_PIXEL); + float* hb_hls_opsin_3 = aligned_alloc(ALL_PIXEL); + float* hb_hls_quant_field = aligned_alloc(BLOCK8_H * BLOCK8_W); + float* hb_hls_masking_field = aligned_alloc(BLOCK8_H * BLOCK8_W); + float* hb_aq_map_f = aligned_alloc(BLOCK8_H * BLOCK8_W); + int8_t* hb_cmap_axi = aligned_alloc(TILE_W * TILE_H * 2); + int32_t* hb_ac_coef_axiout = aligned_alloc(ALL_PIXEL); + uint8_t* hb_strategy_all = aligned_alloc(BLOCK8_W * BLOCK8_H); + int32_t* hb_raw_quant_field_i = aligned_alloc(BLOCK8_H * BLOCK8_W); + uint32_t* hb_hls_order = aligned_alloc(MAX_ORDER); + float* hb_hls_dc8x8 = aligned_alloc(ALL_PIXEL); + float* hb_hls_dc16x16 = aligned_alloc(ALL_PIXEL); + float* hb_hls_dc32x32 = aligned_alloc(ALL_PIXEL); + + //================================================== + // 2. 
init all the host Buffers + //================================================== + + // input port + for (int j = 0; j < MAX_NUM_CONFIG; j++) { + hb_config[j] = config[j]; + } + + for (int j = 0; j < MAX_NUM_CONFIG; j++) { + hb_config_fl[j] = config_fl[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) { + hb_hls_opsin_1[j] = hls_opsin_1[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) { + hb_hls_opsin_2[j] = hls_opsin_2[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) { + hb_hls_opsin_3[j] = hls_opsin_3[j]; + } + + for (int j = 0; j < BLOCK8_H * BLOCK8_W; j++) { + hb_hls_quant_field[j] = hls_quant_field[j]; + } + + for (int j = 0; j < BLOCK8_H * BLOCK8_W; j++) { + hb_hls_masking_field[j] = hls_masking_field[j]; + } + + for (int j = 0; j < BLOCK8_H * BLOCK8_W; j++) { + hb_aq_map_f[j] = aq_map_f[j]; + } + + // mapping to HBM banks + std::vector mext_o(33); + mext_o[0] = {(((unsigned int)(14)) | XCL_MEM_TOPOLOGY), hb_config, 0}; + mext_o[1] = {(((unsigned int)(15)) | XCL_MEM_TOPOLOGY), hb_config_fl, 0}; + mext_o[2] = {(((unsigned int)(0)) | XCL_MEM_TOPOLOGY), hb_hls_opsin_1, 0}; + mext_o[3] = {(((unsigned int)(1)) | XCL_MEM_TOPOLOGY), hb_hls_opsin_2, 0}; + mext_o[4] = {(((unsigned int)(2)) | XCL_MEM_TOPOLOGY), hb_hls_opsin_3, 0}; + mext_o[5] = {(((unsigned int)(3)) | XCL_MEM_TOPOLOGY), hb_hls_quant_field, 0}; + mext_o[6] = {(((unsigned int)(4)) | XCL_MEM_TOPOLOGY), hb_hls_masking_field, 0}; + mext_o[7] = {(((unsigned int)(5)) | XCL_MEM_TOPOLOGY), hb_aq_map_f, 0}; + mext_o[8] = {(((unsigned int)(6)) | XCL_MEM_TOPOLOGY), hb_cmap_axi, 0}; + mext_o[9] = {(((unsigned int)(7)) | XCL_MEM_TOPOLOGY), hb_ac_coef_axiout, 0}; + mext_o[10] = {(((unsigned int)(8)) | XCL_MEM_TOPOLOGY), hb_strategy_all, 0}; + mext_o[11] = {(((unsigned int)(9)) | XCL_MEM_TOPOLOGY), hb_raw_quant_field_i, 0}; + mext_o[12] = {(((unsigned int)(10)) | XCL_MEM_TOPOLOGY), hb_hls_order, 0}; + mext_o[13] = {(((unsigned int)(11)) | XCL_MEM_TOPOLOGY), hb_hls_dc8x8, 0}; + mext_o[14] = {(((unsigned int)(12)) | 
XCL_MEM_TOPOLOGY), hb_hls_dc16x16, 0}; + mext_o[15] = {(((unsigned int)(13)) | XCL_MEM_TOPOLOGY), hb_hls_dc32x32, 0}; + + //=================================================== + // 3. create device Buffer and map dev buf to host buf, + //=================================================== + cl::Buffer db_config; // mm15, input + cl::Buffer db_config_fl; // mm16, input + cl::Buffer db_hls_opsin_1; // mm1, input + cl::Buffer db_hls_opsin_2; // mm2, input + cl::Buffer db_hls_opsin_3; // mm3, input + cl::Buffer db_hls_quant_field; // mm4, input + cl::Buffer db_hls_masking_field; // mm5, input + cl::Buffer db_aq_map_f; // mm6, input + cl::Buffer db_cmap_axi; // mm7, output + cl::Buffer db_ac_coef_axiout; // mm8, output + cl::Buffer db_strategy_all; // mm9, output + cl::Buffer db_raw_quant_field_i; // mm10, output + cl::Buffer db_hls_order; // mm11, output + cl::Buffer db_hls_dc8x8; // mm12, output + cl::Buffer db_hls_dc16x16; // mm13, output + cl::Buffer db_hls_dc32x32; // mm14, output + + // init cl Buffer + db_config = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int) * MAX_NUM_CONFIG, &mext_o[0]); + db_config_fl = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * MAX_NUM_CONFIG, &mext_o[1]); + db_hls_opsin_1 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * ALL_PIXEL, &mext_o[2]); + db_hls_opsin_2 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * ALL_PIXEL, &mext_o[3]); + db_hls_opsin_3 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * ALL_PIXEL, &mext_o[4]); + db_hls_quant_field = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * (BLOCK8_H * BLOCK8_W), &mext_o[5]); + db_hls_masking_field = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR 
| CL_MEM_READ_WRITE, + sizeof(float) * (BLOCK8_H * BLOCK8_W), &mext_o[6]); + db_aq_map_f = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * (BLOCK8_H * BLOCK8_W), &mext_o[7]); + db_cmap_axi = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int8_t) * (TILE_W * TILE_H * 2), &mext_o[8]); + db_ac_coef_axiout = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int) * ALL_PIXEL, &mext_o[9]); + db_strategy_all = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint8_t) * (BLOCK8_H * BLOCK8_W), &mext_o[10]); + db_raw_quant_field_i = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int) * (BLOCK8_H * BLOCK8_W), &mext_o[11]); + db_hls_order = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * MAX_ORDER, &mext_o[12]); + db_hls_dc8x8 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * ALL_PIXEL, &mext_o[13]); + db_hls_dc16x16 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * ALL_PIXEL, &mext_o[14]); + db_hls_dc32x32 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(float) * ALL_PIXEL, &mext_o[15]); + //================================== + // add Buffers to migrate + std::vector ob_in; + std::vector ob_out; + + ob_in.push_back(db_config); + ob_in.push_back(db_config_fl); + ob_in.push_back(db_hls_opsin_1); + ob_in.push_back(db_hls_opsin_2); + ob_in.push_back(db_hls_opsin_3); + ob_in.push_back(db_hls_quant_field); + ob_in.push_back(db_hls_masking_field); + ob_in.push_back(db_aq_map_f); + + ob_out.push_back(db_cmap_axi); + ob_out.push_back(db_ac_coef_axiout); + ob_out.push_back(db_strategy_all); + ob_out.push_back(db_raw_quant_field_i); + 
ob_out.push_back(db_hls_order); + ob_out.push_back(db_hls_dc8x8); + ob_out.push_back(db_hls_dc16x16); + ob_out.push_back(db_hls_dc32x32); + + // set kernel args + for (int i = 0; i < repInt; i++) { + hls_lossy_enc_compute[i].setArg(0, db_config); + hls_lossy_enc_compute[i].setArg(1, db_config_fl); + hls_lossy_enc_compute[i].setArg(2, db_hls_opsin_1); + hls_lossy_enc_compute[i].setArg(3, db_hls_opsin_2); + hls_lossy_enc_compute[i].setArg(4, db_hls_opsin_3); + hls_lossy_enc_compute[i].setArg(5, db_hls_quant_field); + hls_lossy_enc_compute[i].setArg(6, db_hls_masking_field); + hls_lossy_enc_compute[i].setArg(7, db_aq_map_f); + hls_lossy_enc_compute[i].setArg(8, db_cmap_axi); + hls_lossy_enc_compute[i].setArg(9, db_ac_coef_axiout); + hls_lossy_enc_compute[i].setArg(10, db_strategy_all); + hls_lossy_enc_compute[i].setArg(11, db_raw_quant_field_i); + hls_lossy_enc_compute[i].setArg(12, db_hls_order); + hls_lossy_enc_compute[i].setArg(13, db_hls_dc8x8); + hls_lossy_enc_compute[i].setArg(14, db_hls_dc16x16); + hls_lossy_enc_compute[i].setArg(15, db_hls_dc32x32); + } + + // launch kernel and calculate kernel execution time + std::cout << "INFO: Kernel Start" << std::endl; + // declare events + std::vector events_write(1); + std::vector events_kernel(1); + std::vector events_read(1); + + // migrate, + q.enqueueMigrateMemObjects(ob_in, 0, nullptr, &events_write[0]); + q.enqueueTask(hls_lossy_enc_compute[0], &events_write, &events_kernel[0]); + q.enqueueMigrateMemObjects(ob_out, 1, &events_kernel, &events_read[0]); + q.finish(); + + struct timeval end_time; + gettimeofday(&end_time, 0); + std::cout << "INFO: Finish kernel execution" << std::endl; + std::cout << "INFO: Finish E2E execution" << std::endl; + + // print related times + unsigned long timeStart, timeEnd, exec_time0; + std::cout << "-------------------------------------------------------" << std::endl; + events_write[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + 
events_write[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 = (timeEnd - timeStart) / 1000.0; + std::cout << "INFO: Data transfer from host to device: " << exec_time0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + events_read[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + events_read[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 = (timeEnd - timeStart) / 1000.0; + std::cout << "INFO: Kernel1 Data transfer from device to host: " << exec_time0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + exec_time0 = 0; + for (int i = 0; i < 1; ++i) { + events_kernel[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + events_kernel[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 += (timeEnd - timeStart) / 1000.0; + + std::cout << "INFO: Kernel" << i + 1 << " execution: " << (timeEnd - timeStart) / 1000.0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + } + std::cout << "INFO: kernel total execution: " << exec_time0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + unsigned long exec_timeE2E = diff(&end_time, &start_time); + std::cout << "INFO: FPGA execution time:" << exec_timeE2E << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + + // output + for (int j = 0; j < TILE_W * TILE_H * 2; j++) { + cmap_axi[j] = hb_cmap_axi[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) { + ac_coef_axiout[j] = hb_ac_coef_axiout[j]; + } + + for (int j = 0; j < BLOCK8_W * BLOCK8_H; j++) { + strategy_all[j] = hb_strategy_all[j]; + } + + for (int j = 0; j < BLOCK8_H * BLOCK8_W; j++) { + raw_quant_field_i[j] = hb_raw_quant_field_i[j]; + } + + for (int j = 0; j < MAX_ORDER; j++) { + hls_order[j] = hb_hls_order[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) 
{ + hls_dc8x8[j] = hb_hls_dc8x8[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) { + hls_dc16x16[j] = hb_hls_dc16x16[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) { + hls_dc32x32[j] = hb_hls_dc32x32[j]; + } + + // free mem + free(hb_hls_opsin_1); + free(hb_hls_opsin_2); + free(hb_hls_opsin_3); + free(hb_hls_quant_field); + free(hb_hls_masking_field); + free(hb_aq_map_f); + free(hb_cmap_axi); + free(hb_ac_coef_axiout); + free(hb_strategy_all); + free(hb_raw_quant_field_i); + free(hb_hls_order); + free(hb_hls_dc8x8); + free(hb_hls_dc16x16); + free(hb_hls_dc32x32); + free(hb_config); + free(hb_config_fl); +#else + hls_lossy_enc_compute(config, config_fl, hls_opsin_1, hls_opsin_2, hls_opsin_3, hls_quant_field, hls_masking_field, + aq_map_f, cmap_axi, ac_coef_axiout, strategy_all, raw_quant_field_i, hls_order, hls_dc8x8, + hls_dc16x16, hls_dc32x32); +#endif +} + +#endif diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/host/host_lossy_enc_compute.hpp b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/host/host_lossy_enc_compute.hpp new file mode 100644 index 0000000000..9dc93ad803 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/host/host_lossy_enc_compute.hpp @@ -0,0 +1,60 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HOST_CLUSTER_HISTOGRAM_HPP +#define HOST_CLUSTER_HISTOGRAM_HPP + +#include +#include + +#ifndef HLS_TEST +#include "xcl2.hpp" +#include "xf_utils_sw/logger.hpp" + +const int PIXEL_W = 2048; +const int PIXEL_H = 2048; +const int FRAME_DIM = 3; +const int ALL_PIXEL = PIXEL_W * PIXEL_H * FRAME_DIM; +const int BLOCK8_W = PIXEL_W / 8; +const int BLOCK8_H = PIXEL_H / 8; +const int BLOCK8_NUM = BLOCK8_W * BLOCK8_H * FRAME_DIM; +const int TILE_W = PIXEL_W / 64; +const int TILE_H = PIXEL_H / 64; +const int MAX_ORDER = 320 * 3 + 1; +const int MAX_NUM_CONFIG = 32; + +#else +#include "hls_lossy_enc_compute.hpp" +#endif + +void hls_lossy_enc_compute_wrapper(std::string xclbinPath, + int config[MAX_NUM_CONFIG], + float config_fl[MAX_NUM_CONFIG], + float* hls_opsin_1, + float* hls_opsin_2, + float* hls_opsin_3, + float* quant_field_row, + float* masking_field_row, + float* aq_map_f, + int8_t* cmap_axi, + int* ac_coef_axiout, + uint8_t* strategy_all, + int* raw_quant_field_i, + uint32_t* hls_order, + float* hls_dc8x8, + float* hls_dc16x16, + float* hls_dc32x32); +#endif diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/kernel/hls_lossy_enc_compute.cpp b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/kernel/hls_lossy_enc_compute.cpp new file mode 100644 index 0000000000..c0ce310ce9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/kernel/hls_lossy_enc_compute.cpp @@ -0,0 +1,9420 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HLS_LOSSY_ENC_COMPUTE_CPP +#define HLS_LOSSY_ENC_COMPUTE_CPP + +#include "hls_lossy_enc_compute.hpp" + +#define FIX + +// uint8_t covered_blocks_x_set[6] = {1, 1, 1, 1, 2, 4}; +// uint8_t covered_blocks_y_set[6] = {1, 1, 1, 1, 2, 4}; +uint8_t strategy_block[6] = {1, 1, 1, 1, 2, 4}; + +const float inv_matrix_8[3][64] = {{0, + 560, + 558.510437012, + 489.194152832, + 428.480621338, + 375.302246094, + 328.723815918, + 287.926147461, + 560, + 560, + 541.309387207, + 478.786773682, + 421.547454834, + 370.409942627, + 325.138336182, + 285.227325439, + 558.510437012, + 541.309387207, + 500.443756104, + 451.472991943, + 402.49432373, + 356.627593994, + 314.88571167, + 277.434692383, + 489.194152832, + 478.786773682, + 451.472991943, + 414.922729492, + 375.302246094, + 336.170715332, + 299.277435303, + 265.364807129, + 428.480621338, + 421.547454834, + 402.49432373, + 375.302246094, + 344.016448975, + 311.624298096, + 279.983337402, + 250.119842529, + 375.302246094, + 370.409942627, + 356.627593994, + 336.170715332, + 311.624298096, + 285.227325439, + 258.613525391, + 232.845169067, + 328.723815918, + 325.138336182, + 314.88571167, + 299.277435303, + 279.983337402, + 258.613525391, + 236.484725952, + 214.558776855, + 287.926147461, + 285.227325439, + 277.434692383, + 265.364807129, + 250.119842529, + 232.845169067, + 214.558776855, + 196.071777344}, + {0, + 3150, + 3139.25854492, + 2648.63037109, + 2234.68115234, + 1885.42749023, + 1590.75805664, + 1342.14172363, + 3150, + 3150, + 3015.80957031, + 2576.58398438, + 2188.41503906, + 1853.96557617, + 1568.54064941, + 1326.02929688, + 3139.25854492, + 3015.80957031, + 2726.99536133, + 2389.61645508, + 2062.38256836, + 1765.96655273, + 1505.39343262, + 1279.74853516, + 2648.63037109, + 2576.58398438, + 2389.61645508, + 2144.4074707, + 1885.42749023, + 1637.12109375, + 1410.37487793, + 1208.78967285, + 
2234.68115234, + 2188.41503906, + 2062.38256836, + 1885.42749023, + 1686.28210449, + 1485.42663574, + 1294.84509277, + 1060.59338379, + 1885.42749023, + 1853.96557617, + 1765.96655273, + 1637.12109375, + 1485.42663574, + 1326.02929688, + 1169.49206543, + 785.963012695, + 1590.75805664, + 1568.54064941, + 1505.39343262, + 1410.37487793, + 1294.84509277, + 1169.49206543, + 838.701721191, + 558.03729248, + 1342.14172363, + 1326.02929688, + 1279.74853516, + 1208.78967285, + 1060.59338379, + 785.963012695, + 558.03729248, + 382.654693604}, + {0, + 293.959503174, + 169.469955444, + 119.412483215, + 85.3333358765, + 85.3333358765, + 83.5508270264, + 58.8718566895, + 293.959503174, + 233.598114014, + 156.027160645, + 112.817504883, + 85.3333358765, + 85.3333358765, + 81.1647109985, + 57.4251747131, + 169.469955444, + 156.027160645, + 126.80493927, + 96.6006240845, + 85.3333358765, + 85.3333358765, + 74.5768890381, + 53.3726730347, + 119.412483215, + 112.817504883, + 96.6006240845, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 65.2038497925, + 47.4551811218, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 72.5535202026, + 54.6778106689, + 39.419506073, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 72.5535202026, + 57.4251747131, + 44.3317565918, + 29.2122058868, + 83.5508270264, + 81.1647109985, + 74.5768890381, + 65.2038497925, + 54.6778106689, + 44.3317565918, + 31.1723690033, + 20.7407989502, + 58.8718566895, + 57.4251747131, + 53.3726730347, + 47.4551811218, + 39.419506073, + 29.2122058868, + 20.7407989502, + 14.2222824097}}; +const float inv_matrix_16[3][256] = {{0, + 0, + 2384.4128418, + 2060.98974609, + 1763.60900879, + 1491.73779297, + 1261.77709961, + 1067.26635742, + 956.67767334, + 861.364074707, + 775.546569824, + 703.312927246, + 644.910888672, + 591.358520508, + 542.252990723, + 501.345214844, + 0, + 0, + 2303.75878906, + 2012.80981445, + 1727.63220215, + 1467.21154785, + 1244.41430664, + 
1054.64306641, + 950.44720459, + 856.371826172, + 771.497619629, + 700.552734375, + 642.589599609, + 589.392944336, + 540.578857422, + 500.060272217, + 2384.4128418, + 2303.75878906, + 2113.18408203, + 1884.00744629, + 1629.57141113, + 1398.57958984, + 1195.04504395, + 1031.75708008, + 932.273986816, + 841.744262695, + 759.593811035, + 692.403076172, + 635.722961426, + 583.569458008, + 535.612548828, + 496.2421875, + 2060.98974609, + 2012.80981445, + 1884.00744629, + 1693.40161133, + 1491.73779297, + 1297.99816895, + 1120.69970703, + 996.043395996, + 903.588256836, + 818.460021973, + 740.524108887, + 679.239624023, + 624.590454102, + 574.100036621, + 528.409057617, + 489.997619629, + 1763.60900879, + 1727.63220215, + 1629.57141113, + 1491.73779297, + 1336.38830566, + 1179.42834473, + 1039.25634766, + 950.44720459, + 866.416687012, + 787.946533203, + 717.456176758, + 661.633422852, + 609.623046875, + 561.31427002, + 518.629089355, + 481.495361328, + 1491.73779297, + 1467.21154785, + 1398.57958984, + 1297.99816895, + 1179.42834473, + 1054.64294434, + 975.919921875, + 898.074401855, + 823.012390137, + 751.853820801, + 692.403076172, + 640.284667969, + 591.358520508, + 545.629760742, + 506.54699707, + 470.954223633, + 1261.77709961, + 1244.41430664, + 1195.04504395, + 1120.69970703, + 1039.25634766, + 975.919921875, + 909.174133301, + 841.744262695, + 775.546569824, + 714.580871582, + 664.092590332, + 615.952392578, + 570.392150879, + 528.409057617, + 492.477874756, + 458.628570557, + 1067.26635742, + 1054.64306641, + 1031.75708008, + 996.043395996, + 950.44720459, + 898.074401855, + 841.744262695, + 783.770263672, + 726.22833252, + 679.239624023, + 633.465698242, + 589.392944336, + 547.332580566, + 510.515045166, + 476.757659912, + 444.792907715, + 956.67767334, + 950.44720459, + 932.273986816, + 903.588256836, + 866.416687012, + 823.012390137, + 775.546569824, + 726.22833252, + 684.443725586, + 642.589599609, + 601.375, + 561.31439209, + 524.175048828, + 
491.234863281, + 459.72479248, + 429.72869873, + 861.364074707, + 856.371826172, + 841.744262695, + 818.460021973, + 787.946533203, + 751.853820801, + 714.580871582, + 679.239624023, + 642.589599609, + 605.472290039, + 568.554870605, + 532.708679199, + 501.345214844, + 470.954223633, + 441.705718994, + 413.71182251, + 775.546569824, + 771.497619629, + 759.593811035, + 740.524108887, + 717.456176758, + 692.403076172, + 664.092590332, + 633.465698242, + 601.375, + 568.554870605, + 535.612426758, + 506.546936035, + 477.933990479, + 450.024688721, + 423.003997803, + 395.167694092, + 703.312927246, + 700.552734375, + 692.403076172, + 679.239624023, + 661.633422852, + 640.284667969, + 615.952392578, + 589.392944336, + 561.31439209, + 532.708679199, + 506.546936035, + 480.302856445, + 454.290039062, + 428.756591797, + 403.216186523, + 375.228302002, + 644.910888672, + 642.589599609, + 635.722961426, + 624.590454102, + 609.623046875, + 591.358520508, + 570.392150879, + 547.332580566, + 524.175048828, + 501.345214844, + 477.933990479, + 454.290039062, + 430.704803467, + 407.340545654, + 380.75769043, + 355.171173096, + 591.358520508, + 589.392944336, + 583.569458008, + 574.100036621, + 561.31427002, + 545.629760742, + 528.409057617, + 510.515045166, + 491.234863281, + 470.954223633, + 450.024688721, + 428.756591797, + 407.340545654, + 382.62991333, + 358.535705566, + 335.223266602, + 542.252990723, + 540.578857422, + 535.612548828, + 528.409057617, + 518.629089355, + 506.54699707, + 492.477874756, + 476.757659912, + 459.72479248, + 441.705718994, + 423.003997803, + 403.216186523, + 380.75769043, + 358.535705566, + 336.753845215, + 315.57409668, + 501.345214844, + 500.060272217, + 496.2421875, + 489.997619629, + 481.495361328, + 470.954223633, + 458.628570557, + 444.792907715, + 429.72869873, + 413.71182251, + 395.167694092, + 375.228302002, + 355.171173096, + 335.223266602, + 315.57409668, + 296.378265381}, + {0, + 0, + 5616.41552734, + 4437.54785156, + 3710.52368164, + 
3312.08374023, + 2956.42822266, + 2638.96386719, + 2378.97973633, + 2146.23095703, + 1936.2532959, + 1722.18615723, + 1498.60571289, + 1304.05163574, + 1134.75488281, + 951.882019043, + 0, + 0, + 5312.58251953, + 4271.09716797, + 3658.99584961, + 3275.03710938, + 2928.76391602, + 2617.74536133, + 2363.77954102, + 2134.02709961, + 1926.33569336, + 1711.35717773, + 1489.96264648, + 1297.10559082, + 1129.14038086, + 946.136962891, + 5616.41552734, + 5312.58251953, + 4620.59277344, + 3880.56469727, + 3516.76147461, + 3170.29418945, + 2849.4152832, + 2562.00634766, + 2319.43164062, + 2098.26171875, + 1897.17285156, + 1679.53442383, + 1464.50524902, + 1276.60888672, + 1112.54638672, + 929.184143066, + 4437.54785156, + 4271.09716797, + 3880.56469727, + 3609.64770508, + 3312.08374023, + 3013.74951172, + 2727.90283203, + 2474.97729492, + 2249.39648438, + 2041.30578613, + 1850.4362793, + 1628.60998535, + 1423.58496094, + 1243.54284668, + 1077.57275391, + 901.836975098, + 3710.52368164, + 3658.99584961, + 3516.76147461, + 3312.08374023, + 3073.94458008, + 2824.09741211, + 2580.27368164, + 2363.77954102, + 2158.58081055, + 1966.61950684, + 1778.07653809, + 1561.42590332, + 1369.25976562, + 1199.41723633, + 1031.11547852, + 865.35723877, + 3312.08374023, + 3275.03710938, + 3170.29418945, + 3013.74951172, + 2824.09741211, + 2617.74511719, + 2425.91333008, + 2235.92993164, + 2052.44384766, + 1878.20617676, + 1679.53442383, + 1481.39880371, + 1304.05163574, + 1146.11157227, + 975.344787598, + 821.329833984, + 2956.42822266, + 2928.76391602, + 2849.4152832, + 2727.90283203, + 2580.27368164, + 2425.91333008, + 2263.03759766, + 2098.26171875, + 1936.2532959, + 1766.65966797, + 1570.74584961, + 1392.13525391, + 1230.68457031, + 1077.57275391, + 912.64251709, + 771.521240234, + 2638.96386719, + 2617.74536133, + 2562.00634766, + 2474.97729492, + 2363.77954102, + 2235.92993164, + 2098.26171875, + 1956.39318848, + 1813.07836914, + 1628.60998535, + 1456.17285156, + 1297.10559082, + 
1151.85449219, + 993.464355469, + 845.405334473, + 717.737731934, + 2378.97973633, + 2363.77954102, + 2319.43164062, + 2249.39648438, + 2158.58081055, + 2052.44384766, + 1936.2532959, + 1813.07836914, + 1648.67211914, + 1489.96264648, + 1339.6640625, + 1199.41748047, + 1057.31555176, + 907.217956543, + 775.878479004, + 661.709289551, + 2146.23095703, + 2134.02709961, + 2098.26171875, + 2041.30578613, + 1966.61950684, + 1878.20617676, + 1766.65966797, + 1628.60998535, + 1489.96264648, + 1354.33557129, + 1224.33178711, + 1098.37109375, + 951.882019043, + 821.329833984, + 706.041503906, + 604.99597168, + 1936.2532959, + 1926.33569336, + 1897.17285156, + 1850.4362793, + 1778.07653809, + 1679.53442383, + 1570.74584961, + 1456.17285156, + 1339.6640625, + 1224.33178711, + 1112.54614258, + 975.344482422, + 850.33416748, + 737.812194824, + 637.541503906, + 531.866638184, + 1722.18615723, + 1711.35717773, + 1679.53442383, + 1628.60998535, + 1561.42590332, + 1481.39880371, + 1392.13525391, + 1297.10559082, + 1199.41748047, + 1098.37109375, + 975.344482422, + 860.309997559, + 754.414855957, + 658.18359375, + 565.168762207, + 455.065155029, + 1498.60571289, + 1489.96264648, + 1464.50524902, + 1423.58496094, + 1369.25976562, + 1304.05163574, + 1230.68457031, + 1151.85449219, + 1057.31555176, + 951.882019043, + 850.33416748, + 754.414855957, + 665.260375977, + 582.761047363, + 475.564758301, + 385.666412354, + 1304.05163574, + 1297.10559082, + 1276.60888672, + 1243.54284668, + 1199.41723633, + 1146.11157227, + 1077.57275391, + 993.464355469, + 907.217956543, + 821.329833984, + 737.812194824, + 658.18359375, + 582.761047363, + 482.643035889, + 396.775939941, + 324.039428711, + 1134.75488281, + 1129.14038086, + 1112.54638672, + 1077.57275391, + 1031.11547852, + 975.344787598, + 912.64251709, + 845.405334473, + 775.878479004, + 706.041503906, + 637.541503906, + 565.168762207, + 475.564758301, + 396.775939941, + 328.516326904, + 270.136077881, + 951.882019043, + 946.136962891, + 
929.184143066, + 901.836975098, + 865.35723877, + 821.329833984, + 771.521240234, + 717.737731934, + 661.709289551, + 604.99597168, + 531.866638184, + 455.065155029, + 385.666412354, + 324.039428711, + 270.136077881, + 223.60848999}, + {0, + 0, + 615.613830566, + 448.953399658, + 337.930267334, + 263.807556152, + 205.943115234, + 160.770889282, + 141.832733154, + 126.301643372, + 112.471244812, + 100.763389587, + 91.1208114624, + 82.4009933472, + 74.5156097412, + 58.8962364197, + 0, + 0, + 571.402038574, + 426.532226562, + 327.784393311, + 257.417816162, + 201.765563965, + 157.966430664, + 140.812332153, + 125.492965698, + 111.822540283, + 100.304679871, + 90.7403564453, + 82.0832748413, + 74.2487335205, + 58.3933258057, + 615.613830566, + 571.402038574, + 473.941894531, + 372.602783203, + 300.644775391, + 239.80960083, + 190.039825439, + 154.182662964, + 137.840042114, + 123.126365662, + 109.91746521, + 98.952003479, + 89.6162185669, + 81.1429672241, + 73.4578170776, + 56.9167442322, + 448.953399658, + 426.532226562, + 372.602783203, + 318.224456787, + 263.807556152, + 214.746795654, + 172.817260742, + 148.295852661, + 133.160797119, + 119.368148804, + 106.872108459, + 96.7725219727, + 87.7978591919, + 79.6171722412, + 70.2083129883, + 54.5584373474, + 337.930267334, + 327.784393311, + 300.644775391, + 263.807556152, + 224.206954956, + 186.378311157, + 155.421569824, + 140.812332153, + 127.120582581, + 114.460098267, + 103.118339539, + 93.8680496216, + 85.3613052368, + 77.5634307861, + 65.9593734741, + 51.4587516785, + 263.807556152, + 257.417816162, + 239.80960083, + 214.746795654, + 186.378311157, + 157.966400146, + 144.988540649, + 132.263153076, + 120.102050781, + 108.680435181, + 98.952003479, + 90.3628005981, + 82.4009933472, + 75.0543060303, + 60.9631996155, + 47.7897415161, + 205.943115234, + 201.765563965, + 190.039825439, + 172.817260742, + 155.421569824, + 144.988540649, + 134.070770264, + 123.126365662, + 112.471244812, + 102.638969421, + 
94.2730102539, + 86.3905029297, + 79.0208206177, + 70.2083129883, + 55.4867515564, + 43.7368011475, + 160.770889282, + 157.966430664, + 154.182662964, + 148.295852661, + 140.812332153, + 132.263153076, + 123.126365662, + 113.789886475, + 104.582710266, + 96.7725219727, + 89.2471008301, + 82.0832748413, + 75.3261566162, + 62.5737113953, + 49.7861824036, + 39.4813766479, + 141.832733154, + 140.812332153, + 137.840042114, + 133.160797119, + 127.120582581, + 120.102050781, + 112.471244812, + 104.582710266, + 97.6333694458, + 90.7403564453, + 84.0226669312, + 77.5634460449, + 68.3460235596, + 55.020149231, + 44.0871162415, + 35.1875991821, + 126.301643372, + 125.492965698, + 123.126365662, + 119.368148804, + 114.460098267, + 108.680435181, + 102.638969421, + 96.7725219727, + 90.7403564453, + 84.6872787476, + 78.7255554199, + 72.1355895996, + 58.8962364197, + 47.7897415161, + 38.5730819702, + 30.993062973, + 112.471244812, + 111.822540283, + 109.91746521, + 106.872108459, + 103.118339539, + 98.952003479, + 94.2730102539, + 89.2471008301, + 84.0226669312, + 78.7255554199, + 73.4578094482, + 60.9631729126, + 50.1978492737, + 41.0546913147, + 33.3810348511, + 24.7806758881, + 100.763389587, + 100.304679871, + 98.952003479, + 96.7725219727, + 93.8680496216, + 90.3628005981, + 86.3905029297, + 82.0832748413, + 77.5634460449, + 72.1355895996, + 60.9631729126, + 51.0341072083, + 42.3694725037, + 34.9223136902, + 27.7260704041, + 18.5722160339, + 91.1208114624, + 90.7403564453, + 89.6162185669, + 87.7978591919, + 85.3613052368, + 82.4009933472, + 79.0208206177, + 75.3261566162, + 68.3460235596, + 58.8962364197, + 50.1978492737, + 42.3694725037, + 35.4553947449, + 29.343132019, + 20.1489048004, + 13.676407814, + 82.4009933472, + 82.0832748413, + 81.1429672241, + 79.6171722412, + 77.5634307861, + 75.0543060303, + 70.2083129883, + 62.5737113953, + 55.020149231, + 47.7897415161, + 41.0546913147, + 34.9223136902, + 29.343132019, + 20.7069969177, + 14.4138498306, + 9.9115486145, + 
74.5156097412, + 74.2487335205, + 73.4578170776, + 70.2083129883, + 65.9593734741, + 60.9631996155, + 55.4867515564, + 49.7861824036, + 44.0871162415, + 38.5730819702, + 33.3810348511, + 27.7260704041, + 20.1489048004, + 14.4138498306, + 10.166267395, + 7.07980155945, + 58.8962364197, + 58.3933258057, + 56.9167442322, + 54.5584373474, + 51.4587516785, + 47.7897415161, + 43.7368011475, + 39.4813766479, + 35.1875991821, + 30.993062973, + 24.7806758881, + 18.5722160339, + 13.676407814, + 9.9115486145, + 7.07980155945, + 4.99121952057}}; +const float inv_matrix_32[3][1024] = {{0, + 0, + 0, + 0, + 5011.67871094, + 4561.02685547, + 4150.89794922, + 3787.85327148, + 3459.89013672, + 3160.32299805, + 2886.69311523, + 2636.75488281, + 2408.45727539, + 2220.78833008, + 2069.29418945, + 1928.13452148, + 1796.60424805, + 1674.04626465, + 1559.84912109, + 1455.32824707, + 1364.40710449, + 1279.16601562, + 1199.25048828, + 1124.32775879, + 1054.08581543, + 988.231933594, + 932.328857422, + 879.889831543, + 830.400390625, + 783.694335938, + 739.61541748, + 698.015563965, + 0, + 0, + 0, + 0, + 4953.88232422, + 4518.67041016, + 4118.65429688, + 3763.55249023, + 3440.43725586, + 3144.51098633, + 2873.68359375, + 2625.9453125, + 2399.40185547, + 2214.77026367, + 2064.08569336, + 1923.60375977, + 1792.64550781, + 1670.57409668, + 1556.79296875, + 1452.8614502, + 1362.2097168, + 1277.20385742, + 1197.49438477, + 1122.75280762, + 1052.67053223, + 986.958068848, + 931.291748047, + 878.947387695, + 829.542602539, + 782.912841797, + 738.902404785, + 697.364379883, + 0, + 0, + 0, + 0, + 4793.61474609, + 4398.46826172, + 4026.78955078, + 3692.97387695, + 3383.59692383, + 3098.10839844, + 2835.38208008, + 2594.04101562, + 2372.62280273, + 2196.91870117, + 2048.6171875, + 1910.1348877, + 1780.86755371, + 1660.23608398, + 1547.6887207, + 1445.50598145, + 1355.65515137, + 1271.34863281, + 1192.25231934, + 1118.05004883, + 1048.44384766, + 983.335632324, + 928.192504883, + 876.130004883, + 
826.978088379, + 780.575439453, + 736.76965332, + 695.41619873, + 0, + 0, + 0, + 0, + 4561.02685547, + 4217.54345703, + 3889.28466797, + 3582.40161133, + 3293.56469727, + 3024.01489258, + 2773.8503418, + 2542.54394531, + 2329.23535156, + 2167.82006836, + 2023.34472656, + 1888.08752441, + 1761.55737305, + 1643.26379395, + 1532.72387695, + 1433.39599609, + 1344.85412598, + 1261.69274902, + 1183.6015625, + 1110.28479004, + 1041.46118164, + 977.688903809, + 923.064758301, + 871.467163086, + 822.732299805, + 776.704589844, + 733.236633301, + 692.188110352, + 5011.67871094, + 4953.88232422, + 4793.61474609, + 4561.02685547, + 4287.29882812, + 3998.23925781, + 3716.125, + 3440.43725586, + 3176.31298828, + 2926.47753906, + 2692.17285156, + 2473.73583984, + 2276.53393555, + 2128.3894043, + 1988.98718262, + 1858.03149414, + 1735.17175293, + 1620.02612305, + 1512.1998291, + 1416.7467041, + 1329.98596191, + 1248.38586426, + 1171.66845703, + 1099.56396484, + 1031.81274414, + 969.874816895, + 915.964599609, + 865.006896973, + 816.846923828, + 771.336730957, + 728.335144043, + 687.707824707, + 4561.02685547, + 4518.67041016, + 4398.46826172, + 4217.54345703, + 3998.23925781, + 3763.55249023, + 3519.85961914, + 3276.21459961, + 3038.5234375, + 2810.43310547, + 2594.04101562, + 2390.41162109, + 2220.78833008, + 2079.79077148, + 1946.46655273, + 1820.70666504, + 1702.30786133, + 1591.01000977, + 1486.51586914, + 1395.84521484, + 1311.29003906, + 1231.62927246, + 1156.62243652, + 1086.03051758, + 1019.62091064, + 959.981201172, + 906.967834473, + 856.815124512, + 809.379150391, + 764.521179199, + 722.108276367, + 682.245117188, + 4150.89794922, + 4118.65429688, + 4026.78955078, + 3889.28466797, + 3716.125, + 3519.85961914, + 3311.12646484, + 3098.10839844, + 2886.69311523, + 2680.90209961, + 2483.34204102, + 2295.77954102, + 2156.40185547, + 2023.34472656, + 1896.84716797, + 1776.97375488, + 1663.66943359, + 1556.79296875, + 1457.80285645, + 1371.0369873, + 1289.05615234, + 
1211.6673584, + 1138.67041016, + 1069.86108398, + 1005.03588867, + 948.117004395, + 896.168579102, + 846.973266602, + 800.399719238, + 756.319885254, + 714.610473633, + 675.848571777, + 3787.85327148, + 3763.55249023, + 3692.97387695, + 3582.40161133, + 3440.43725586, + 3276.21459961, + 3098.10839844, + 2913.08789062, + 2726.56884766, + 2542.54394531, + 2363.82275391, + 2214.77026367, + 2085.0793457, + 1960.44067383, + 1841.2644043, + 1727.76757812, + 1620.02624512, + 1518.01391602, + 1426.21765137, + 1342.71228027, + 1263.61376953, + 1188.7791748, + 1118.05004883, + 1051.25854492, + 988.232055664, + 934.408508301, + 883.676330566, + 835.576843262, + 789.991882324, + 746.805908203, + 705.905090332, + 668.410766602, + 3459.89013672, + 3440.43725586, + 3383.59692383, + 3293.56469727, + 3176.31298828, + 3038.5234375, + 2886.69311523, + 2726.56884766, + 2562.8984375, + 2399.40185547, + 2251.3984375, + 2128.38916016, + 2008.47729492, + 1892.4576416, + 1780.86755371, + 1674.04638672, + 1572.18115234, + 1475.3458252, + 1391.27539062, + 1311.29003906, + 1235.31958008, + 1163.2689209, + 1095.02172852, + 1030.44628906, + 970.984558105, + 918.996520996, + 869.613464355, + 822.732299805, + 778.249511719, + 736.060913086, + 696.064697266, + 659.988525391, + 3160.32299805, + 3144.51098633, + 3098.10839844, + 3024.01489258, + 2926.47753906, + 2810.43310547, + 2680.90209961, + 2542.54394531, + 2399.40185547, + 2263.89282227, + 2150.73901367, + 2038.4329834, + 1928.13452148, + 1820.70666504, + 1716.77062988, + 1616.75537109, + 1520.93566895, + 1433.39599609, + 1353.48266602, + 1277.20385742, + 1204.54553223, + 1135.45678711, + 1069.86108398, + 1007.66223145, + 952.402770996, + 902.032531738, + 854.112670898, + 808.556152344, + 765.27355957, + 724.174499512, + 685.184509277, + 650.644348145, + 2886.69311523, + 2873.68359375, + 2835.38208008, + 2773.8503418, + 2692.17285156, + 2594.04101562, + 2483.34204102, + 2363.82275391, + 2251.3984375, + 2150.73901367, + 2048.6171875, + 
1946.46655273, + 1845.42907715, + 1746.38916016, + 1650.01464844, + 1556.79296875, + 1467.77758789, + 1389.00024414, + 1313.34448242, + 1240.89025879, + 1171.66845703, + 1105.66882324, + 1042.85168457, + 983.335632324, + 932.328857422, + 883.676208496, + 837.313964844, + 793.171142578, + 751.173278809, + 711.242553711, + 674.120605469, + 640.444885254, + 2636.75488281, + 2625.9453125, + 2594.04101562, + 2542.54394531, + 2473.73583984, + 2390.41162109, + 2295.77954102, + 2214.77026367, + 2128.38916016, + 2038.4329834, + 1946.46655273, + 1853.81237793, + 1761.55737305, + 1670.57421875, + 1581.54528809, + 1494.99133301, + 1416.74682617, + 1342.71228027, + 1271.34875488, + 1202.77600098, + 1137.06164551, + 1074.23132324, + 1014.27752686, + 959.981201172, + 910.948364258, + 864.090393066, + 819.361022949, + 776.704589844, + 736.060913086, + 697.364379883, + 662.218017578, + 629.4609375, + 2408.45727539, + 2399.40185547, + 2372.62280273, + 2329.23535156, + 2276.53393555, + 2220.78833008, + 2156.40185547, + 2085.0793457, + 2008.47729492, + 1928.13452148, + 1845.42907715, + 1761.55737305, + 1677.53186035, + 1594.18786621, + 1512.1998291, + 1435.8034668, + 1364.40710449, + 1295.05419922, + 1227.95825195, + 1163.2689209, + 1101.08483887, + 1041.46118164, + 984.471923828, + 935.451171875, + 888.446838379, + 843.440002441, + 800.399719238, + 759.286071777, + 720.051269531, + 682.831237793, + 649.558654785, + 617.765075684, + 2220.78833008, + 2214.77026367, + 2196.91870117, + 2167.82006836, + 2128.3894043, + 2079.79077148, + 2023.34472656, + 1960.44067383, + 1892.4576416, + 1820.70666504, + 1746.38916016, + 1670.57421875, + 1594.18786621, + 1518.01391602, + 1445.50622559, + 1377.7244873, + 1311.29003906, + 1246.50488281, + 1183.6015625, + 1122.75280762, + 1064.07983398, + 1007.66223145, + 956.720947266, + 909.950622559, + 865.006896973, + 821.887329102, + 780.575439453, + 741.04473877, + 703.25994873, + 668.410766602, + 636.225524902, + 605.431762695, + 2069.29418945, + 
2064.08569336, + 2048.6171875, + 2023.34472656, + 1988.98718262, + 1946.46655273, + 1896.84716797, + 1841.2644043, + 1780.86755371, + 1716.77062988, + 1650.01464844, + 1581.54528809, + 1512.1998291, + 1445.50622559, + 1382.21533203, + 1319.54187012, + 1257.8659668, + 1197.49438477, + 1138.67041016, + 1081.58068848, + 1026.36474609, + 974.326965332, + 928.192504883, + 883.676208496, + 840.805603027, + 799.591430664, + 760.030456543, + 722.108276367, + 685.801452637, + 653.370849609, + 622.301086426, + 592.535888672, + 1928.13452148, + 1923.60375977, + 1910.1348877, + 1888.08752441, + 1858.03149414, + 1820.70666504, + 1776.97375488, + 1727.76757812, + 1674.04638672, + 1616.75537109, + 1556.79296875, + 1494.99133301, + 1435.8034668, + 1377.7244873, + 1319.54187012, + 1261.69274902, + 1204.54553223, + 1148.40356445, + 1093.51477051, + 1040.0736084, + 988.232055664, + 942.805053711, + 899.092651367, + 856.815124512, + 816.011779785, + 776.704589844, + 738.902404785, + 702.601013184, + 668.97833252, + 637.802612305, + 607.867736816, + 579.151306152, + 1796.60424805, + 1792.64550781, + 1780.86755371, + 1761.55737305, + 1735.17175293, + 1702.30786133, + 1663.66943359, + 1620.02624512, + 1572.18115234, + 1520.93566895, + 1467.77758789, + 1416.74682617, + 1364.40710449, + 1311.29003906, + 1257.8659668, + 1204.54553223, + 1151.67944336, + 1099.56396484, + 1048.44384766, + 998.518310547, + 953.479187012, + 910.948364258, + 869.613342285, + 829.542602539, + 790.784973145, + 753.372253418, + 717.322814941, + 682.831237793, + 651.732788086, + 621.794555664, + 593.00592041, + 565.35144043, + 1674.04626465, + 1670.57409668, + 1660.23608398, + 1643.26379395, + 1620.02612305, + 1591.01000977, + 1556.79296875, + 1518.01391602, + 1475.3458252, + 1433.39599609, + 1389.00024414, + 1342.71228027, + 1295.05419922, + 1246.50488281, + 1197.49438477, + 1148.40356445, + 1099.56396484, + 1051.25854492, + 1003.72686768, + 959.981201172, + 918.996520996, + 878.947387695, + 839.930419922, + 
802.020446777, + 765.27355957, + 729.730224609, + 695.41619873, + 663.897949219, + 634.132019043, + 605.431762695, + 577.793395996, + 551.20690918, + 1559.84912109, + 1556.79296875, + 1547.6887207, + 1532.72387695, + 1512.1998291, + 1486.51586914, + 1457.80285645, + 1426.21765137, + 1391.27539062, + 1353.48266602, + 1313.34448242, + 1271.34875488, + 1227.95825195, + 1183.6015625, + 1138.67041016, + 1093.51477051, + 1048.44384766, + 1003.72686768, + 962.165222168, + 923.064758301, + 884.62689209, + 846.973266602, + 810.203491211, + 774.396850586, + 739.61541748, + 705.905090332, + 674.120605469, + 644.708618164, + 616.264648438, + 588.796264648, + 562.305175781, + 536.786437988, + 1455.32824707, + 1452.8614502, + 1445.50598145, + 1433.39599609, + 1416.7467041, + 1395.84521484, + 1371.0369873, + 1342.71228027, + 1311.29003906, + 1277.20385742, + 1240.89025879, + 1202.77600098, + 1163.2689209, + 1122.75280762, + 1081.58068848, + 1040.0736084, + 998.518310547, + 959.981201172, + 923.064758301, + 886.533447266, + 850.530761719, + 815.17779541, + 780.575439453, + 746.805908203, + 713.934814453, + 682.245117188, + 653.370849609, + 625.353942871, + 598.214355469, + 571.965026855, + 546.611877441, + 522.155395508, + 1364.40710449, + 1362.2097168, + 1355.65515137, + 1344.85412598, + 1329.98596191, + 1311.29003906, + 1289.05615234, + 1263.61376953, + 1235.31958008, + 1204.54553223, + 1171.66845703, + 1137.06164551, + 1101.08483887, + 1064.07983398, + 1026.36474609, + 988.232055664, + 953.479187012, + 918.996520996, + 884.62689209, + 850.530761719, + 816.846923828, + 783.694335938, + 751.173278809, + 719.367553711, + 688.345153809, + 659.988647461, + 632.568969727, + 605.91784668, + 580.059387207, + 555.010437012, + 530.781066895, + 507.375701904, + 1279.16601562, + 1277.20385742, + 1271.34863281, + 1261.69274902, + 1248.38586426, + 1231.62927246, + 1211.6673584, + 1188.7791748, + 1163.2689209, + 1135.45678711, + 1105.66882324, + 1074.23132324, + 1041.46118164, + 
1007.66223145, + 974.326965332, + 942.805053711, + 910.948364258, + 878.947387695, + 846.973266602, + 815.17779541, + 783.694335938, + 752.638000488, + 722.108276367, + 692.188110352, + 664.459472656, + 637.802490234, + 611.796875, + 586.477539062, + 561.871887207, + 537.999816895, + 514.875244141, + 492.505737305, + 1199.25048828, + 1197.49438477, + 1192.25231934, + 1183.6015625, + 1171.66845703, + 1156.62243652, + 1138.67041016, + 1118.05004883, + 1095.02172852, + 1069.86108398, + 1042.85168457, + 1014.27752686, + 984.471923828, + 956.720947266, + 928.192504883, + 899.092651367, + 869.613342285, + 839.930419922, + 810.203491211, + 780.575439453, + 751.173278809, + 722.108276367, + 693.4765625, + 666.712768555, + 640.975524902, + 615.765563965, + 591.129272461, + 567.10345459, + 543.718078613, + 520.995727539, + 498.952667236, + 480.805541992, + 1124.32775879, + 1122.75280762, + 1118.05004883, + 1110.28479004, + 1099.56396484, + 1086.03051758, + 1069.86108398, + 1051.25854492, + 1030.44628906, + 1007.66223145, + 983.335632324, + 959.981201172, + 935.451171875, + 909.950622559, + 883.676208496, + 856.815124512, + 829.542602539, + 802.020446777, + 774.396850586, + 746.805908203, + 719.367553711, + 692.188110352, + 666.712768555, + 642.038635254, + 617.765075684, + 593.947570801, + 570.63269043, + 547.859313965, + 525.65826416, + 504.054443359, + 484.735015869, + 470.0362854, + 1054.08581543, + 1052.67053223, + 1048.44384766, + 1041.46118164, + 1031.81274414, + 1019.62091064, + 1005.03588867, + 988.232055664, + 970.984558105, + 952.402770996, + 932.328857422, + 910.948364258, + 888.446838379, + 865.006896973, + 840.805603027, + 816.011779785, + 790.784973145, + 765.27355957, + 739.61541748, + 713.934814453, + 688.345153809, + 664.459472656, + 640.975524902, + 617.765075684, + 594.891723633, + 572.41003418, + 550.3671875, + 528.801818848, + 507.746917725, + 487.717651367, + 473.343078613, + 459.212097168, + 988.231933594, + 986.958068848, + 983.335632324, + 
977.688903809, + 969.874816895, + 959.981201172, + 948.117004395, + 934.408508301, + 918.996520996, + 902.032531738, + 883.676208496, + 864.090393066, + 843.440002441, + 821.887329102, + 799.591430664, + 776.704589844, + 753.372253418, + 729.730224609, + 705.905090332, + 682.245117188, + 659.988647461, + 637.802490234, + 615.765563965, + 593.947570801, + 572.41003418, + 551.20690918, + 530.384277344, + 509.981781006, + 490.03225708, + 475.728912354, + 461.936004639, + 448.361358643, + 932.328857422, + 931.291748047, + 928.192504883, + 923.064758301, + 915.964599609, + 906.967834473, + 896.168579102, + 883.676330566, + 869.613464355, + 854.112670898, + 837.313964844, + 819.361022949, + 800.399719238, + 780.575439453, + 760.030456543, + 738.902404785, + 717.322814941, + 695.41619873, + 674.120605469, + 653.370849609, + 632.568969727, + 611.796875, + 591.129272461, + 570.63269043, + 550.3671875, + 530.384277344, + 510.730102539, + 491.443481445, + 477.170257568, + 463.766784668, + 450.541046143, + 437.510101318, + 879.889831543, + 878.947387695, + 876.130004883, + 871.467163086, + 865.006896973, + 856.815124512, + 846.973266602, + 835.576843262, + 822.732299805, + 808.556152344, + 793.171142578, + 776.704589844, + 759.286071777, + 741.04473877, + 722.108276367, + 702.601013184, + 682.831237793, + 663.897949219, + 644.708618164, + 625.353942871, + 605.91784668, + 586.477539062, + 567.10345459, + 547.859313965, + 528.801818848, + 509.981781006, + 491.443481445, + 477.65222168, + 464.686828613, + 451.85736084, + 439.183532715, + 426.682556152, + 830.400390625, + 829.542602539, + 826.978088379, + 822.732299805, + 816.846923828, + 809.379150391, + 800.399719238, + 789.991882324, + 778.249511719, + 765.27355957, + 751.173278809, + 736.060913086, + 720.051269531, + 703.25994873, + 685.801452637, + 668.97833252, + 651.732788086, + 634.132019043, + 616.264648438, + 598.214355469, + 580.059387207, + 561.871887207, + 543.718078613, + 525.65826416, + 507.746917725, + 
490.03225708, + 477.170257568, + 464.686828613, + 452.297576904, + 440.024200439, + 427.886230469, + 415.901092529, + 783.694335938, + 782.912841797, + 780.575439453, + 776.704589844, + 771.336730957, + 764.521179199, + 756.319885254, + 746.805908203, + 736.060913086, + 724.174499512, + 711.242553711, + 697.364379883, + 682.831237793, + 668.410766602, + 653.370849609, + 637.802612305, + 621.794555664, + 605.431762695, + 588.796264648, + 571.965026855, + 555.010437012, + 537.999816895, + 520.995727539, + 504.054443359, + 487.717651367, + 475.728912354, + 463.766784668, + 451.85736084, + 440.024200439, + 428.288726807, + 416.670166016, + 405.185882568, + 739.61541748, + 738.902404785, + 736.76965332, + 733.236633301, + 728.335144043, + 722.108276367, + 714.610473633, + 705.905090332, + 696.064697266, + 685.184509277, + 674.120605469, + 662.218017578, + 649.558654785, + 636.225524902, + 622.301086426, + 607.867736816, + 593.00592041, + 577.793395996, + 562.305175781, + 546.611877441, + 530.781066895, + 514.875244141, + 498.952667236, + 484.735015869, + 473.343078613, + 461.936004639, + 450.541046143, + 439.183532715, + 427.886230469, + 416.670166016, + 405.554260254, + 394.555480957, + 698.015563965, + 697.364379883, + 695.41619873, + 692.188110352, + 687.707824707, + 682.245117188, + 675.848571777, + 668.410766602, + 659.988525391, + 650.644348145, + 640.444885254, + 629.4609375, + 617.765075684, + 605.431762695, + 592.535888672, + 579.151306152, + 565.35144043, + 551.20690918, + 536.786437988, + 522.155395508, + 507.375701904, + 492.505737305, + 480.805541992, + 470.0362854, + 459.212097168, + 448.361358643, + 437.510101318, + 426.682556152, + 415.901092529, + 405.185882568, + 394.555480957, + 384.026672363}, + {0, + 0, + 0, + 0, + 10016.1787109, + 8949.01855469, + 7995.55859375, + 7162.60107422, + 6422.47558594, + 5758.82910156, + 5163.75830078, + 4630.17675781, + 4151.73242188, + 3734.18823242, + 3370.10986328, + 3041.52880859, + 2744.98388672, + 2477.35107422, + 
2235.81323242, + 2038.74963379, + 1932.10974121, + 1831.04748535, + 1735.27160645, + 1644.50561523, + 1558.48730469, + 1476.96801758, + 1386.82666016, + 1301.52868652, + 1221.47717285, + 1146.34912109, + 1075.84216309, + 1009.67150879, + 0, + 0, + 0, + 0, + 9878.22460938, + 8849.74414062, + 7921.35595703, + 7107.29541016, + 6379.01171875, + 5724.14550781, + 5135.74365234, + 4607.32568359, + 4132.93945312, + 3719.50512695, + 3357.80053711, + 3031.15722656, + 2736.20654297, + 2469.89428711, + 2229.45581055, + 2035.87133789, + 1929.51806641, + 1828.70812988, + 1733.15539551, + 1642.58703613, + 1556.74450684, + 1475.38232422, + 1385.13513184, + 1300, + 1220.09375, + 1145.09570312, + 1074.70495605, + 1008.63867188, + 0, + 0, + 0, + 0, + 9497.34082031, + 8569.00976562, + 7710.1953125, + 6947.08251953, + 6252.30078125, + 5622.56835938, + 5053.41699219, + 4539.99316406, + 4077.45068359, + 3676.05541992, + 3321.32641602, + 3000.390625, + 2710.14355469, + 2447.73339844, + 2210.55053711, + 2027.28417969, + 1921.78308105, + 1821.72375488, + 1726.8347168, + 1636.85534668, + 1551.53735352, + 1470.2409668, + 1380.08117676, + 1295.43139648, + 1215.95825195, + 1141.34753418, + 1071.30383301, + 1005.54919434, + 0, + 0, + 0, + 0, + 8949.01855469, + 8149.28955078, + 7394.22412109, + 6697.34423828, + 6052.48828125, + 5461.01953125, + 4921.63427734, + 4431.66796875, + 3987.81860352, + 3605.57324219, + 3262.00317383, + 2950.23999023, + 2667.58154297, + 2411.48632812, + 2179.58496094, + 2013.13024902, + 1909.02331543, + 1810.1940918, + 1716.39379883, + 1627.38232422, + 1542.92712402, + 1460.9855957, + 1371.72302246, + 1287.87316895, + 1209.11425781, + 1135.14245605, + 1065.67199707, + 1000.43200684, + 10016.1787109, + 9878.22460938, + 9497.34082031, + 8949.01855469, + 8310.703125, + 7644.40527344, + 6999.56738281, + 6379.01171875, + 5793.93896484, + 5249.58837891, + 4747.62841797, + 4287.62841797, + 3871.05200195, + 3510.74609375, + 3181.88964844, + 2882.29760742, + 2609.7644043, + 
2362.1328125, + 2137.33789062, + 1993.63818359, + 1891.43066406, + 1794.28063965, + 1701.97070312, + 1614.28540039, + 1531.01391602, + 1448.18615723, + 1360.15734863, + 1277.40795898, + 1199.63305664, + 1126.54284668, + 1057.86291504, + 993.333496094, + 8949.01855469, + 8849.74414062, + 8569.00976562, + 8149.28955078, + 7644.40527344, + 7107.29541016, + 6556.77978516, + 6014.109375, + 5492.59033203, + 4999.91259766, + 4539.99316406, + 4114.296875, + 3734.18823242, + 3394.95922852, + 3083.60522461, + 2798.61328125, + 2538.30639648, + 2300.9543457, + 2084.83349609, + 1969.11364746, + 1869.26220703, + 1774.20117188, + 1683.74963379, + 1597.7220459, + 1515.93359375, + 1431.99487305, + 1345.51452637, + 1264.14855957, + 1187.61206055, + 1115.63220215, + 1047.94946289, + 984.940734863, + 7995.55859375, + 7921.35595703, + 7710.1953125, + 7394.22412109, + 6999.56738281, + 6556.77978516, + 6091.37744141, + 5622.56835938, + 5163.75830078, + 4723.70117188, + 4307.68896484, + 3918.65771484, + 3578.03271484, + 3262.00317383, + 2970.13378906, + 2701.54418945, + 2455.08642578, + 2229.45581055, + 2041.63623047, + 1939.92504883, + 1842.82971191, + 1750.22119141, + 1661.9576416, + 1577.88708496, + 1497.85339355, + 1412.59997559, + 1327.95666504, + 1248.23461914, + 1173.171875, + 1102.515625, + 1036.02380371, + 975.335021973, + 7162.60107422, + 7107.29541016, + 6947.08251953, + 6697.34423828, + 6379.01171875, + 6014.109375, + 5622.56835938, + 5220.67480469, + 4820.77636719, + 4431.66796875, + 4059.2434082, + 3719.50512695, + 3407.50024414, + 3115.79980469, + 2844.60986328, + 2593.61132812, + 2362.13305664, + 2149.27954102, + 2004.73095703, + 1906.4909668, + 1812.48913574, + 1722.64440918, + 1636.85534668, + 1555.00537109, + 1476.96813965, + 1390.21911621, + 1307.67175293, + 1229.82922363, + 1156.4543457, + 1087.31677246, + 1022.19244385, + 964.170288086, + 6422.47558594, + 6379.01171875, + 6252.30078125, + 6052.48828125, + 5793.93896484, + 5492.59033203, + 5163.75830078, + 
4820.77636719, + 4474.43066406, + 4132.93945312, + 3809.15112305, + 3510.74560547, + 3227.25976562, + 2960.15966797, + 2710.14355469, + 2477.3515625, + 2261.5234375, + 2062.12646484, + 1963.74353027, + 1869.26220703, + 1778.62731934, + 1691.80358887, + 1608.73010254, + 1529.32519531, + 1450.00341797, + 1365.09509277, + 1284.86962891, + 1209.11425781, + 1137.61865234, + 1070.17382812, + 1006.57757568, + 951.533935547, + 5758.82910156, + 5724.14550781, + 5622.56835938, + 5461.01953125, + 5249.58837891, + 4999.91259766, + 4723.70117188, + 4431.66796875, + 4132.93945312, + 3839.88183594, + 3564.3984375, + 3297.38085938, + 3041.52880859, + 2798.61328125, + 2569.67749023, + 2355.21166992, + 2155.28833008, + 2013.13024902, + 1919.21801758, + 1828.70812988, + 1741.64990234, + 1658.05053711, + 1577.88708496, + 1501.11230469, + 1419.60339355, + 1337.48803711, + 1259.77709961, + 1186.2878418, + 1116.83618164, + 1051.23815918, + 989.356018066, + 937.521850586, + 5163.75830078, + 5135.74365234, + 5053.41699219, + 4921.63427734, + 4747.62841797, + 4539.99316406, + 4307.68896484, + 4059.2434082, + 3809.15112305, + 3564.3984375, + 3321.32641602, + 3083.60522461, + 2853.95654297, + 2634.29614258, + 2425.88378906, + 2229.45581055, + 2053.26318359, + 1961.06884766, + 1871.70092773, + 1785.30419922, + 1701.97070312, + 1621.74597168, + 1544.64245605, + 1470.2409668, + 1386.82666016, + 1307.67150879, + 1232.63317871, + 1161.55883789, + 1094.29150391, + 1030.67077637, + 972.740783691, + 922.236572266, + 4630.17675781, + 4607.32568359, + 4539.99316406, + 4431.66796875, + 4287.62841797, + 4114.296875, + 3918.65771484, + 3719.50512695, + 3510.74560547, + 3297.38085938, + 3083.60522461, + 2872.79980469, + 2667.58154297, + 2469.89453125, + 2281.10717773, + 2102.11450195, + 1993.63842773, + 1906.4909668, + 1821.72387695, + 1739.5189209, + 1660.00183105, + 1583.25292969, + 1509.31469727, + 1431.99487305, + 1351.99133301, + 1275.92382812, + 1203.68237305, + 1135.14245605, + 1070.17382812, + 
1008.63867188, + 954.878356934, + 905.786682129, + 4151.73242188, + 4132.93945312, + 4077.45068359, + 3987.81860352, + 3871.05200195, + 3734.18823242, + 3578.03271484, + 3407.50024414, + 3227.25976562, + 3041.52880859, + 2853.95654297, + 2667.58154297, + 2484.84399414, + 2307.63012695, + 2137.33789062, + 2015.94567871, + 1932.10974121, + 1849.96801758, + 1769.79614258, + 1691.80358887, + 1616.14465332, + 1542.92712402, + 1472.10400391, + 1391.92028809, + 1315.41491699, + 1242.52575684, + 1173.171875, + 1107.25793457, + 1044.67651367, + 985.821166992, + 935.89440918, + 888.28326416, + 3734.18823242, + 3719.50512695, + 3676.05541992, + 3605.57324219, + 3510.74609375, + 3394.95922852, + 3262.00317383, + 3115.79980469, + 2960.15966797, + 2798.61328125, + 2634.29614258, + 2469.89453125, + 2307.63012695, + 2149.27954102, + 2027.2845459, + 1947.80200195, + 1869.26220703, + 1792.02880859, + 1716.39379883, + 1642.58703613, + 1570.78308105, + 1501.11230469, + 1426.6628418, + 1350.36767578, + 1277.40795898, + 1207.75256348, + 1141.34753418, + 1078.12231445, + 1017.99298096, + 964.170288086, + 915.916137695, + 869.840393066, + 3370.10986328, + 3357.80053711, + 3321.32641602, + 3262.00317383, + 3181.88964844, + 3083.60522461, + 2970.13378906, + 2844.60986328, + 2710.14355469, + 2569.67749023, + 2425.88378906, + 2281.10717773, + 2137.33789062, + 2027.2845459, + 1953.08789062, + 1879.05322266, + 1805.62060547, + 1733.15539551, + 1661.9576416, + 1592.26843262, + 1524.27880859, + 1455.47753906, + 1380.08117676, + 1307.67150879, + 1238.27075195, + 1171.8729248, + 1108.44836426, + 1047.94946289, + 990.31439209, + 941.609558105, + 895.070007324, + 850.572570801, + 3041.52880859, + 3031.15722656, + 3000.390625, + 2950.23999023, + 2882.29760742, + 2798.61328125, + 2701.54418945, + 2593.61132812, + 2477.3515625, + 2355.21166992, + 2229.45581055, + 2102.11450195, + 2015.94567871, + 1947.80200195, + 1879.05322266, + 1810.1940918, + 1741.64990234, + 1673.77990723, + 1606.8861084, + 
1541.21508789, + 1476.96813965, + 1403.92370605, + 1332.70874023, + 1264.14855957, + 1198.28808594, + 1135.14245605, + 1074.70495605, + 1016.94714355, + 965.022094727, + 918.278381348, + 873.481933594, + 830.592407227, + 2744.98388672, + 2736.20654297, + 2710.14355469, + 2667.58154297, + 2609.7644043, + 2538.30639648, + 2455.08642578, + 2362.13305664, + 2261.5234375, + 2155.28833008, + 2053.26318359, + 1993.63842773, + 1932.10974121, + 1869.26220703, + 1805.62060547, + 1741.64990234, + 1677.75500488, + 1614.28540039, + 1551.53735352, + 1489.75976562, + 1421.36291504, + 1351.99133301, + 1284.86938477, + 1220.09375, + 1157.72741699, + 1097.8046875, + 1040.33654785, + 985.821166992, + 939.153625488, + 894.312011719, + 851.274475098, + 810.01184082, + 2477.35107422, + 2469.89428711, + 2447.73339844, + 2411.48632812, + 2362.1328125, + 2300.9543457, + 2229.45581055, + 2149.27954102, + 2062.12646484, + 2013.13024902, + 1961.06884766, + 1906.4909668, + 1849.96801758, + 1792.02880859, + 1733.15539551, + 1673.77990723, + 1614.28540039, + 1555.00537109, + 1496.22851562, + 1431.99487305, + 1365.09509277, + 1300, + 1236.85754395, + 1175.77709961, + 1116.83618164, + 1060.08496094, + 1005.54919434, + 957.398681641, + 912.780822754, + 869.840393066, + 828.566345215, + 788.938354492, + 2235.81323242, + 2229.45581055, + 2210.55053711, + 2179.58496094, + 2137.33789062, + 2084.83349609, + 2041.63623047, + 2004.73095703, + 1963.74353027, + 1919.21801758, + 1871.70092773, + 1821.72387695, + 1769.79614258, + 1716.39379883, + 1661.9576416, + 1606.8861084, + 1551.53735352, + 1496.22851562, + 1435.56774902, + 1371.72302246, + 1309.21435547, + 1248.23461914, + 1188.93847656, + 1131.44470215, + 1075.84216309, + 1022.19244385, + 972.740783691, + 928.625244141, + 886.038757324, + 844.988342285, + 805.471496582, + 767.476257324, + 2038.74963379, + 2035.87133789, + 2027.28417969, + 2013.13024902, + 1993.63818359, + 1969.11364746, + 1939.92504883, + 1906.4909668, + 1869.26220703, + 1828.70812988, 
+ 1785.30419922, + 1739.5189209, + 1691.80358887, + 1642.58703613, + 1592.26843262, + 1541.21508789, + 1489.75976562, + 1431.99487305, + 1371.72302246, + 1312.30871582, + 1253.98498535, + 1196.94519043, + 1141.34753418, + 1087.31677246, + 1034.94958496, + 984.940734863, + 941.609558105, + 899.638793945, + 859.054748535, + 819.872619629, + 782.097106934, + 745.724487305, + 1932.10974121, + 1929.51806641, + 1921.78308105, + 1909.02331543, + 1891.43066406, + 1869.26220703, + 1842.82971191, + 1812.48913574, + 1778.62731934, + 1741.64990234, + 1701.97070312, + 1660.00183105, + 1616.14465332, + 1570.78308105, + 1524.27880859, + 1476.96813965, + 1421.36291504, + 1365.09509277, + 1309.21435547, + 1253.98498535, + 1199.63305664, + 1146.34912109, + 1094.29150391, + 1043.58874512, + 994.342956543, + 951.534057617, + 910.440185547, + 870.567016602, + 831.947387695, + 794.602966309, + 758.545227051, + 723.776733398, + 1831.04748535, + 1828.70812988, + 1821.72375488, + 1810.1940918, + 1794.28063965, + 1774.20117188, + 1750.22119141, + 1722.64440918, + 1691.80358887, + 1658.05053711, + 1621.74597168, + 1583.25292969, + 1542.92712402, + 1501.11230469, + 1455.47753906, + 1403.92370605, + 1351.99133301, + 1300, + 1248.23461914, + 1196.94519043, + 1146.34912109, + 1096.63146973, + 1047.94946289, + 1000.43200684, + 958.241088867, + 918.278259277, + 879.356750488, + 841.526489258, + 804.825805664, + 769.28125, + 734.910339355, + 701.721008301, + 1735.27160645, + 1733.15539551, + 1726.8347168, + 1716.39379883, + 1701.97070312, + 1683.74963379, + 1661.9576416, + 1636.85534668, + 1608.73010254, + 1577.88708496, + 1544.64245605, + 1509.31469727, + 1472.10400391, + 1426.6628418, + 1380.08117676, + 1332.70874023, + 1284.86938477, + 1236.85754395, + 1188.93847656, + 1141.34753418, + 1094.29150391, + 1047.94946289, + 1002.47418213, + 961.622131348, + 923.031555176, + 885.292236328, + 848.471923828, + 812.623657227, + 777.789794922, + 744.001464844, + 711.280090332, + 684.97052002, + 
1644.50561523, + 1642.58703613, + 1636.85534668, + 1627.38232422, + 1614.28540039, + 1597.7220459, + 1577.88708496, + 1555.00537109, + 1529.32519531, + 1501.11230469, + 1470.2409668, + 1431.99487305, + 1391.92028809, + 1350.36767578, + 1307.67150879, + 1264.14855957, + 1220.09375, + 1175.77709961, + 1131.44470215, + 1087.31677246, + 1043.58874512, + 1000.43200684, + 961.622131348, + 924.624450684, + 888.28326416, + 852.680969238, + 817.885742188, + 783.954040527, + 750.929992676, + 718.848205566, + 690.50970459, + 669.78717041, + 1558.48730469, + 1556.74450684, + 1551.53735352, + 1542.92712402, + 1531.01391602, + 1515.93359375, + 1497.85339355, + 1476.96813965, + 1450.00341797, + 1419.60339355, + 1386.82666016, + 1351.99133301, + 1315.41491699, + 1277.40795898, + 1238.27075195, + 1198.28808594, + 1157.72741699, + 1116.83618164, + 1075.84216309, + 1034.94958496, + 994.342956543, + 958.241088867, + 923.031555176, + 888.28326416, + 854.091186523, + 820.536315918, + 787.687927246, + 755.602600098, + 724.327697754, + 694.713867188, + 674.449768066, + 654.522705078, + 1476.96801758, + 1475.38232422, + 1470.2409668, + 1460.9855957, + 1448.18615723, + 1431.99487305, + 1412.59997559, + 1390.21911621, + 1365.09509277, + 1337.48803711, + 1307.67150879, + 1275.92382812, + 1242.52575684, + 1207.75256348, + 1171.8729248, + 1135.14245605, + 1097.8046875, + 1060.08496094, + 1022.19244385, + 984.940734863, + 951.534057617, + 918.278259277, + 885.292236328, + 852.680969238, + 820.536315918, + 788.938354492, + 757.955322266, + 727.644897461, + 698.05480957, + 677.813598633, + 658.364379883, + 639.217041016, + 1386.82666016, + 1385.13513184, + 1380.08117676, + 1371.72302246, + 1360.15734863, + 1345.51452637, + 1327.95666504, + 1307.67175293, + 1284.86962891, + 1259.77709961, + 1232.63317871, + 1203.68237305, + 1173.171875, + 1141.34753418, + 1108.44836426, + 1074.70495605, + 1040.33654785, + 1005.54919434, + 972.740783691, + 941.609558105, + 910.440185547, + 879.356750488, + 
848.471923828, + 817.885742188, + 787.687927246, + 757.955322266, + 728.75579834, + 700.14642334, + 679.84564209, + 660.946289062, + 642.291992188, + 623.906738281, + 1301.52868652, + 1300, + 1295.43139648, + 1287.87316895, + 1277.40795898, + 1264.14855957, + 1248.23461914, + 1229.82922363, + 1209.11425781, + 1186.2878418, + 1161.55883789, + 1135.14245605, + 1107.25793457, + 1078.12231445, + 1047.94946289, + 1016.94714355, + 985.821166992, + 957.398681641, + 928.625244141, + 899.638793945, + 870.567016602, + 841.526489258, + 812.623657227, + 783.954040527, + 755.602600098, + 727.644897461, + 700.14642334, + 680.525085449, + 662.243774414, + 644.148803711, + 626.268066406, + 608.625976562, + 1221.47717285, + 1220.09375, + 1215.95825195, + 1209.11425781, + 1199.63305664, + 1187.61206055, + 1173.171875, + 1156.4543457, + 1137.61865234, + 1116.83618164, + 1094.29150391, + 1070.17382812, + 1044.67651367, + 1017.99298096, + 990.31439209, + 965.022094727, + 939.153625488, + 912.780822754, + 886.038757324, + 859.054748535, + 831.947387695, + 804.825805664, + 777.789794922, + 750.929992676, + 724.327697754, + 698.05480957, + 679.84564209, + 662.243774414, + 644.769775391, + 627.454284668, + 610.324890137, + 593.40612793, + 1146.34912109, + 1145.09570312, + 1141.34753418, + 1135.14245605, + 1126.54284668, + 1115.63220215, + 1102.515625, + 1087.31677246, + 1070.17382812, + 1051.23815918, + 1030.67077637, + 1008.63867188, + 985.821166992, + 964.170288086, + 941.609558105, + 918.278381348, + 894.312011719, + 869.840393066, + 844.988342285, + 819.872619629, + 794.602966309, + 769.28125, + 744.001464844, + 718.848205566, + 694.713867188, + 677.813598633, + 660.946289062, + 644.148803711, + 627.454284668, + 610.893005371, + 594.491943359, + 578.275817871, + 1075.84216309, + 1074.70495605, + 1071.30383301, + 1065.67199707, + 1057.86291504, + 1047.94946289, + 1036.02380371, + 1022.19244385, + 1006.57757568, + 989.356018066, + 972.740783691, + 954.878356934, + 935.89440918, + 
915.916137695, + 895.070007324, + 873.481933594, + 851.274475098, + 828.566345215, + 805.471496582, + 782.097106934, + 758.545227051, + 734.910339355, + 711.280090332, + 690.50970459, + 674.449768066, + 658.364379883, + 642.291992188, + 626.268066406, + 610.324890137, + 594.491943359, + 578.796020508, + 563.261047363, + 1009.67150879, + 1008.63867188, + 1005.54919434, + 1000.43200684, + 993.333496094, + 984.940734863, + 975.335021973, + 964.170288086, + 951.533935547, + 937.521850586, + 922.236572266, + 905.786682129, + 888.28326416, + 869.840393066, + 850.572570801, + 830.592407227, + 810.01184082, + 788.938354492, + 767.476257324, + 745.724487305, + 723.776733398, + 701.721008301, + 684.97052002, + 669.78717041, + 654.522705078, + 639.217041016, + 623.906738281, + 608.625976562, + 593.40612793, + 578.275817871, + 563.261047363, + 548.385559082}, + {0, + 0, + 0, + 0, + 1554.1237793, + 1242.53955078, + 993.424560547, + 821.738647461, + 688.023742676, + 576.067199707, + 482.328430176, + 403.842987061, + 338.128967285, + 283.233520508, + 237.367095947, + 198.928222656, + 166.714080811, + 139.71661377, + 117.091148376, + 100.366226196, + 93.5875701904, + 87.2667160034, + 81.3727798462, + 75.8769226074, + 70.7522583008, + 65.9736862183, + 62.4703788757, + 59.2027587891, + 56.1060714722, + 53.1713485718, + 50.3901405334, + 47.7544021606, + 0, + 0, + 0, + 0, + 1511.89892578, + 1215.3125, + 975.19708252, + 811.432128906, + 680.458190918, + 570.428588867, + 478.074890137, + 400.60269165, + 335.640289307, + 281.318328857, + 235.876022339, + 197.761489868, + 165.797119141, + 138.993164062, + 116.518371582, + 100.18183136, + 93.4242019653, + 87.1216278076, + 81.2436294556, + 75.76171875, + 70.6492919922, + 65.8815155029, + 62.4058837891, + 59.1439094543, + 56.0522842407, + 53.1221389771, + 50.3450584412, + 47.7130508423, + 0, + 0, + 0, + 0, + 1398.31689453, + 1139.93933105, + 926.469055176, + 781.859680176, + 658.586914062, + 554.038146973, + 465.659057617, + 391.113586426, + 
328.333618164, + 275.682922363, + 231.480926514, + 194.317352295, + 163.08682251, + 136.852478027, + 114.822052002, + 99.6321716309, + 92.9370193481, + 86.6887664795, + 80.8581924438, + 75.4178009033, + 70.3418579102, + 65.6360702515, + 62.2131195068, + 58.9679450989, + 55.8914489746, + 52.974937439, + 50.2101821899, + 47.5893363953, + 0, + 0, + 0, + 0, + 1242.53955078, + 1031.72070312, + 865.446105957, + 736.612304688, + 624.660888672, + 528.352294922, + 446.048339844, + 376.032348633, + 316.662597656, + 266.643341064, + 224.406723022, + 188.757797241, + 158.70111084, + 133.381164551, + 112.06615448, + 98.727722168, + 92.1346588135, + 85.9753189087, + 80.2224273682, + 74.8501815796, + 69.8341827393, + 65.2862091064, + 61.8940887451, + 58.6766357422, + 55.625087738, + 52.7310905457, + 49.9866943359, + 47.3842887878, + 1554.1237793, + 1511.89892578, + 1398.31689453, + 1242.53955078, + 1072.70471191, + 913.631103516, + 791.500732422, + 680.458190918, + 581.79699707, + 495.450836182, + 420.656097412, + 356.335296631, + 301.3465271, + 254.681488037, + 214.999893188, + 181.334152222, + 152.824005127, + 128.714950562, + 108.351615906, + 97.4852905273, + 91.031036377, + 84.9928512573, + 79.3460922241, + 74.0670623779, + 69.1331710815, + 64.8018112183, + 61.4521331787, + 58.2728424072, + 55.2557067871, + 52.3927955627, + 49.6765098572, + 47.0995864868, + 1242.53955078, + 1215.3125, + 1139.93933105, + 1031.72070312, + 913.631103516, + 811.432128906, + 711.604919434, + 618.224060059, + 533.334899902, + 457.657806396, + 391.113586426, + 333.178527832, + 283.233520508, + 240.389190674, + 203.691101074, + 172.362747192, + 145.689254761, + 123.027793884, + 103.808616638, + 95.9272842407, + 89.6447677612, + 83.7569274902, + 78.2422027588, + 73.0794143677, + 68.2481460571, + 64.1881027222, + 60.8917617798, + 57.7605171204, + 54.7867393494, + 51.9630203247, + 49.2822341919, + 46.7375259399, + 993.424560547, + 975.19708252, + 926.469055176, + 865.446105957, + 791.500732422, + 
711.604919434, + 631.209533691, + 554.038146973, + 482.328430176, + 417.209503174, + 359.053955078, + 307.756866455, + 263.145568848, + 224.406723022, + 190.955062866, + 162.196685791, + 137.561203003, + 116.518371582, + 100.551231384, + 94.0806045532, + 87.9983291626, + 82.2864456177, + 76.9266967773, + 71.9007492065, + 67.1905670166, + 63.4515571594, + 60.2185935974, + 57.1445236206, + 54.2224197388, + 51.4454956055, + 48.8071632385, + 46.300994873, + 821.738647461, + 811.432128906, + 781.859680176, + 736.612304688, + 680.458190918, + 618.224060059, + 554.038146973, + 491.015136719, + 431.260406494, + 376.032348633, + 325.949676514, + 281.318328857, + 241.920471191, + 207.367233276, + 177.270202637, + 151.198699951, + 128.714981079, + 109.396255493, + 98.1919021606, + 91.9756011963, + 86.1172180176, + 80.602897644, + 75.4178009033, + 70.5465774536, + 65.9736938477, + 62.5996856689, + 59.4391670227, + 56.4305839539, + 53.5677680969, + 50.8446311951, + 48.2551269531, + 45.7933807373, + 688.023742676, + 680.458190918, + 658.586914062, + 624.660888672, + 581.79699707, + 533.334899902, + 482.328430176, + 431.260406494, + 381.958526611, + 335.640289307, + 293.096069336, + 254.681427002, + 220.30645752, + 189.852081299, + 163.08682251, + 139.716659546, + 119.419540405, + 101.869186401, + 95.5869064331, + 89.6447677612, + 84.0289993286, + 78.7296905518, + 73.7354660034, + 69.0339279175, + 64.8706207275, + 61.6408843994, + 58.5607948303, + 55.625087738, + 52.8284225464, + 50.1653556824, + 47.6305236816, + 45.2185592651, + 576.067199707, + 570.428588867, + 554.038146973, + 528.352294922, + 495.450836182, + 457.657806396, + 417.209503174, + 376.032348633, + 335.640289307, + 297.179992676, + 261.421112061, + 228.614364624, + 198.928222656, + 172.362747192, + 148.803924561, + 128.066162109, + 109.923492432, + 98.727722168, + 92.7755966187, + 87.1216278076, + 81.7623062134, + 76.6913833618, + 71.9007492065, + 67.3809127808, + 63.7176971436, + 60.5841941833, + 57.5914230347, + 
54.7350311279, + 52.0104789734, + 49.4130935669, + 46.938117981, + 44.5807800293, + 482.328430176, + 478.074890137, + 465.659057617, + 446.048339844, + 420.656097412, + 391.113586426, + 359.053955078, + 325.949676514, + 293.096069336, + 261.421112061, + 231.480926514, + 203.691101074, + 178.274459839, + 155.306304932, + 134.755508423, + 116.518371582, + 101.297218323, + 95.4174804688, + 89.7970199585, + 84.4398193359, + 79.3460922241, + 74.5129318237, + 69.935256958, + 65.6360702515, + 62.4703788757, + 59.4391593933, + 56.5394515991, + 53.7678108215, + 51.1205253601, + 48.5936508179, + 46.1830673218, + 43.8845863342, + 403.842987061, + 400.60269165, + 391.113586426, + 376.032348633, + 356.335296631, + 333.178527832, + 307.756866455, + 281.318328857, + 254.681427002, + 228.614364624, + 203.691101074, + 180.30632019, + 158.70111084, + 138.99319458, + 121.205970764, + 105.29486084, + 97.4852981567, + 91.9756011963, + 86.6887817383, + 81.6321105957, + 76.8088912964, + 72.2191696167, + 67.8605422974, + 64.1881027222, + 61.1397399902, + 58.2155418396, + 55.4135246277, + 52.7310905457, + 50.1653556824, + 47.7130508423, + 45.3707275391, + 43.1347961426, + 338.128967285, + 335.640289307, + 328.333618164, + 316.662597656, + 301.3465271, + 283.233520508, + 263.145568848, + 241.920471191, + 220.30645752, + 198.928222656, + 178.274459839, + 158.70111084, + 140.445129395, + 123.643127441, + 108.351615906, + 98.9074707031, + 93.5875701904, + 88.4422607422, + 83.4863510132, + 78.7296905518, + 74.1781234741, + 69.8341827393, + 65.7064590454, + 62.6645126343, + 59.736907959, + 56.9232521057, + 54.2224197388, + 51.6327171326, + 49.1519355774, + 46.7775268555, + 44.5066757202, + 42.336353302, + 283.233520508, + 281.318328857, + 275.682922363, + 266.643341064, + 254.681488037, + 240.389190674, + 224.406723022, + 207.367233276, + 189.852081299, + 172.362747192, + 155.306304932, + 138.99319458, + 123.643127441, + 109.396255493, + 99.632194519, + 94.5781402588, + 89.6447677612, + 
84.8540420532, + 80.2224273682, + 75.76171875, + 71.4796905518, + 67.3809127808, + 63.9857673645, + 61.0775909424, + 58.2728424072, + 55.5720672607, + 52.974937439, + 50.4805107117, + 48.0872917175, + 45.7933807373, + 43.5965652466, + 41.4943313599, + 237.367095947, + 235.876022339, + 231.480926514, + 224.406723022, + 214.999893188, + 203.691101074, + 190.955062866, + 177.270202637, + 163.08682251, + 148.803924561, + 134.755508423, + 121.205970764, + 108.351615906, + 99.632194519, + 94.9123535156, + 90.2564239502, + 85.6926956177, + 81.2436294556, + 76.9266967773, + 72.7549057007, + 68.7375793457, + 65.077835083, + 62.2131195068, + 59.4391593933, + 56.7582321167, + 54.1716041565, + 51.6796913147, + 49.2822341919, + 46.9784011841, + 44.7668800354, + 42.6460189819, + 40.6138343811, + 198.928222656, + 197.761489868, + 194.317352295, + 188.757797241, + 181.334152222, + 172.362747192, + 162.196685791, + 151.198699951, + 139.716659546, + 128.066162109, + 116.518371582, + 105.29486084, + 98.9074707031, + 94.5781402588, + 90.2564239502, + 85.9753189087, + 81.7623062134, + 77.6397247314, + 73.6254806519, + 69.7333374023, + 65.9736938477, + 63.1215667725, + 60.4009246826, + 57.7605171204, + 55.2032775879, + 52.7310905457, + 50.3450584412, + 48.0454750061, + 45.832118988, + 43.7042236328, + 41.6606483459, + 39.699886322, + 166.714080811, + 165.797119141, + 163.08682251, + 158.70111084, + 152.824005127, + 145.689254761, + 137.561203003, + 128.714981079, + 119.419540405, + 109.923492432, + 101.297218323, + 97.4852981567, + 93.5875701904, + 89.6447677612, + 85.6926956177, + 81.7623062134, + 77.8798141479, + 74.0670623779, + 70.3418579102, + 66.7183837891, + 63.7845306396, + 61.1397399902, + 58.5607872009, + 56.0522842407, + 53.6176719666, + 51.2593917847, + 48.9790611267, + 46.7775268555, + 44.6550750732, + 42.6114387512, + 40.6459236145, + 38.7575035095, + 139.71661377, + 138.993164062, + 136.852478027, + 133.381164551, + 128.714950562, + 123.027793884, + 116.518371582, + 
109.396255493, + 101.869186401, + 98.727722168, + 95.4174804688, + 91.9756011963, + 88.4422607422, + 84.8540420532, + 81.2436294556, + 77.6397247314, + 74.0670623779, + 70.5465774536, + 67.0957107544, + 64.1881027222, + 61.6408843994, + 59.1439094543, + 56.7033996582, + 54.3243103027, + 52.0104789734, + 49.7648124695, + 47.5893363953, + 45.4853858948, + 43.4536628723, + 41.4943313599, + 39.6071586609, + 37.7914886475, + 117.091148376, + 116.518371582, + 114.822052002, + 112.06615448, + 108.351615906, + 103.808616638, + 100.551231384, + 98.1919021606, + 95.5869064331, + 92.7755966187, + 89.7970199585, + 86.6887817383, + 83.4863510132, + 80.2224273682, + 76.9266967773, + 73.6254806519, + 70.3418579102, + 67.0957107544, + 64.3236160278, + 61.8940887451, + 59.4985046387, + 57.1445236206, + 54.83852005, + 52.5856742859, + 50.3901405334, + 48.2551269531, + 46.1830673218, + 44.1756248474, + 42.2339172363, + 40.3584899902, + 38.5494613647, + 36.8065338135, + 100.366226196, + 100.18183136, + 99.6321716309, + 98.727722168, + 97.4852905273, + 95.9272842407, + 94.0806045532, + 91.9756011963, + 89.6447677612, + 87.1216278076, + 84.4398193359, + 81.6321105957, + 78.7296905518, + 75.76171875, + 72.7549057007, + 69.7333374023, + 66.7183837891, + 64.1881027222, + 61.8940887451, + 59.6175003052, + 57.3672447205, + 55.1509132385, + 52.974937439, + 50.8446311951, + 48.7643356323, + 46.7375259399, + 44.7668800354, + 42.8544273376, + 41.0015525818, + 39.2091522217, + 37.4776496887, + 35.8070869446, + 93.5875701904, + 93.4242019653, + 92.9370193481, + 92.1346588135, + 91.031036377, + 89.6447677612, + 87.9983291626, + 86.1172180176, + 84.0289993286, + 81.7623062134, + 79.3460922241, + 76.8088912964, + 74.1781234741, + 71.4796905518, + 68.7375793457, + 65.9736938477, + 63.7845306396, + 61.6408843994, + 59.4985046387, + 57.3672447205, + 55.2557067871, + 53.1713485718, + 51.1205253601, + 49.1086196899, + 47.1400909424, + 45.2185668945, + 43.3469619751, + 41.5275192261, + 39.7618980408, + 
38.0512619019, + 36.396320343, + 34.7973823547, + 87.2667160034, + 87.1216278076, + 86.6887664795, + 85.9753189087, + 84.9928512573, + 83.7569274902, + 82.2864456177, + 80.602897644, + 78.7296905518, + 76.6913833618, + 74.5129318237, + 72.2191696167, + 69.8341827393, + 67.3809127808, + 65.077835083, + 63.1215667725, + 61.1397399902, + 59.1439094543, + 57.1445236206, + 55.1509132385, + 53.1713485718, + 51.2130279541, + 49.2822341919, + 47.3842887878, + 45.5237045288, + 43.7042160034, + 41.9289016724, + 40.2001571655, + 38.5198707581, + 36.8894119263, + 35.3097419739, + 33.7813949585, + 81.3727798462, + 81.2436294556, + 80.8581924438, + 80.2224273682, + 79.3460922241, + 78.2422027588, + 76.9266967773, + 75.4178009033, + 73.7354660034, + 71.9007492065, + 69.935256958, + 67.8605422974, + 65.7064590454, + 63.9857673645, + 62.2131195068, + 60.4009246826, + 58.5607872009, + 56.7033996582, + 54.83852005, + 52.974937439, + 51.1205253601, + 49.2822341919, + 47.4661407471, + 45.6774940491, + 43.9208030701, + 42.199848175, + 40.5177879333, + 38.877155304, + 37.2799949646, + 35.7278671265, + 34.2218933105, + 32.9320831299, + 75.8769226074, + 75.76171875, + 75.4178009033, + 74.8501815796, + 74.0670623779, + 73.0794143677, + 71.9007492065, + 70.5465774536, + 69.0339279175, + 67.3809127808, + 65.6360702515, + 64.1881027222, + 62.6645126343, + 61.0775909424, + 59.4391593933, + 57.7605171204, + 56.0522842407, + 54.3243103027, + 52.5856742859, + 50.8446311951, + 49.1086196899, + 47.3842887878, + 45.6774940491, + 43.9933776855, + 42.336353302, + 40.7102241516, + 39.1181678772, + 37.5628471375, + 36.0463790894, + 34.5704689026, + 33.2245254517, + 32.1316642761, + 70.7522583008, + 70.6492919922, + 70.3418579102, + 69.8341827393, + 69.1331710815, + 68.2481460571, + 67.1905670166, + 65.9736938477, + 64.8706207275, + 63.7176971436, + 62.4703788757, + 61.1397399902, + 59.736907959, + 58.2728424072, + 56.7582321167, + 55.2032775879, + 53.6176719666, + 52.0104789734, + 50.3901405334, + 
48.7643356323, + 47.1400909424, + 45.5237045288, + 43.9208030701, + 42.336353302, + 40.7746887207, + 39.2395439148, + 37.7341346741, + 36.2611160278, + 34.8227424622, + 33.4466362, + 32.3772735596, + 31.328754425, + 65.9736862183, + 65.8815155029, + 65.6360702515, + 65.2862091064, + 64.8018112183, + 64.1881027222, + 63.4515571594, + 62.5996856689, + 61.6408843994, + 60.5841941833, + 59.4391593933, + 58.2155418396, + 56.9232521057, + 55.5720672607, + 54.1716041565, + 52.7310905457, + 51.2593917847, + 49.7648124695, + 48.2551269531, + 46.7375259399, + 45.2185668945, + 43.7042160034, + 42.199848175, + 40.7102241516, + 39.2395439148, + 37.7914886475, + 36.3692169189, + 34.9754295349, + 33.6123847961, + 32.5545730591, + 31.5306549072, + 30.5255126953, + 62.4703788757, + 62.4058837891, + 62.2131195068, + 61.8940887451, + 61.4521331787, + 60.8917617798, + 60.2185935974, + 59.4391670227, + 58.5607948303, + 57.5914230347, + 56.5394515991, + 55.4135246277, + 54.2224197388, + 52.974937439, + 51.6796913147, + 50.3450584412, + 48.9790611267, + 47.5893363953, + 46.1830673218, + 44.7668800354, + 43.3469619751, + 41.9289016724, + 40.5177879333, + 39.1181678772, + 37.7341346741, + 36.3692169189, + 35.0265541077, + 33.7088127136, + 32.6617202759, + 31.6664142609, + 30.6867351532, + 29.723903656, + 59.2027587891, + 59.1439094543, + 58.9679450989, + 58.6766357422, + 58.2728424072, + 57.7605171204, + 57.1445236206, + 56.4305839539, + 55.625087738, + 54.7350311279, + 53.7678108215, + 52.7310905457, + 51.6327171326, + 50.4805107117, + 49.2822341919, + 48.0454750061, + 46.7775268555, + 45.4853858948, + 44.1756248474, + 42.8544273376, + 41.5275192261, + 40.2001571655, + 38.877155304, + 37.5628471375, + 36.2611160278, + 34.9754295349, + 33.7088127136, + 32.6975517273, + 31.7346553802, + 30.7841281891, + 29.8474140167, + 28.9257545471, + 56.1060714722, + 56.0522842407, + 55.8914489746, + 55.625087738, + 55.2557067871, + 54.7867393494, + 54.2224197388, + 53.5677680969, + 52.8284225464, + 
52.0104789734, + 51.1205253601, + 50.1653556824, + 49.1519355774, + 48.0872917175, + 46.9784011841, + 45.832118988, + 44.6550750732, + 43.4536628723, + 42.2339172363, + 41.0015525818, + 39.7618980408, + 38.5198707581, + 37.2799949646, + 36.0463790894, + 34.8227424622, + 33.6123847961, + 32.6617202759, + 31.7346553802, + 30.8167037964, + 29.9094753265, + 29.0143985748, + 28.1327323914, + 53.1713485718, + 53.1221389771, + 52.974937439, + 52.7310905457, + 52.3927955627, + 51.9630203247, + 51.4454956055, + 50.8446311951, + 50.1653556824, + 49.4130935669, + 48.5936508179, + 47.7130508423, + 46.7775268555, + 45.7933807373, + 44.7668800354, + 43.7042236328, + 42.6114387512, + 41.4943313599, + 40.3584899902, + 39.2091522217, + 38.0512619019, + 36.8894119263, + 35.7278671265, + 34.5704689026, + 33.4466362, + 32.5545730591, + 31.6664142609, + 30.7841281891, + 29.9094753265, + 29.0440425873, + 28.1892433167, + 27.3463401794, + 50.3901405334, + 50.3450584412, + 50.2101821899, + 49.9866943359, + 49.6765098572, + 49.2822341919, + 48.8071632385, + 48.2551269531, + 47.6305236816, + 46.938117981, + 46.1830673218, + 45.3707275391, + 44.5066757202, + 43.5965652466, + 42.6460189819, + 41.6606483459, + 40.6459236145, + 39.6071586609, + 38.5494613647, + 37.4776496887, + 36.396320343, + 35.3097419739, + 34.2218933105, + 33.2245254517, + 32.3772735596, + 31.5306549072, + 30.6867351532, + 29.8474140167, + 29.0143985748, + 28.1892433167, + 27.3733463287, + 26.5679397583, + 47.7544021606, + 47.7130508423, + 47.5893363953, + 47.3842887878, + 47.0995864868, + 46.7375259399, + 46.300994873, + 45.7933807373, + 45.2185592651, + 44.5807800293, + 43.8845863342, + 43.1347961426, + 42.336353302, + 41.4943313599, + 40.6138343811, + 39.699886322, + 38.7575035095, + 37.7914886475, + 36.8065338135, + 35.8070869446, + 34.7973823547, + 33.7813949585, + 32.9320831299, + 32.1316642761, + 31.328754425, + 30.5255126953, + 29.723903656, + 28.9257545471, + 28.1327323914, + 27.3463401794, + 26.5679397583, + 
25.7987575531}}; + +ap_uint<24> inv_matrix_8_fix[3][64] = { + {0, 573440, 571914, 500934, 438764, 384309, 336613, 294836, 573440, 573440, 554300, 490277, 431664, + 379299, 332941, 292072, 571914, 554300, 512454, 462308, 412154, 365186, 322442, 284093, 500934, 490277, + 462308, 424880, 384309, 344238, 306460, 271733, 438764, 431664, 412154, 384309, 352272, 319103, 286702, + 256122, 384309, 379299, 365186, 344238, 319103, 292072, 264820, 238433, 336613, 332941, 322442, 306460, + 286702, 264820, 242160, 219708, 294836, 292072, 284093, 271733, 256122, 238433, 219708, 200777}, + {0, 3225600, 3214600, 2712197, 2288313, 1930677, 1628936, 1374353, 3225600, 3225600, 3088189, + 2638422, 2240937, 1898460, 1606185, 1357854, 3214600, 3088189, 2792443, 2446967, 2111879, 1808349, + 1541522, 1310462, 2712197, 2638422, 2446967, 2195873, 1930677, 1676412, 1444223, 1237800, 2288313, + 2240937, 2111879, 1930677, 1726752, 1521076, 1325921, 1086047, 1930677, 1898460, 1808349, 1676412, + 1521076, 1357854, 1197559, 804826, 1628936, 1606185, 1541522, 1444223, 1325921, 1197559, 858830, + 571430, 1374353, 1357854, 1310462, 1237800, 1086047, 804826, 571430, 391838}, + {0, 301014, 173537, 122278, 87381, 87381, 85556, 60284, 301014, 239204, 159771, 115525, 87381, + 87381, 83112, 58803, 173537, 159771, 129848, 98919, 87381, 87381, 76366, 54653, 122278, 115525, + 98919, 87381, 87381, 87381, 66768, 48594, 87381, 87381, 87381, 87381, 87381, 74294, 55990, + 40365, 87381, 87381, 87381, 87381, 74294, 58803, 45395, 29913, 85556, 83112, 76366, 66768, + 55990, 45395, 31920, 21238, 60284, 58803, 54653, 48594, 40365, 29913, 21238, 14563}}; + +ap_uint<24> inv_matrix_16_fix[3][256] = { + {0, 0, 2441638, 2110453, 1805935, 1527539, 1292059, 1092880, 979637, 882036, 794159, 720192, + 660388, 605551, 555267, 513377, 0, 0, 2359049, 2061117, 1769095, 1502424, 1274280, 1079954, + 973257, 876924, 790013, 717366, 658011, 603538, 553552, 512061, 2441638, 2359049, 2163900, 1929223, + 1668681, 1432145, 1223726, 1056519, 
954648, 861946, 777824, 709020, 650980, 597575, 548467, 508152, + 2110453, 2061117, 1929223, 1734043, 1527539, 1329150, 1147596, 1019948, 925274, 838103, 758296, 695541, + 639580, 587878, 541090, 501757, 1805935, 1769095, 1668681, 1527539, 1368461, 1207734, 1064198, 973257, + 887210, 806857, 734675, 677512, 624254, 574785, 531076, 493051, 1527539, 1502424, 1432145, 1329150, + 1207734, 1079954, 999342, 919628, 842764, 769898, 709020, 655651, 605551, 558724, 518704, 482257, + 1292059, 1274280, 1223726, 1147596, 1064198, 999342, 930994, 861946, 794159, 731730, 680030, 630735, + 584081, 541090, 504297, 469635, 1092880, 1079954, 1056519, 1019948, 973257, 919628, 861946, 802580, + 743657, 695541, 648668, 603538, 560468, 522767, 488199, 455467, 979637, 973257, 954648, 925274, + 887210, 842764, 794159, 743657, 700870, 658011, 615808, 574785, 536755, 503024, 470758, 440042, + 882036, 876924, 861946, 838103, 806857, 769898, 731730, 695541, 658011, 620003, 582200, 545493, + 513377, 482257, 452306, 423640, 794159, 790013, 777824, 758296, 734675, 709020, 680030, 648668, + 615808, 582200, 548467, 518704, 489404, 460825, 433156, 404651, 720192, 717366, 709020, 695541, + 677512, 655651, 630735, 603538, 574785, 545493, 518704, 491830, 465193, 439046, 412893, 384233, + 660388, 658011, 650980, 639580, 624254, 605551, 584081, 560468, 536755, 513377, 489404, 465193, + 441041, 417116, 389895, 363695, 605551, 603538, 597575, 587878, 574785, 558724, 541090, 522767, + 503024, 482257, 460825, 439046, 417116, 391813, 367140, 343268, 555267, 553552, 548467, 541090, + 531076, 518704, 504297, 488199, 470758, 452306, 433156, 412893, 389895, 367140, 344835, 323147, + 513377, 512061, 508152, 501757, 493051, 482257, 469635, 455467, 440042, 423640, 404651, 384233, + 363695, 343268, 323147, 303491}, + {0, 0, 5751209, 4544049, 3799576, 3391573, 3027382, 2702299, 2436075, 2197740, 1982723, 1763518, + 1534572, 1335348, 1161989, 974727, 0, 0, 5440084, 4373603, 3746811, 3353638, 2999054, 2680571, + 
2420510, 2185243, 1972567, 1752429, 1525721, 1328236, 1156239, 968844, 5751209, 5440084, 4731487, 3973698, + 3601163, 3246381, 2917801, 2623494, 2375098, 2148620, 1942705, 1719843, 1499653, 1307247, 1139247, 951484, + 4544049, 4373603, 3973698, 3696279, 3391573, 3086079, 2793372, 2534376, 2303382, 2090297, 1894846, 1667696, + 1457751, 1273387, 1103434, 923481, 3799576, 3746811, 3601163, 3391573, 3147719, 2891875, 2642200, 2420510, + 2210386, 2013818, 1820750, 1598900, 1402122, 1228203, 1055862, 886125, 3391573, 3353638, 3246381, 3086079, + 2891875, 2680571, 2484135, 2289592, 2101702, 1923283, 1719843, 1516952, 1335348, 1173618, 998753, 841041, + 3027382, 2999054, 2917801, 2793372, 2642200, 2484135, 2317350, 2148620, 1982723, 1809059, 1608443, 1425546, + 1260221, 1103434, 934545, 790037, 2702299, 2680571, 2623494, 2534376, 2420510, 2289592, 2148620, 2003346, + 1856592, 1667696, 1491121, 1328236, 1179499, 1017307, 865695, 734963, 2436075, 2420510, 2375098, 2303382, + 2210386, 2101702, 1982723, 1856592, 1688240, 1525721, 1371816, 1228203, 1082691, 928991, 794499, 677590, + 2197740, 2185243, 2148620, 2090297, 2013818, 1923283, 1809059, 1667696, 1525721, 1386839, 1253715, 1124732, + 974727, 841041, 722986, 619515, 1982723, 1972567, 1942705, 1894846, 1820750, 1719843, 1608443, 1491121, + 1371816, 1253715, 1139247, 998752, 870742, 755519, 652842, 544631, 1763518, 1752429, 1719843, 1667696, + 1598900, 1516952, 1425546, 1328236, 1228203, 1124732, 998752, 880957, 772520, 673980, 578732, 465986, + 1534572, 1525721, 1499653, 1457751, 1402122, 1335348, 1260221, 1179499, 1082691, 974727, 870742, 772520, + 681226, 596747, 486978, 394922, 1335348, 1328236, 1307247, 1273387, 1228203, 1173618, 1103434, 1017307, + 928991, 841041, 755519, 673980, 596747, 494226, 406298, 331816, 1161989, 1156239, 1139247, 1103434, + 1055862, 998753, 934545, 865695, 794499, 722986, 652842, 578732, 486978, 406298, 336400, 276619, + 974727, 968844, 951484, 923481, 886125, 841041, 790037, 734963, 677590, 
619515, 544631, 465986, + 394922, 331816, 276619, 228975}, + {0, 0, 630388, 459728, 346040, 270138, 210885, 164629, 145236, 129332, 115170, 103181, 93307, 84378, + 76303, 60309, 0, 0, 585115, 436769, 335651, 263595, 206607, 161757, 144191, 128504, 114506, 102711, + 92918, 84053, 76030, 59794, 630388, 585115, 485316, 381545, 307860, 245565, 194600, 157883, 141148, 126081, + 112555, 101326, 91767, 83090, 75220, 58282, 459728, 436769, 381545, 325861, 270138, 219900, 176964, 151854, + 136356, 122232, 109437, 99095, 89905, 81527, 71893, 55867, 346040, 335651, 307860, 270138, 229587, 190851, + 159151, 144191, 130171, 117207, 105593, 96120, 87409, 79424, 67542, 52693, 270138, 263595, 245565, 219900, + 190851, 161757, 148468, 135437, 122984, 111288, 101326, 92531, 84378, 76855, 62426, 48936, 210885, 206607, + 194600, 176964, 159151, 148468, 137288, 126081, 115170, 105102, 96535, 88463, 80917, 71893, 56818, 44786, + 164629, 161757, 157883, 151854, 144191, 135437, 126081, 116520, 107092, 99095, 91389, 84053, 77133, 64075, + 50981, 40428, 145236, 144191, 141148, 136356, 130171, 122984, 115170, 107092, 99976, 92918, 86039, 79424, + 69986, 56340, 45145, 36032, 129332, 128504, 126081, 122232, 117207, 111288, 105102, 99095, 92918, 86719, + 80614, 73866, 60309, 48936, 39498, 31736, 115170, 114506, 112555, 109437, 105593, 101326, 96535, 91389, + 86039, 80614, 75220, 62426, 51402, 42040, 34182, 25375, 103181, 102711, 101326, 99095, 96120, 92531, + 88463, 84053, 79424, 73866, 62426, 52258, 43386, 35760, 28391, 19017, 93307, 92918, 91767, 89905, + 87409, 84378, 80917, 77133, 69986, 60309, 51402, 43386, 36306, 30047, 20632, 14004, 84378, 84053, + 83090, 81527, 79424, 76855, 71893, 64075, 56340, 48936, 42040, 35760, 30047, 21203, 14759, 10149, + 76303, 76030, 75220, 71893, 67542, 62426, 56818, 50981, 45145, 39498, 34182, 28391, 20632, 14759, + 10410, 7249, 60309, 59794, 58282, 55867, 52693, 48936, 44786, 40428, 36032, 31736, 25375, 19017, + 14004, 10149, 7249, 5111}}; + +ap_uint<24> 
inv_matrix_32_fix[3][1024] = { + {0, 0, 0, 0, 5131959, 4670491, 4250519, 3878761, 3542927, 3236170, 2955973, 2700037, + 2466260, 2274087, 2118957, 1974409, 1839722, 1714223, 1597285, 1490256, 1397152, 1309866, 1228032, 1151311, + 1079383, 1011949, 954704, 901007, 850330, 802503, 757366, 714767, 0, 0, 0, 0, + 5072775, 4627118, 4217502, 3853877, 3523007, 3219979, 2942652, 2688968, 2456987, 2267924, 2113623, 1969770, + 1835669, 1710667, 1594156, 1487730, 1394902, 1307856, 1226234, 1149698, 1077934, 1010645, 953642, 900042, + 849451, 801702, 756636, 714101, 0, 0, 0, 0, 4908661, 4504031, 4123432, 3781605, + 3464803, 3172463, 2903431, 2656298, 2429565, 2249644, 2097784, 1955978, 1823608, 1700081, 1584833, 1480198, + 1388190, 1301861, 1220866, 1144883, 1073606, 1006935, 950469, 897157, 846825, 799309, 754452, 712106, + 0, 0, 0, 0, 4670491, 4318764, 3982627, 3668379, 3372610, 3096591, 2840422, 2603565, + 2385137, 2219847, 2071905, 1933401, 1803834, 1682702, 1569509, 1467797, 1377130, 1291973, 1212008, 1136931, + 1066456, 1001153, 945218, 892382, 842477, 795345, 750834, 708800, 5131959, 5072775, 4908661, 4670491, + 4390194, 4094197, 3805312, 3523007, 3252544, 2996713, 2756785, 2533105, 2331170, 2179470, 2036722, 1902624, + 1776815, 1658906, 1548492, 1450748, 1361905, 1278347, 1199788, 1125953, 1056576, 993151, 937947, 885767, + 836451, 789848, 745815, 704212, 4670491, 4627118, 4504031, 4318764, 4094197, 3853877, 3604336, 3354843, + 3111448, 2877883, 2656298, 2447781, 2274087, 2129705, 1993181, 1864403, 1743163, 1629194, 1522192, 1429345, + 1342761, 1261188, 1184381, 1112095, 1044091, 983020, 928735, 877378, 828804, 782869, 739438, 698619, + 4250519, 4217502, 4123432, 3982627, 3805312, 3604336, 3390593, 3172463, 2955973, 2745243, 2542942, 2350878, + 2208155, 2071905, 1942371, 1819621, 1703597, 1594156, 1492790, 1403941, 1319993, 1240747, 1165998, 1095537, + 1029156, 970871, 917676, 867300, 819609, 774471, 731761, 692068, 3878761, 3853877, 3781605, 3668379, + 3523007, 
3354843, 3172463, 2983002, 2792006, 2603565, 2420554, 2267924, 2135121, 2007491, 1885454, 1769234, + 1658906, 1554446, 1460446, 1374937, 1293940, 1217309, 1144883, 1076488, 1011949, 956834, 904884, 855630, + 808951, 764729, 722846, 684452, 3542927, 3523007, 3464803, 3372610, 3252544, 3111448, 2955973, 2792006, + 2624408, 2456987, 2305432, 2179470, 2056680, 1937876, 1823608, 1714223, 1609913, 1510754, 1424666, 1342761, + 1264967, 1191187, 1121302, 1055177, 994288, 941052, 890484, 842477, 796927, 753726, 712770, 675828, + 3236170, 3219979, 3172463, 3096591, 2996713, 2877883, 2745243, 2603565, 2456987, 2318226, 2202356, 2087355, + 1974409, 1864403, 1757973, 1655557, 1557438, 1467797, 1385966, 1307856, 1233454, 1162707, 1095537, 1031846, + 975260, 923681, 874611, 827961, 783640, 741554, 701628, 666259, 2955973, 2942652, 2903431, 2840422, + 2756785, 2656298, 2542942, 2420554, 2305432, 2202356, 2097784, 1993181, 1889719, 1788302, 1689615, 1594156, + 1503004, 1422336, 1344864, 1270671, 1199788, 1132204, 1067880, 1006935, 954704, 904884, 857409, 812207, + 769201, 728312, 690299, 655815, 2700037, 2688968, 2656298, 2603565, 2533105, 2447781, 2350878, 2267924, + 2179470, 2087355, 1993181, 1898303, 1803834, 1710668, 1619502, 1530871, 1450748, 1374937, 1301861, 1231642, + 1164351, 1100012, 1038620, 983020, 932811, 884828, 839025, 795345, 753726, 714101, 678111, 644568, + 2466260, 2456987, 2429565, 2385137, 2331170, 2274087, 2208155, 2135121, 2056680, 1974409, 1889719, 1803834, + 1717792, 1632448, 1548492, 1470262, 1397152, 1326135, 1257429, 1191187, 1127510, 1066456, 1008099, 957902, + 909769, 863682, 819609, 777508, 737332, 699219, 665148, 632591, 2274087, 2267924, 2249644, 2219847, + 2179470, 2129705, 2071905, 2007491, 1937876, 1864403, 1788302, 1710668, 1632448, 1554446, 1480198, 1410789, + 1342761, 1276421, 1212008, 1149698, 1089617, 1031846, 979682, 931789, 885767, 841612, 799309, 758829, + 720138, 684452, 651494, 619962, 2118957, 2113623, 2097784, 2071905, 2036722, 
1993181, 1942371, 1885454, + 1823608, 1757973, 1689615, 1619502, 1548492, 1480198, 1415388, 1351210, 1288054, 1226234, 1165998, 1107538, + 1050997, 997710, 950469, 904884, 860984, 818781, 778271, 739438, 702260, 669051, 637236, 606756, + 1974409, 1969770, 1955978, 1933401, 1902624, 1864403, 1819621, 1769234, 1714223, 1655557, 1594156, 1530871, + 1470262, 1410789, 1351210, 1291973, 1233454, 1175965, 1119759, 1065035, 1011949, 965432, 920670, 877378, + 835596, 795345, 756636, 719463, 685033, 653109, 622456, 593050, 1839722, 1835669, 1823608, 1803834, + 1776815, 1743163, 1703597, 1658906, 1609913, 1557438, 1503004, 1450748, 1397152, 1342761, 1288054, 1233454, + 1179319, 1125953, 1073606, 1022482, 976362, 932811, 890484, 849451, 809763, 771453, 734538, 699219, + 667374, 636717, 607238, 578919, 1714223, 1710667, 1700081, 1682702, 1658906, 1629194, 1594156, 1554446, + 1510754, 1467797, 1422336, 1374937, 1326135, 1276421, 1226234, 1175965, 1125953, 1076488, 1027816, 983020, + 941052, 900042, 860088, 821268, 783640, 747243, 712106, 679831, 649351, 619962, 591660, 564435, + 1597285, 1594156, 1584833, 1569509, 1548492, 1522192, 1492790, 1460446, 1424666, 1385966, 1344864, 1301861, + 1257429, 1212008, 1165998, 1119759, 1073606, 1027816, 985257, 945218, 905857, 867300, 829648, 792982, + 757366, 722846, 690299, 660181, 631055, 602927, 575800, 549669, 1490256, 1487730, 1480198, 1467797, + 1450748, 1429345, 1403941, 1374937, 1342761, 1307856, 1270671, 1231642, 1191187, 1149698, 1107538, 1065035, + 1022482, 983020, 945218, 907810, 870943, 834742, 799309, 764729, 731069, 698619, 669051, 640362, + 612571, 585692, 559730, 534687, 1397152, 1394902, 1388190, 1377130, 1361905, 1342761, 1319993, 1293940, + 1264967, 1233454, 1199788, 1164351, 1127510, 1089617, 1050997, 1011949, 976362, 941052, 905857, 870943, + 836451, 802503, 769201, 736632, 704865, 675828, 647750, 620459, 593980, 568330, 543519, 519552, + 1309866, 1307856, 1301861, 1291973, 1278347, 1261188, 1240747, 1217309, 1191187, 
1162707, 1132204, 1100012, + 1066456, 1031846, 997710, 965432, 932811, 900042, 867300, 834742, 802503, 770701, 739438, 708800, + 680406, 653109, 626480, 600553, 575356, 550911, 527232, 504325, 1228032, 1226234, 1220866, 1212008, + 1199788, 1184381, 1165998, 1144883, 1121302, 1095537, 1067880, 1038620, 1008099, 979682, 950469, 920670, + 890484, 860088, 829648, 799309, 769201, 739438, 710120, 682713, 656358, 630543, 605316, 580713, + 556767, 533499, 510927, 492344, 1151311, 1149698, 1144883, 1136931, 1125953, 1112095, 1095537, 1076488, + 1055177, 1031846, 1006935, 983020, 957902, 931789, 904884, 877378, 849451, 821268, 792982, 764729, + 736632, 708800, 682713, 657447, 632591, 608202, 584327, 561007, 538274, 516151, 496368, 481317, + 1079383, 1077934, 1073606, 1066456, 1056576, 1044091, 1029156, 1011949, 994288, 975260, 954704, 932811, + 909769, 885767, 860984, 835596, 809763, 783640, 757366, 731069, 704865, 680406, 656358, 632591, + 609169, 586147, 563576, 541493, 519932, 499422, 484703, 470233, 1011949, 1010645, 1006935, 1001153, + 993151, 983020, 970871, 956834, 941052, 923681, 904884, 884828, 863682, 841612, 818781, 795345, + 771453, 747243, 722846, 698619, 675828, 653109, 630543, 608202, 586147, 564435, 543113, 522221, + 501793, 487146, 473022, 459122, 954704, 953642, 950469, 945218, 937947, 928735, 917676, 904884, + 890484, 874611, 857409, 839025, 819609, 799309, 778271, 756636, 734538, 712106, 690299, 669051, + 647750, 626480, 605316, 584327, 563576, 543113, 522987, 503238, 488622, 474897, 461354, 448010, + 901007, 900042, 897157, 892382, 885767, 877378, 867300, 855630, 842477, 827961, 812207, 795345, + 777508, 758829, 739438, 719463, 699219, 679831, 660181, 640362, 620459, 600553, 580713, 561007, + 541493, 522221, 503238, 489115, 475839, 462701, 449723, 436922, 850330, 849451, 846825, 842477, + 836451, 828804, 819609, 808951, 796927, 783640, 769201, 753726, 737332, 720138, 702260, 685033, + 667374, 649351, 631055, 612571, 593980, 575356, 556767, 538274, 
519932, 501793, 488622, 475839, + 463152, 450584, 438155, 425882, 802503, 801702, 799309, 795345, 789848, 782869, 774471, 764729, + 753726, 741554, 728312, 714101, 699219, 684452, 669051, 653109, 636717, 619962, 602927, 585692, + 568330, 550911, 533499, 516151, 499422, 487146, 474897, 462701, 450584, 438567, 426670, 414910, + 757366, 756636, 754452, 750834, 745815, 739438, 731761, 722846, 712770, 701628, 690299, 678111, + 665148, 651494, 637236, 622456, 607238, 591660, 575800, 559730, 543519, 527232, 510927, 496368, + 484703, 473022, 461354, 449723, 438155, 426670, 415287, 404024, 714767, 714101, 712106, 708800, + 704212, 698619, 692068, 684452, 675828, 666259, 655815, 644568, 632591, 619962, 606756, 593050, + 578919, 564435, 549669, 534687, 519552, 504325, 492344, 481317, 470233, 459122, 448010, 436922, + 425882, 414910, 404024, 393243}, + {0, 0, 0, 0, 10256567, 9163795, 8187452, 7334503, 6576615, 5897041, 5287688, 4741301, + 4251374, 3823808, 3450992, 3114525, 2810863, 2536807, 2289472, 2087679, 1978480, 1874992, 1776918, 1683973, + 1595891, 1512415, 1420110, 1332765, 1250792, 1173861, 1101662, 1033903, 0, 0, 0, 0, + 10115302, 9062138, 8111468, 7277870, 6532108, 5861525, 5259001, 4717901, 4232130, 3808773, 3438387, 3103905, + 2801875, 2529171, 2282962, 2084732, 1975826, 1872597, 1774751, 1682009, 1594106, 1510791, 1418378, 1331200, + 1249376, 1172578, 1100497, 1032846, 0, 0, 0, 0, 9725277, 8774666, 7895240, 7113812, + 6402356, 5757510, 5174699, 4648953, 4175309, 3764280, 3401038, 3072400, 2775187, 2506479, 2263603, 2075939, + 1967905, 1865445, 1768278, 1676139, 1588774, 1505526, 1413203, 1326521, 1245141, 1168739, 1097015, 1029682, + 0, 0, 0, 0, 9163795, 8344872, 7571685, 6858080, 6197748, 5592084, 5039753, 4538028, + 4083526, 3692107, 3340291, 3021045, 2731603, 2469362, 2231895, 2061445, 1954839, 1853638, 1757587, 1666439, + 1579957, 1496049, 1404644, 1318782, 1238133, 1162385, 1091248, 1024442, 10256567, 10115302, 9725277, 9163795, + 8510160, 7827871, 7167557, 
6532108, 5932993, 5375578, 4861571, 4390531, 3963957, 3595004, 3258255, 2951472, + 2672398, 2418824, 2188634, 2041485, 1936825, 1837343, 1742818, 1653028, 1567758, 1482942, 1392801, 1308065, + 1228424, 1153579, 1083251, 1017173, 9163795, 9062138, 8774666, 8344872, 7827871, 7277870, 6714142, 6158448, + 5624412, 5119910, 4648953, 4213040, 3823808, 3476438, 3157611, 2865780, 2599225, 2356177, 2134869, 2016372, + 1914124, 1816782, 1724159, 1636067, 1552316, 1466362, 1377806, 1294488, 1216114, 1142407, 1073100, 1008579, + 8187452, 8111468, 7895240, 7571685, 7167557, 6714142, 6237570, 5757510, 5287688, 4837070, 4411073, 4012705, + 3663905, 3340291, 3041417, 2766381, 2514008, 2282962, 2090635, 1986483, 1887057, 1792226, 1701844, 1615756, + 1533801, 1446502, 1359827, 1278192, 1201328, 1128976, 1060888, 998743, 7334503, 7277870, 7113812, 6858080, + 6532108, 6158448, 5757510, 5345971, 4936475, 4538028, 4156665, 3808773, 3489280, 3190579, 2912880, 2655858, + 2418824, 2200862, 2052844, 1952246, 1855988, 1763987, 1676139, 1592325, 1512415, 1423584, 1339055, 1259345, + 1184209, 1113412, 1046725, 987310, 6576615, 6532108, 6402356, 6197748, 5932993, 5624412, 5287688, 4936475, + 4581817, 4232130, 3900570, 3595003, 3304714, 3031203, 2775187, 2536808, 2315800, 2111617, 2010873, 1914124, + 1821314, 1732406, 1647339, 1566029, 1484803, 1397857, 1315706, 1238133, 1164921, 1095858, 1030735, 974370, + 5897041, 5861525, 5757510, 5592084, 5375578, 5119910, 4837070, 4538028, 4232130, 3932039, 3649944, 3376518, + 3114525, 2865780, 2631349, 2411736, 2207015, 2061445, 1965279, 1872597, 1783449, 1697843, 1615756, 1537139, + 1453673, 1369587, 1290011, 1214758, 1143640, 1076467, 1013100, 960022, 5287688, 5259001, 5174699, 5039753, + 4861571, 4648953, 4411073, 4156665, 3900570, 3649944, 3401038, 3157611, 2922451, 2697519, 2484105, 2282962, + 2102541, 2008134, 1916621, 1828151, 1742818, 1660667, 1581713, 1505526, 1420110, 1339055, 1262216, 1189436, + 1120554, 1055406, 996086, 944370, 4741301, 
4717901, 4648953, 4538028, 4390531, 4213040, 4012705, 3808773, + 3595003, 3376518, 3157611, 2941747, 2731603, 2529172, 2335853, 2152565, 2041485, 1952246, 1865445, 1781267, + 1699841, 1621251, 1545538, 1466362, 1384439, 1306546, 1232570, 1162385, 1095858, 1032846, 977795, 927525, + 4251374, 4232130, 4175309, 4083526, 3963957, 3823808, 3663905, 3489280, 3304714, 3114525, 2922451, 2731603, + 2544480, 2363013, 2188634, 2064328, 1978480, 1894367, 1812271, 1732406, 1654932, 1579957, 1507434, 1425326, + 1346984, 1272346, 1201328, 1133832, 1069748, 1009480, 958355, 909602, 3823808, 3808773, 3764280, 3692107, + 3595004, 3476438, 3340291, 3190579, 3031203, 2865780, 2697519, 2529172, 2363013, 2200862, 2075939, 1994549, + 1914124, 1835037, 1757587, 1682009, 1608481, 1537139, 1460902, 1382776, 1308065, 1236738, 1168739, 1103997, + 1042424, 987310, 937898, 890716, 3450992, 3438387, 3401038, 3340291, 3258255, 3157611, 3041417, 2912880, + 2775187, 2631349, 2484105, 2335853, 2188634, 2075939, 1999962, 1924150, 1848955, 1774751, 1701844, 1630482, + 1560861, 1490409, 1413203, 1339055, 1267989, 1199997, 1135051, 1073100, 1014081, 964208, 916551, 870986, + 3114525, 3103905, 3072400, 3021045, 2951472, 2865780, 2766381, 2655858, 2536808, 2411736, 2282962, 2152565, + 2064328, 1994549, 1924150, 1853638, 1783449, 1713950, 1645451, 1578204, 1512415, 1437617, 1364693, 1294488, + 1227047, 1162385, 1100497, 1041353, 988182, 940317, 894445, 850526, 2810863, 2801875, 2775187, 2731603, + 2672398, 2599225, 2514008, 2418824, 2315800, 2207015, 2102541, 2041485, 1978480, 1914124, 1848955, 1783449, + 1718021, 1653028, 1588774, 1525514, 1455475, 1384439, 1315706, 1249376, 1185512, 1124152, 1065304, 1009480, + 961693, 915775, 871705, 829452, 2536807, 2529171, 2506479, 2469362, 2418824, 2356177, 2282962, 2200862, + 2111617, 2061445, 2008134, 1952246, 1894367, 1835037, 1774751, 1713950, 1653028, 1592325, 1532138, 1466362, + 1397857, 1331200, 1266542, 1203995, 1143640, 1085527, 1029682, 980376, 934687, 
890716, 848451, 807872, + 2289472, 2282962, 2263603, 2231895, 2188634, 2134869, 2090635, 2052844, 2010873, 1965279, 1916621, 1865445, + 1812271, 1757587, 1701844, 1645451, 1588774, 1532138, 1470021, 1404644, 1340635, 1278192, 1217473, 1158599, + 1101662, 1046725, 996086, 950912, 907303, 865268, 824802, 785895, 2087679, 2084732, 2075939, 2061445, + 2041485, 2016372, 1986483, 1952246, 1914124, 1872597, 1828151, 1781267, 1732406, 1682009, 1630482, 1578204, + 1525514, 1466362, 1404644, 1343804, 1284080, 1225671, 1168739, 1113412, 1059788, 1008579, 964208, 921230, + 879672, 839549, 800867, 763621, 1978480, 1975826, 1967905, 1954839, 1936825, 1914124, 1887057, 1855988, + 1821314, 1783449, 1742818, 1699841, 1654932, 1608481, 1560861, 1512415, 1455475, 1397857, 1340635, 1284080, + 1228424, 1173861, 1120554, 1068634, 1018207, 974370, 932290, 891460, 851914, 813673, 776750, 741147, + 1874992, 1872597, 1865445, 1853638, 1837343, 1816782, 1792226, 1763987, 1732406, 1697843, 1660667, 1621251, + 1579957, 1537139, 1490409, 1437617, 1384439, 1331200, 1278192, 1225671, 1173861, 1122950, 1073100, 1024442, + 981238, 940316, 900461, 861723, 824141, 787744, 752548, 718562, 1776918, 1774751, 1768278, 1757587, + 1742818, 1724159, 1701844, 1676139, 1647339, 1615756, 1581713, 1545538, 1507434, 1460902, 1413203, 1364693, + 1315706, 1266542, 1217473, 1168739, 1120554, 1073100, 1026533, 984701, 945184, 906539, 868835, 832126, + 796456, 761857, 728350, 701409, 1683973, 1682009, 1676139, 1666439, 1653028, 1636067, 1615756, 1592325, + 1566029, 1537139, 1505526, 1466362, 1425326, 1382776, 1339055, 1294488, 1249376, 1203995, 1158599, 1113412, + 1068634, 1024442, 984701, 946815, 909602, 873145, 837515, 802768, 768952, 736100, 707081, 685862, + 1595891, 1594106, 1588774, 1579957, 1567758, 1552316, 1533801, 1512415, 1484803, 1453673, 1420110, 1384439, + 1346984, 1308065, 1267989, 1227047, 1185512, 1143640, 1101662, 1059788, 1018207, 981238, 945184, 909602, + 874589, 840229, 806592, 773737, 741711, 
711387, 690636, 670231, 1512415, 1510791, 1505526, 1496049, + 1482942, 1466362, 1446502, 1423584, 1397857, 1369587, 1339055, 1306546, 1272346, 1236738, 1199997, 1162385, + 1124152, 1085527, 1046725, 1008579, 974370, 940316, 906539, 873145, 840229, 807872, 776146, 745108, + 714808, 694081, 674165, 654558, 1420110, 1418378, 1413203, 1404644, 1392801, 1377806, 1359827, 1339055, + 1315706, 1290011, 1262216, 1232570, 1201328, 1168739, 1135051, 1100497, 1065304, 1029682, 996086, 964208, + 932290, 900461, 868835, 837515, 806592, 776146, 746245, 716949, 696161, 676809, 657707, 638880, + 1332765, 1331200, 1326521, 1318782, 1308065, 1294488, 1278192, 1259345, 1238133, 1214758, 1189436, 1162385, + 1133832, 1103997, 1073100, 1041353, 1009480, 980376, 950912, 921230, 891460, 861723, 832126, 802768, + 773737, 745108, 716949, 696857, 678137, 659608, 641298, 623233, 1250792, 1249376, 1245141, 1238133, + 1228424, 1216114, 1201328, 1184209, 1164921, 1143640, 1120554, 1095858, 1069748, 1042424, 1014081, 988182, + 961693, 934687, 907303, 879672, 851914, 824141, 796456, 768952, 741711, 714808, 696161, 678137, + 660244, 642513, 624972, 607647, 1173861, 1172578, 1168739, 1162385, 1153579, 1142407, 1128976, 1113412, + 1095858, 1076467, 1055406, 1032846, 1009480, 987310, 964208, 940317, 915775, 890716, 865268, 839549, + 813673, 787744, 761857, 736100, 711387, 694081, 676809, 659608, 642513, 625554, 608759, 592154, + 1101662, 1100497, 1097015, 1091248, 1083251, 1073100, 1060888, 1046725, 1030735, 1013100, 996086, 977795, + 958355, 937898, 916551, 894445, 871705, 848451, 824802, 800867, 776750, 752548, 728350, 707081, + 690636, 674165, 657707, 641298, 624972, 608759, 592687, 576779, 1033903, 1032846, 1029682, 1024442, + 1017173, 1008579, 998743, 987310, 974370, 960022, 944370, 927525, 909602, 890716, 870986, 850526, + 829452, 807872, 785895, 763621, 741147, 718562, 701409, 685862, 670231, 654558, 638880, 623233, + 607647, 592154, 576779, 561546}, + {0, 0, 0, 0, 1591422, 1272360, 1017266, 
841460, 704536, 589892, 493904, 413535, 346244, + 290031, 243063, 203702, 170715, 143069, 119901, 102775, 95833, 89361, 83325, 77697, 72450, 67557, + 63969, 60623, 57452, 54447, 51599, 48900, 0, 0, 0, 0, 1548184, 1244480, 998601, + 830906, 696789, 584118, 489548, 410217, 343695, 288069, 241537, 202507, 169776, 142329, 119314, 102586, + 95666, 89212, 83193, 77580, 72344, 67462, 63903, 60563, 57397, 54397, 51553, 48858, 0, + 0, 0, 0, 1431876, 1167297, 948704, 800624, 674393, 567335, 476834, 400500, 336213, 282299, + 237036, 198980, 167000, 140136, 117577, 102023, 95167, 88769, 82798, 77227, 72030, 67211, 63706, + 60383, 57232, 54246, 51415, 48731, 0, 0, 0, 0, 1272360, 1056482, 886216, 754291, + 639652, 541032, 456753, 385057, 324262, 273042, 229792, 193287, 162509, 136582, 114755, 101097, 94345, + 88038, 82147, 76646, 71510, 66853, 63379, 60084, 56960, 53996, 51186, 48521, 1591422, 1548184, + 1431876, 1272360, 1098449, 935558, 810496, 696789, 595760, 507341, 430751, 364887, 308578, 260793, 220159, + 185686, 156491, 131804, 110952, 99824, 93215, 87032, 81250, 75844, 70792, 66357, 62926, 59671, + 56581, 53650, 50868, 48229, 1272360, 1244480, 1167297, 1056482, 935558, 830906, 728683, 633061, 546134, + 468641, 400500, 341174, 290031, 246158, 208579, 176499, 149185, 125980, 106300, 98229, 91796, 85767, + 80120, 74833, 69886, 65728, 62353, 59146, 56101, 53210, 50465, 47859, 1017266, 998601, 948704, + 886216, 810496, 728683, 646358, 567335, 493904, 427222, 367671, 315143, 269461, 229792, 195537, 166089, + 140862, 119314, 102964, 96338, 90110, 84261, 78772, 73626, 68803, 64974, 61663, 58515, 55523, + 52680, 49978, 47412, 841460, 830906, 800624, 754291, 696789, 633061, 567335, 502799, 441610, 385057, + 333772, 288069, 247726, 212344, 181524, 154827, 131804, 112021, 100548, 94183, 88184, 82537, 77227, + 72239, 67557, 64102, 60865, 57784, 54853, 52064, 49413, 46892, 704536, 696789, 674393, 639652, + 595760, 546134, 493904, 441610, 391125, 343695, 300130, 260793, 225593, 194408, 
167000, 143069, 122285, + 104314, 97880, 91796, 86045, 80619, 75505, 70690, 66427, 63120, 59966, 56960, 54096, 51369, + 48773, 46303, 589892, 584118, 567335, 541032, 507341, 468641, 427222, 385057, 343695, 304312, 267695, + 234101, 203702, 176499, 152375, 131139, 112561, 101097, 95002, 89212, 83724, 78531, 73626, 68998, + 65246, 62038, 58973, 56048, 53258, 50599, 48064, 45650, 493904, 489548, 476834, 456753, 430751, + 400500, 367671, 333772, 300130, 267695, 237036, 208579, 182553, 159033, 137989, 119314, 103728, 97707, + 91952, 86466, 81250, 76301, 71613, 67211, 63969, 60865, 57896, 55058, 52347, 49759, 47291, + 44937, 413535, 410217, 400500, 385057, 364887, 341174, 315143, 288069, 260793, 234101, 208579, 184633, + 162509, 142329, 124114, 107821, 99824, 94183, 88769, 83591, 78652, 73952, 69489, 65728, 62607, + 59612, 56743, 53996, 51369, 48858, 46459, 44170, 346244, 343695, 336213, 324262, 308578, 290031, + 269461, 247726, 225593, 203702, 182553, 162509, 143815, 126610, 110952, 101281, 95833, 90564, 85490, + 80619, 75958, 71510, 67283, 64168, 61170, 58289, 55523, 52871, 50331, 47900, 45574, 43352, + 290031, 288069, 282299, 273042, 260793, 246158, 229792, 212344, 194408, 176499, 159033, 142329, 126610, + 112021, 102023, 96848, 91796, 86890, 82147, 77580, 73195, 68998, 65521, 62543, 59671, 56905, + 54246, 51692, 49241, 46892, 44642, 42490, 243063, 241537, 237036, 229792, 220159, 208579, 195537, + 181524, 167000, 152375, 137989, 124114, 110952, 102023, 97190, 92422, 87749, 83193, 78772, 74501, + 70387, 66639, 63706, 60865, 58120, 55471, 52920, 50465, 48105, 45841, 43669, 41588, 203702, + 202507, 198980, 193287, 185686, 176499, 166089, 154827, 143069, 131139, 119314, 107821, 101281, 96848, + 92422, 88038, 83724, 79503, 75392, 71406, 67557, 64636, 61850, 59146, 56528, 53996, 51553, + 49198, 46932, 44753, 42660, 40652, 170715, 169776, 167000, 162509, 156491, 149185, 140862, 131804, + 122285, 112561, 103728, 99824, 95833, 91796, 87749, 83724, 79748, 75844, 72030, 68319, 
65315, + 62607, 59966, 57397, 54904, 52489, 50154, 47900, 45726, 43634, 41621, 39687, 143069, 142329, + 140136, 136582, 131804, 125980, 119314, 112021, 104314, 101097, 97707, 94183, 90564, 86890, 83193, + 79503, 75844, 72239, 68706, 65728, 63120, 60563, 58064, 55628, 53258, 50959, 48731, 46577, + 44496, 42490, 40557, 38698, 119901, 119314, 117577, 114755, 110952, 106300, 102964, 100548, 97880, + 95002, 91952, 88769, 85490, 82147, 78772, 75392, 72030, 68706, 65867, 63379, 60926, 58515, + 56154, 53847, 51599, 49413, 47291, 45235, 43247, 41327, 39474, 37689, 102775, 102586, 102023, + 101097, 99824, 98229, 96338, 94183, 91796, 89212, 86466, 83591, 80619, 77580, 74501, 71406, + 68319, 65728, 63379, 61048, 58744, 56474, 54246, 52064, 49934, 47859, 45841, 43882, 41985, + 40150, 38377, 36666, 95833, 95666, 95167, 94345, 93215, 91796, 90110, 88184, 86045, 83724, + 81250, 78652, 75958, 73195, 70387, 67557, 65315, 63120, 60926, 58744, 56581, 54447, 52347, + 50287, 48271, 46303, 44387, 42524, 40716, 38964, 37269, 35632, 89361, 89212, 88769, 88038, + 87032, 85767, 84261, 82537, 80619, 78531, 76301, 73952, 71510, 68998, 66639, 64636, 62607, + 60563, 58515, 56474, 54447, 52442, 50465, 48521, 46616, 44753, 42935, 41164, 39444, 37774, + 36157, 34592, 83325, 83193, 82798, 82147, 81250, 80120, 78772, 77227, 75505, 73626, 71613, + 69489, 67283, 65521, 63706, 61850, 59966, 58064, 56154, 54246, 52347, 50465, 48605, 46773, + 44974, 43212, 41490, 39810, 38174, 36585, 35043, 33722, 77697, 77580, 77227, 76646, 75844, + 74833, 73626, 72239, 70690, 68998, 67211, 65728, 64168, 62543, 60865, 59146, 57397, 55628, + 53847, 52064, 50287, 48521, 46773, 45049, 43352, 41687, 40057, 38464, 36911, 35400, 34021, + 32902, 72450, 72344, 72030, 71510, 70792, 69886, 68803, 67557, 66427, 65246, 63969, 62607, + 61170, 59671, 58120, 56528, 54904, 53258, 51599, 49934, 48271, 46616, 44974, 43352, 41753, + 40181, 38639, 37131, 35658, 34249, 33154, 32080, 67557, 67462, 67211, 66853, 66357, 65728, + 64974, 64102, 
63120, 62038, 60865, 59612, 58289, 56905, 55471, 53996, 52489, 50959, 49413, + 47859, 46303, 44753, 43212, 41687, 40181, 38698, 37242, 35814, 34419, 33335, 32287, 31258, + 63969, 63903, 63706, 63379, 62926, 62353, 61663, 60865, 59966, 58973, 57896, 56743, 55523, + 54246, 52920, 51553, 50154, 48731, 47291, 45841, 44387, 42935, 41490, 40057, 38639, 37242, + 35867, 34517, 33445, 32426, 31423, 30437, 60623, 60563, 60383, 60084, 59671, 59146, 58515, + 57784, 56960, 56048, 55058, 53996, 52871, 51692, 50465, 49198, 47900, 46577, 45235, 43882, + 42524, 41164, 39810, 38464, 37131, 35814, 34517, 33482, 32496, 31522, 30563, 29619, 57452, + 57397, 57232, 56960, 56581, 56101, 55523, 54853, 54096, 53258, 52347, 51369, 50331, 49241, + 48105, 46932, 45726, 44496, 43247, 41985, 40716, 39444, 38174, 36911, 35658, 34419, 33445, + 32496, 31556, 30627, 29710, 28807, 54447, 54397, 54246, 53996, 53650, 53210, 52680, 52064, + 51369, 50599, 49759, 48858, 47900, 46892, 45841, 44753, 43634, 42490, 41327, 40150, 38964, + 37774, 36585, 35400, 34249, 33335, 32426, 31522, 30627, 29741, 28865, 28002, 51599, 51553, + 51415, 51186, 50868, 50465, 49978, 49413, 48773, 48064, 47291, 46459, 45574, 44642, 43669, + 42660, 41621, 40557, 39474, 38377, 37269, 36157, 35043, 34021, 33154, 32287, 31423, 30563, + 29710, 28865, 28030, 27205, 48900, 48858, 48731, 48521, 48229, 47859, 47412, 46892, 46303, + 45650, 44937, 44170, 43352, 42490, 41588, 40652, 39687, 38698, 37689, 36666, 35632, 34592, + 33722, 32902, 32080, 31258, 30437, 29619, 28807, 28002, 27205, 26417}}; + +const uint8_t LUTCeilLog2Nonzero[1024] = { + 127, 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10}; + +//==========================================================// +// load data +//==========================================================// +void load_dct8_pixel(unsigned ysize, unsigned xsize, float* axi_opsin, hls::stream& opsin8x8_stream) { +#pragma HLS INLINE off + + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + +loop_load_dct8_pixel: + for (int i = 0; i < ysize64 * xsize64; i++) { + uint32_t addr = i * 4096 * 3; + for (int j = 0; j < 4096 * 3; j++) { +#pragma HLS PIPELINE II = 1 + float reg = axi_opsin[addr + j]; + opsin8x8_stream.write(reg); + } + } +} + +void load_dct16_pixel(unsigned ysize, unsigned xsize, float* 
axi_opsin, hls::stream& opsin16x16_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + +loop_load_dct16_pixel: + for (int i = 0; i < ysize64 * xsize64; i++) { + uint32_t addr = i * 4096 * 3; + for (int j = 0; j < 4096 * 3; j++) { +#pragma HLS PIPELINE II = 1 + float reg = axi_opsin[addr + j]; + opsin16x16_stream.write(reg); + } + } +} + +void load_dct32_pixel(unsigned ysize, unsigned xsize, float* axi_opsin, hls::stream& opsin32x32_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + +loop_load_dct32_pixel: + for (int i = 0; i < ysize64 * xsize64; i++) { + uint32_t addr = i * 4096 * 3; + for (int j = 0; j < 4096 * 3; j++) { +#pragma HLS PIPELINE II = 1 + float reg = axi_opsin[addr + j]; + opsin32x32_stream.write(reg); + } + } +} + +void loadPixel(unsigned ysize, + unsigned xsize, + float* axi_opsin_1, + float* axi_opsin_2, + float* axi_opsin_3, + hls::stream& opsin8x8_stream, + hls::stream& opsin16x16_stream, + hls::stream& opsin32x32_stream) { +#pragma HLS INLINE + load_dct8_pixel(ysize, xsize, axi_opsin_1, opsin8x8_stream); + load_dct16_pixel(ysize, xsize, axi_opsin_2, opsin16x16_stream); + load_dct32_pixel(ysize, xsize, axi_opsin_3, opsin32x32_stream); +} + +void load_rqf_mask(int xsize, + int ysize, + float* aq_map_f, + float* masking_field_row, + float* quant_field_row, + int stride, + hls::stream& stream_q, + hls::stream& stream_mask, + hls::stream& stream_rqf) { +#pragma HLS INLINE off + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = (xsize_blocks + 7) / 8; + uint32_t xnum_tile = (xsize_blocks + 7) / 8; + uint32_t ynum_tile = (ysize_blocks + 7) / 8; +LOOP_0: + for (int tid = 0; tid < xnum_tile * ynum_tile; tid++) { + int tx1 = tid % 
n_enc_tiles; + int ty1 = tid / n_enc_tiles; + int by = ty1 * 8; + int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + int bx = tx1 * 8; + int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + int rect_ysize = by1 - by; + int rect_xsize = bx1 - bx; + LOOP_1: + for (int iy = 0; iy < rect_ysize; iy++) { + LOOP_2: + for (int ix = 0; ix < rect_xsize; ix++) { +#pragma HLS PIPELINE II = 1 + int x = 8 * (bx + ix); + int y = 8 * (by + iy); + int index0 = (y / 8 * stride) + x / 8; + float quant_norm8 = 0; + float masking = 0; + quant_norm8 = quant_field_row[index0]; + stream_q.write(quant_norm8); + masking = masking_field_row[index0]; + stream_mask.write(masking); + int index = (by + iy) * xsize_blocks + (bx + ix); + float rqf_tmp = aq_map_f[index]; + stream_rqf.write(rqf_tmp); + } + } + } +} + +//==========================================================================// +// data write out +//==========================================================================// +void ac_coeff_writeout(int xsize, int ysize, hls::stream& ac_coef_strm, int* ac_coef_axiout) { + unsigned xsizeblock = (xsize + 7) / 8; + unsigned ysizeblock = (ysize + 7) / 8; + for (int i = 0; i < xsizeblock * ysizeblock * 3 * 64; i++) { + ac_coef_axiout[i] = ac_coef_strm.read(); + } +} + +void dc_8x8_writeout(unsigned ysize, + unsigned xsize, + float* hls_dc8x8, + hls::stream& stream_rectx_dc0, + hls::stream& stream_recty_dc0, + hls::stream& dc_coef8x8_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + uint32_t ysize32 = tile_ysize / 32; + uint32_t xsize32 = tile_xsize / 32; + uint32_t ysize16 = tile_ysize / 16; + uint32_t xsize16 = tile_xsize / 16; + uint32_t ysize8 = tile_ysize / 8; + uint32_t xsize8 = tile_xsize / 8; + + // dc writeout + int N = 1; + int block_n = N * N; + int block_half_n = N * 8; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize 
/ 8; + int n_enc_tiles = (xsize_blocks + 7) / 8; +loop_dc8_writeout: + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + int rect_ysize = stream_recty_dc0.read(); + int rect_xsize = stream_rectx_dc0.read(); + for (uint32_t y8 = 0; y8 < 8; y8++) { + for (uint32_t x8 = 0; x8 < 8; x8++) { + for (int c = 0; c < 3; c++) { +#pragma HLS PIPELINE II = 1 + int c_tmp = 0; + if (c == 0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + + size_t addr = y64 * xsize8 * 8 + x64 * 8 + y8 * xsize8 + x8; + + if (x8 < rect_xsize && y8 < rect_ysize) { + float reg = dc_coef8x8_stream.read(); + hls_dc8x8[c_tmp * tile_ysize * tile_xsize + addr] = reg; + } + } + } + } + } + } +} + +void dc_16x16_writeout(unsigned ysize, + unsigned xsize, + float* hls_dc16x16, + hls::stream& stream_rectx_dc1, + hls::stream& stream_recty_dc1, + hls::stream& dc_coef16x16_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + size_t ysize64 = tile_ysize / 64; + size_t xsize64 = tile_xsize / 64; + size_t ysize32 = tile_ysize / 32; + size_t xsize32 = tile_xsize / 32; + size_t ysize16 = tile_ysize / 16; + size_t xsize16 = tile_xsize / 16; + size_t ysize8 = tile_ysize / 8; + size_t xsize8 = tile_xsize / 8; + int N = 2; + int block_n = N * N; + int block_half_n = N * 8; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = (xsize_blocks + 7) / 8; + for (size_t y64 = 0; y64 < ysize64; y64++) { + for (size_t x64 = 0; x64 < xsize64; x64++) { + int rect_ysize = stream_recty_dc1.read(); + int rect_xsize = stream_rectx_dc1.read(); + for (size_t y16 = 0; y16 < 4; y16++) { + for (size_t x16 = 0; x16 < 4; x16++) { + for (int c = 0; c < 3; c++) { + for (size_t m = 0; m < 2; m++) { + for (size_t n = 0; n < 2; n++) { +#pragma HLS PIPELINE II = 1 + // edge judgement + // int tx1 = x64; // tid % n_enc_tiles; + // int ty1 = y64; // tid / n_enc_tiles; + // 
int by = ty1 * 8; + // int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + // int bx = tx1 * 8; + // int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + // int rect_ysize = by1 - by; + // int rect_xsize = bx1 - bx; + // int tile_xsize = (xsize + 63) / 64 * 64; + // int tile_ysize = (ysize + 63) / 64 * 64; + + int c_tmp = 0; + if (c == 0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + + size_t addr = + y64 * xsize16 * 4 * 4 + x64 * 4 * 4 + y16 * xsize16 * 4 + x16 * 4 + m * 2 + n; + + if ((2 * x16 + 1) < rect_xsize && (2 * y16 + 1) < rect_ysize) { + float reg = dc_coef16x16_stream.read(); + hls_dc16x16[c_tmp * tile_ysize * tile_xsize + addr] = reg; + } + } + } + } + } + } + } + } +} + +void dc_32x32_writeout(unsigned ysize, + unsigned xsize, + float* hls_dc32x32, + hls::stream& stream_rectx_dc2, + hls::stream& stream_recty_dc2, + hls::stream& dc_coef32x32_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + size_t ysize64 = tile_ysize / 64; + size_t xsize64 = tile_xsize / 64; + size_t ysize32 = tile_ysize / 32; + size_t xsize32 = tile_xsize / 32; + size_t ysize16 = tile_ysize / 16; + size_t xsize16 = tile_xsize / 16; + size_t ysize8 = tile_ysize / 8; + size_t xsize8 = tile_xsize / 8; + int N = 4; + int block_n = N * N; + int block_half_n = N * 8; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = (xsize_blocks + 7) / 8; + for (size_t y64 = 0; y64 < ysize64; y64++) { + for (size_t x64 = 0; x64 < xsize64; x64++) { + int rect_ysize = stream_recty_dc2.read(); + int rect_xsize = stream_rectx_dc2.read(); + for (size_t y32 = 0; y32 < 2; y32++) { + for (size_t x32 = 0; x32 < 2; x32++) { + for (int c = 0; c < 3; c++) { + for (size_t m = 0; m < 4; m++) { + for (size_t n = 0; n < 4; n++) { +#pragma HLS PIPELINE II = 1 + // edge judgement + // int tx1 = x64; // tid % n_enc_tiles; + // int ty1 = y64; // tid / n_enc_tiles; + // int by = ty1 * 
8; + // int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + // int bx = tx1 * 8; + // int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + // int rect_ysize = by1 - by; + // int rect_xsize = bx1 - bx; + // int tile_xsize = (xsize + 63) / 64 * 64; + // int tile_ysize = (ysize + 63) / 64 * 64; + + int c_tmp = 0; + if (c == 0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + + size_t addr = y64 * xsize32 * 2 * 16 + x64 * 2 * 16 + y32 * xsize32 * 1 * 16 + + x32 * 1 * 16 + m * 4 + n; + + if ((4 * x32 + 3) < rect_xsize && (4 * y32 + 3) < rect_ysize) { + float reg = dc_coef32x32_stream.read(); + hls_dc32x32[c_tmp * tile_ysize * tile_xsize + addr] = reg; + } + } + } + } + } + } + } + } +} + +void GetDCSize(short xsize, + short ysize, + hls::stream& stream_rectx_dc, + hls::stream& stream_recty_dc, + hls::stream& stream_rectx0, + hls::stream& stream_recty0, + hls::stream& stream_rectx1, + hls::stream& stream_recty1, + hls::stream& stream_rectx2, + hls::stream& stream_recty2) { + uint16_t xsize_blocks = xsize / 8; + uint16_t ysize_blocks = ysize / 8; +LOOP_0: + for (uint16_t y = 0; y < (ysize_blocks + 8 - 1) / 8; y++) { + LOOP_1: + for (uint16_t x = 0; x < (xsize_blocks + 8 - 1) / 8; x++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty_dc.read(); + uint8_t rect_xsize = stream_rectx_dc.read(); + stream_rectx0.write(rect_xsize); + stream_recty0.write(rect_ysize); + stream_rectx1.write(rect_xsize); + stream_recty1.write(rect_ysize); + stream_rectx2.write(rect_xsize); + stream_recty2.write(rect_ysize); + } + } +} +void dc_writeout(unsigned ysize, + unsigned xsize, + float* hls_dc8x8, + float* hls_dc16x16, + float* hls_dc32x32, + + hls::stream& stream_rectx_dc, + hls::stream& stream_recty_dc, + hls::stream& dc_coef8x8_stream, + hls::stream& dc_coef16x16_stream, + hls::stream& dc_coef32x32_stream) { +#pragma HLS INLINE + hls::stream stream_rectx_dc0; + hls::stream stream_recty_dc0; + hls::stream 
stream_rectx_dc1; + hls::stream stream_recty_dc1; + hls::stream stream_rectx_dc2; + hls::stream stream_recty_dc2; + GetDCSize(xsize, ysize, stream_rectx_dc, stream_recty_dc, stream_rectx_dc0, stream_recty_dc0, stream_rectx_dc1, + stream_recty_dc1, stream_rectx_dc2, stream_recty_dc2); + dc_8x8_writeout(ysize, xsize, hls_dc8x8, stream_rectx_dc0, stream_recty_dc0, dc_coef8x8_stream); + dc_16x16_writeout(ysize, xsize, hls_dc16x16, stream_rectx_dc1, stream_recty_dc1, dc_coef16x16_stream); + dc_32x32_writeout(ysize, xsize, hls_dc32x32, stream_rectx_dc2, stream_recty_dc2, dc_coef32x32_stream); +} + +void cfl_writeout(unsigned xsize, + unsigned ysize, + hls::stream& cmapx_strm, + hls::stream& cmapb_strm, + int8_t* cmap_axi) { +#pragma HLS INLINE off + + int xnum_tile = (xsize + 63) / 64; + int ynum_tile = (ysize + 63) / 64; + int num_tile = xnum_tile * ynum_tile; + + for (int tid = 0; tid < num_tile; tid++) { +#pragma HLS PIPELINE II = 2 + cmap_axi[tid] = cmapx_strm.read(); + cmap_axi[num_tile + tid] = cmapb_strm.read(); + } +} + +void acs_rqf_writeout(int xsize, + int ysize, + unsigned char* strategy_all, + int* raw_quant_field_i, + hls::stream& stream_strategy, + hls::stream& stream_rqf) { +#pragma HLS INLINE off + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + uint32_t xnum_tile = (xsize_blocks + 7) / 8; + uint32_t ynum_tile = (ysize_blocks + 7) / 8; + ap_uint<64> visited; +LOOP_1: + for (uint8_t ty1 = 0; ty1 < ynum_tile; ty1++) { + LOOP_2: + for (uint8_t tx1 = 0; tx1 < xnum_tile; tx1++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + int by0 = ty1 * 8; + int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + int bx0 = tx1 * 8; + int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + int rect_ysize = by1 - by0; + int rect_xsize = bx1 - bx0; + visited = 0; + LOOP_3: + for (uint8_t y = 0; y < rect_ysize; ++y) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_4: + for (uint8_t x = 0; x < rect_xsize; ++x) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + 
uint8_t idx = y * 8 + x; + if (visited.range(idx, idx) == 0) { + char strategy = stream_strategy.read(); + int rqf = stream_rqf.read(); + int b = strategy_block[strategy]; + LOOP_5: + for (uint8_t iy = 0; iy < b; iy++) { + LOOP_6: + for (uint8_t ix = 0; ix < b; ix++) { +#pragma HLS pipeline + uint16_t idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + uint16_t idxout = (y + by0 + iy) * xsize_blocks + (x + bx0 + ix); + strategy_all[(y + by0 + iy) * xsize_blocks + (x + bx0 + ix)] = strategy; + raw_quant_field_i[(y + by0 + iy) * xsize_blocks + (x + bx0 + ix)] = rqf; + } + } + } + } + } + } + } +} + +//=========================================================// +// module +//=========================================================// +// cfl ----------------------------------------------------- +void hls_CFLComputeTile(unsigned xsize, + unsigned ysize, + hls::stream& ac_coef_strm, + hls::stream& rqf_in_stream, + hls::stream& acs_strm, + hls::stream& cmapx_strm, + hls::stream& cmapb_strm, + hls::stream& cmapx_axi_strm, + hls::stream& cmapb_axi_strm, + hls::stream& ac_coef_cflout_strm, + hls::stream& rqf_out_stream, + hls::stream& acs_cflout_strm) { +#pragma HLS INLINE off + const uint8_t kDefaultColorFactor = 84U; + const float kInvColorFactor = 1.0f / kDefaultColorFactor; + const float kYToBRatio = 1.0f; + const float kDistanceMultiplierAC = 1e-3f; + + unsigned xsize_alg = (xsize + 7) / 8 * 8; + unsigned ysize_alg = (ysize + 7) / 8 * 8; + int xnum_tile = (xsize + 63) / 64; + int ynum_tile = (ysize + 63) / 64; + int num_tile = xnum_tile * ynum_tile; + unsigned tx0 = 0; + unsigned ty0 = 0; + + for (int tid = 0; tid < num_tile; tid++) { + ca_x_t ca_x = 0; + cb_x_t cb_x = 0; + ca_b_t ca_b = 0; + cb_b_t cb_b = 0; + unsigned xsize; + unsigned ysize; + + if (ty0 + 64 > ysize_alg) { + ysize = ysize_alg - ty0; + } else { + ysize = 64; + } + + if (tx0 + 64 > xsize_alg) { + xsize = xsize_alg - tx0; + } else { + xsize = 64; + } + + unsigned total_pix = xsize * ysize; 
+ unsigned cur_pix = 0; + + while (cur_pix < total_pix) { + uint8_t acsRaw = acs_strm.read(); + acs_cflout_strm.write(acsRaw); + rqf_out_stream.write(rqf_in_stream.read()); + + float q = 27.996826171875; + float q_dc_x = 0.000218007407966069877147674560546875; + float q_dc_b = 0.00348811852745711803436279296875; + + unsigned csize; + if (acsRaw == 0) { + csize = 64; + } else if (acsRaw == 4) { + csize = 256; + } else if (acsRaw == 5) { + csize = 1024; + } + + int error_flag = 0; + + for (unsigned i = 0; i < csize; i++) { +#pragma HLS PIPELINE II = 3 + float b_y = ac_coef_strm.read(); + float b_x = ac_coef_strm.read(); + float b_b = ac_coef_strm.read(); + + ac_coef_cflout_strm.write(b_y); + ac_coef_cflout_strm.write(b_x); + ac_coef_cflout_strm.write(b_b); + + float qm_x; + float qm_b; + + if (acsRaw == 0) { + qm_x = qmx8x8[i]; + qm_b = qmb8x8[i]; + } else if (acsRaw == 4) { + qm_x = qmx16x16[i]; + qm_b = qmb16x16[i]; + } else if (acsRaw == 5) { + qm_x = qmx32x32[i]; + qm_b = qmb32x32[i]; + } + + float qqm_x = q * qm_x; + float qqm_b = q * qm_b; + + float coeffs_yx = b_y * qqm_x; + float coeffs_x = b_x * qqm_x; + float a = kInvColorFactor * coeffs_yx; + float b = 0.0f * coeffs_yx - coeffs_x; + ca_x = (ca_x_t)(a * a) + ca_x; + cb_x = (cb_x_t)(a * b) + cb_x; + + float coeffs_yb = b_y * qqm_b; + float coeffs_b = b_b * qqm_b; + + a = kInvColorFactor * coeffs_yb; + b = kYToBRatio * coeffs_yb - coeffs_b; + ca_b = (ca_b_t)(a * a) + ca_b; + cb_b = (cb_b_t)(a * b) + cb_b; + + cur_pix++; + } + } + + float x; + x = -(float)cb_x / ((float)ca_x + total_pix * kDistanceMultiplierAC * 0.5f); + int8_t cmap_x_reg = hls::max(-128.0f, hls::min(127.0f, hls::roundf(x))); + cmapx_strm.write(cmap_x_reg); + cmapx_axi_strm.write(cmap_x_reg); + + x = -(float)cb_b / ((float)ca_b + total_pix * kDistanceMultiplierAC * 0.5f); + int8_t cmap_b_reg = hls::max(-128.0f, hls::min(127.0f, hls::roundf(x))); + cmapb_strm.write(cmap_b_reg); + cmapb_axi_strm.write(cmap_b_reg); + + // printf("cmap_x:%d, 
cmap_b:%d\n", (int32_t)cmap_x_reg, (int32_t)cmap_b_reg); + + if (tx0 + 64 >= xsize_alg) { + tx0 = 0; + ty0 = ty0 + 64; + } else { + tx0 = tx0 + 64; + } + } +} +//--------------------------hls_compute_coefficients--------------------------// +float adjustQuantBias(size_t c, int32_t quant_i, const float* biases) { + int32_t min = INT32_MIN; + cast mi, ani, anno; + mi.i = min; + int32_t and_result = quant_i & mi.i; + ani.i = and_result; + float sign = ani.f; + // int32_t and_no_result = (~mi.i) & quant_i; + // anno.i = and_no_result; + float abs_quant = hls::abs(quant_i); + // printf("%f %f\n", sign, abs_quant); + bool is_01 = abs_quant < 1.125f; + bool not_0 = abs_quant > 0; + cast bi, si; + bi.f = biases[c]; + int32_t iTmp = bi.i ^ ani.i; + si.i = iTmp; + float one_bias = not_0 ? (si.f) : 0; + float tmp = quant_i ? (1.0 / quant_i) : 0.0f; + float bias = quant_i - biases[3] * tmp; + return is_01 ? one_bias : bias; +} + +void hls_ComputeCoefficients(uint32_t xsize, + uint32_t ysize, + hls::stream& acsStrm, + hls::stream& dctStrm, + hls::stream& quantFieldStrm, + hls::stream& ytoxMapStrm, + hls::stream& ytobMapStrm, + hls::stream& acs_coeff_stream1, + hls::stream& coeffOutStrm, + hls::stream& coeff_axi_stream, + hls::stream& acs_axi_strm, + hls::stream& qf_axi_strm) { +#pragma HLS INLINE off + uint8_t acs; + uint8_t xblocks, yblocks; + int8_t ytox_map, ytob_map; + float x_factor, b_factor; + float qm_multiplier = 1.0f; + bool stop(false); + float coef_dct[3]; +#pragma HLS ARRAY_PARTITION variable = coef_dct complete dim = 1 + ap_uint<32> offset; + int block_out; + float thr_x, thr_y, thr_b, out_x, out_b; + + float thresy[4] = {0.5f, 0.6f, 0.6f, 0.65f}; + float thresxb[4] = {0.5f, 0.75f, 0.75f, 0.75f}; +#pragma HLS ARRAY_PARTITION variable = thresy complete dim = 1 +#pragma HLS ARRAY_PARTITION variable = thresxb complete dim = 1 + + uint32_t xsize_blocks = (xsize + 7) / 8; + uint32_t ysize_blocks = (ysize + 7) / 8; + uint16_t xsize_tails = DivCeil(xsize_blocks, 
kEncTileDimInBlocks); + uint16_t ysize_tails = DivCeil(ysize_blocks, kEncTileDimInBlocks); + uint16_t xsize_left = 8 - (xsize_tails * 8 - xsize_blocks); // not aligned for blocks + uint16_t ysize_left = 8 - (ysize_tails * 8 - ysize_blocks); // not aligned for blocks + uint16_t num_blocks; + + for (uint16_t ty = 0; ty < ysize_tails; ++ty) { + for (uint16_t tx = 0; tx < xsize_tails; ++tx) { + ytoxMapStrm.read(ytox_map); + ytobMapStrm.read(ytob_map); + x_factor = base_correlation_x + ytox_map * color_scale; + b_factor = base_correlation_b + ytob_map * color_scale; + if (tx == (xsize_tails - 1) && ty != (ysize_tails - 1)) { + num_blocks = xsize_left * 8; + } else if (tx != (xsize_tails - 1) && ty == (ysize_tails - 1)) { + num_blocks = ysize_left * 8; + } else if (tx == (xsize_tails - 1) && ty == (ysize_tails - 1)) { + num_blocks = xsize_left * ysize_left; + } else { + num_blocks = 64; + } + + uint32_t total = num_blocks * 64; + uint32_t cur = 0; + ap_uint<16> size = 0, count = 0; + ap_uint<16> y, x; + int quant; + float qac, fquant, inv_qac; + + while (cur < total) { +#pragma HLS PIPELINE II = 3 + if (count == 0) { + acsStrm.read(acs); + acs_axi_strm.write(acs); + acs_coeff_stream1.write(acs); + if (acs == Type::DCT) { + xblocks = 1; + yblocks = 1; + } else if (acs == Type::DCT16X16) { + xblocks = 2; + yblocks = 2; + } else { + xblocks = 4; + yblocks = 4; + } + + size = kDCTBlockSize * xblocks * yblocks; + quant = quantFieldStrm.read(); + qf_axi_strm.write(quant); + qac = global_scale_float * quant; + fquant = qac * qm_multiplier; // fquant_table[quant - 1]; + inv_qac = inv_global_scale / quant; // inv_qac_table[quant - 1]; + } + y = count / (yblocks * kBlockDim); + x = count % (xblocks * kBlockDim); + ap_uint<32> off; + ap_uint<32> yfix; + if (x == 0) { + off = y * kBlockDim * xblocks; + ap_uint<32> yhalf = yblocks * 4; // ysize * kBlockDim / 2 + if (y >= yhalf) + yfix = 2; + else + yfix = 0; + } + + thr_x = 0; + thr_y = 0; + thr_b = 0; + if (xblocks == 1) { + if (x 
>= 4) { + thr_x = thresxb[yfix + 1]; //(c == 1) ? thresy[yfix + 1] : thresxb[yfix + 1]; + thr_y = thresy[yfix + 1]; + thr_b = thresxb[yfix + 1]; + } else { + thr_x = thresxb[yfix]; //(c == 1) ? thresy[yfix + 1] : + // thresxb[yfix + 1]; + thr_y = thresy[yfix]; + thr_b = thresxb[yfix]; + } + } else { + ap_uint<32> xhalf = xblocks * 4; // xsize * kBlockDim / 2 + ap_uint<32> xfix; + if (x < xhalf) + xfix = 0; + else + xfix = 1; + thr_x = thresxb[yfix + xfix]; + thr_y = thresy[yfix + xfix]; + thr_b = thresxb[yfix + xfix]; // thr = (c == 1) ? thresy[yfix + + // xfix] : thresxb[yfix + xfix]; + } + + float q_x; + float q_y; + float q_b; + if (acs == Type::DCT) { + q_x = inv_dequant_stable[0 + off + x] * fquant; + q_y = inv_dequant_stable[64 + off + x] * fquant; + q_b = inv_dequant_stable[128 + off + x] * fquant; + } else if (acs == Type::DCT16X16) { + q_x = inv_dequant_stable[768 + off + x] * fquant; + q_y = inv_dequant_stable[1024 + off + x] * fquant; + q_b = inv_dequant_stable[1280 + off + x] * fquant; + } else if (acs == Type::DCT32X32) { + q_x = inv_dequant_stable[1536 + off + x] * fquant; + q_y = inv_dequant_stable[2560 + off + x] * fquant; + q_b = inv_dequant_stable[3584 + off + x] * fquant; + } + + coef_dct[1] = dctStrm.read(); + coef_dct[0] = dctStrm.read(); + coef_dct[2] = dctStrm.read(); + + float val_y; + val_y = q_y * coef_dct[1]; + + bool nzero_mask_y = hls::abs(val_y) >= thr_y; + + int32_t v_y; + if (nzero_mask_y) { + v_y = hls::roundf(val_y); + } else { + v_y = 0; + } + + float adj_quant = adjustQuantBias(1, v_y, kDefaultQuantBias); + float dequantm; + if (acs == Type::DCT) { + dequantm = dequant_table[64 + off + x]; + } else if (acs == Type::DCT16X16) { + dequantm = dequant_table[1024 + off + x]; + } else if (acs == Type::DCT32X32) { + dequantm = dequant_table[2560 + off + x]; + } + coef_dct[1] = adj_quant * dequantm * inv_qac; + + out_x = coef_dct[0] - x_factor * coef_dct[1]; + coef_dct[0] = out_x; + + out_b = coef_dct[2] - b_factor * coef_dct[1]; + 
coef_dct[2] = out_b; + + float val_x; //= q * coef_dct[c]; // block_in[off + x] + float val_b; + val_x = q_x * coef_dct[0]; + val_b = q_b * coef_dct[2]; + + bool nzero_mask_x = hls::abs(val_x) >= thr_x; + + bool nzero_mask_b = hls::abs(val_b) >= thr_b; + + int32_t v_x; + + int32_t v_b; + if (nzero_mask_x) { + v_x = hls::roundf(val_x); + } else { + v_x = 0; + } + + if (nzero_mask_b) { + v_b = hls::roundf(val_b); + } else { + v_b = 0; + } + + coeffOutStrm.write(v_y); + coeffOutStrm.write(v_x); + coeffOutStrm.write(v_b); + + coeff_axi_stream.write(v_y); + coeff_axi_stream.write(v_x); + coeff_axi_stream.write(v_b); + cur++; + count++; + if (count == size) count = 0; + } // while + } // tx + } // ty +} + +//--------------------- Compute ALL orders---------------------// +template // opt1:256(slow), opt2:8(fast) +void hls_sort(int size, + hls::stream& count_instrm, + hls::stream& pos_instrm, + hls::stream& pos_outstrm) { + unsigned count_shift[RANGE]; + unsigned pos_shift[RANGE]; + ap_uint cmp = 0; + + for (int i = 0; i < RANGE; i++) { +#pragma HLS UNROLL + count_shift[i] = 0; + } + + for (int i = 0; i < size + RANGE; i++) { +#pragma HLS PIPELINE II = 1 + unsigned count_reg; + unsigned pos_reg; + if (i < size) { + pos_reg = pos_instrm.read(); + count_reg = count_instrm.read(); + } else { + count_reg = -1; + pos_reg = -1; + } + + for (int i = 0; i < RANGE; i++) { +#pragma HLS UNROLL + cmp[i] = count_reg >= count_shift[i]; + } + + if (i >= RANGE) { + pos_outstrm.write(pos_shift[0]); + } + + for (int i = 1; i < RANGE; i++) { +#pragma HLS UNROLL + if (cmp[i] == 1) { + count_shift[i - 1] = count_shift[i]; + pos_shift[i - 1] = pos_shift[i]; + } + } + + unsigned insert_pos; + ap_uint cmp_br = ~cmp; + cmp_br.reverse(); + if (cmp_br == 0) { + insert_pos = RANGE - 1; + } else if (cmp == 0) { + insert_pos = 0; + } else { + insert_pos = cmp_br.countLeadingZeros() - 1; + } + + count_shift[insert_pos] = count_reg; + pos_shift[insert_pos] = pos_reg; + } +} + +void 
hls_sort_top(hls::stream& count_instrm, + hls::stream& pos_instrm, + hls::stream& pos_outstrm) { + unsigned sz; + for (uint8_t o = 0; o < 2; ++o) { + if (o == 0) { + sz = 64; + } else { + sz = 256; + } + + for (uint8_t c = 0; c < 3; c++) { +#ifndef __SYNTHESIS__ + hls_sort<8>(sz, count_instrm, pos_instrm, pos_outstrm); +#else + hls_sort<8>(sz, count_instrm, pos_instrm, pos_outstrm); +#endif + } + } +} + +void init_numzeros(int32_t num_zeros[3][320]) { + for (int i = 0; i < 320; i++) { + for (int c = 0; c < 3; c++) { +#pragma HLS PIPELINE II = 1 + num_zeros[c][i] = 0; + } + } +} + +void count_numzeros(unsigned xsize, + unsigned ysize, + hls::stream& ac_strategy_strm, + hls::stream& ac_coef_quant_strm, + hls::stream >& used_orders_strm, + int32_t num_zeros[3][320]) { +#pragma HLS INLINE off + unsigned xsize_alg = (xsize + 7) / 8 * 8; + unsigned ysize_alg = (ysize + 7) / 8 * 8; + unsigned total_pix = xsize_alg * ysize_alg; + unsigned cur_pix = 0; + + const int32_t offset8x8 = 0; + // const int32_t offsetIDT = 64; + const int32_t offset16x16 = 64; + + ap_uint<3> used_orders_ap = 0; + + while (cur_pix < total_pix) { + uint8_t acsRaw = ac_strategy_strm.read(); + unsigned size; + if (acsRaw == 0) { + size = 64; + used_orders_ap[0] = 1; + } else if (acsRaw != 0 && acsRaw < 4) { + used_orders_ap[1] = 1; + size = 64; + } else if (acsRaw == 4) { + used_orders_ap[2] = 1; + size = 256; + } else if (acsRaw == 5) { + size = 1024; + } + cur_pix = cur_pix + size; + + for (unsigned k = 0; k < size; k++) { + for (int c = 0; c < 3; c++) { +#pragma HLS PIPELINE II = 1 +#pragma HLS DEPENDENCE variable = num_zeros type = inter false + bool is_zerox = ac_coef_quant_strm.read() == 0; + if (is_zerox) { + if (acsRaw == 0) { + num_zeros[c][offset8x8 + k]++; + } else if (acsRaw == 4) { + num_zeros[c][offset16x16 + k]++; + } + } + } + } + } + + used_orders_strm.write(used_orders_ap); + num_zeros[0][offset8x8] = -1; + num_zeros[0][offset16x16 + 0] = -1; + num_zeros[0][offset16x16 + 1] = -1; + 
num_zeros[0][offset16x16 + 16] = -1; + num_zeros[0][offset16x16 + 17] = -1; + num_zeros[1][offset8x8] = -1; + num_zeros[1][offset16x16 + 0] = -1; + num_zeros[1][offset16x16 + 1] = -1; + num_zeros[1][offset16x16 + 16] = -1; + num_zeros[1][offset16x16 + 17] = -1; + num_zeros[2][offset8x8] = -1; + num_zeros[2][offset16x16 + 0] = -1; + num_zeros[2][offset16x16 + 1] = -1; + num_zeros[2][offset16x16 + 16] = -1; + num_zeros[2][offset16x16 + 17] = -1; +} + +void load_nz2strm(int32_t num_zeros[3][320], hls::stream& count_strm, hls::stream& pos_strm) { + const int32_t offset8x8 = 0; + // const int32_t offsetIDT = 64; + const int32_t offset16x16 = 64; + + const uint32_t coef8x8_zigzag[64] = {0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63}; + const uint32_t coef16x16_zigzag[256] = { + 0, 1, 16, 17, 32, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5, 20, 35, 50, 65, 80, 96, + 81, 66, 51, 36, 21, 6, 7, 22, 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, + 8, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40, 25, 10, + 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, + 42, 27, 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, + 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, + 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, 31, 47, 62, 77, + 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, + 78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, 170, 155, 140, + 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, + 
143, 158, 173, 188, 203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, 250, 251, + 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255}; + + unsigned sz; + + for (uint8_t o = 0; o < 2; ++o) { + float inv_sqrt_sz; + + if (o == 0) { + sz = 64; + inv_sqrt_sz = 1.0f / 8.0f; + } else { + sz = 256; + inv_sqrt_sz = 1.0f / 16.0f; + } + + for (uint8_t c = 0; c < 3; c++) { + for (unsigned i = 0; i < sz; ++i) { +#pragma HLS PIPELINE II = 1 + unsigned pos; + if (o == 0) { + pos = coef8x8_zigzag[i]; + } else { + pos = coef16x16_zigzag[i]; + } + + // We don't care for the exact number -> quantize number of zeros, + // to get less permuted order. + if (o == 0) { + pos_strm.write(pos); + count_strm.write(num_zeros[c][offset8x8 + pos] * inv_sqrt_sz + 0.1f); + } else { + pos_strm.write(pos); + count_strm.write(num_zeros[c][offset16x16 + pos] * inv_sqrt_sz + 0.1f); + } + } + } + } +} + +void order_writeout(hls::stream >& used_orders_strm, + hls::stream& pos_strm, + uint32_t hls_order[320 * 3 + 1] // AXI port + ) { + const int32_t offset8x8 = 0; + // const int32_t offsetIDT = 64; + const int32_t offset16x16 = 64; + + const uint32_t coef8x8_zigzag[64] = {0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63}; + const uint32_t coef16x16_zigzag[256] = { + 0, 1, 16, 17, 32, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5, 20, 35, 50, 65, 80, 96, + 81, 66, 51, 36, 21, 6, 7, 22, 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, + 8, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40, 25, 10, + 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, + 42, 27, 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, + 149, 134, 119, 104, 89, 
74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, + 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, 31, 47, 62, 77, + 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, + 78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, 170, 155, 140, + 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, + 143, 158, 173, 188, 203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, 250, 251, + 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255}; + + unsigned sz; + ap_uint<3> used_orders_ap = used_orders_strm.read(); + for (uint8_t o = 0; o < 2; ++o) { + if (o == 0) { + sz = 64; + } else { + sz = 256; + } + + bool is_nondefault = false; + for (uint8_t c = 0; c < 3; c++) { + for (unsigned i = 0; i < sz; ++i) { +#pragma HLS PIPELINE II = 1 + unsigned pos_reg = pos_strm.read(); + if (o == 0) { + hls_order[c * 320 + offset8x8 + i] = pos_reg; + } else { + hls_order[c * 320 + offset16x16 + i] = pos_reg; + } + if (o == 0) { + is_nondefault |= coef8x8_zigzag[i] != pos_reg; + } else { + is_nondefault |= coef16x16_zigzag[i] != pos_reg; + } + } + } + if (!is_nondefault) { + if (o == 0) + used_orders_ap[0] = 0; + else + used_orders_ap[2] = 0; + } + } + hls_order[320 * 3] = used_orders_ap; +} + +void order_finalize_dataflow(hls::stream >& used_orders_strm, + int32_t num_zeros[3][320], + uint32_t hls_order[320 * 3 + 1]) { +// #pragma HLS INTERFACE mode = m_axi bundle = mm1 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ +// hls_opsin_1 +// #pragma HLS INTERFACE mode = m_axi bundle = mm2 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 
depth = ALL_PIXEL port = \ +// hls_opsin_2 +// #pragma HLS INTERFACE mode = m_axi bundle = mm3 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ +// hls_opsin_3 +// #pragma HLS INTERFACE mode = m_axi bundle = mm4 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ +// BLOCK8_H* BLOCK8_W port = quant_field_row +// #pragma HLS INTERFACE mode = m_axi bundle = mm5 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ +// BLOCK8_H* BLOCK8_W port = masking_field_row +// #pragma HLS INTERFACE mode = m_axi bundle = mm6 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ +// BLOCK8_H* BLOCK8_W port = aq_map_f +// #pragma HLS INTERFACE mode = m_axi bundle = mm7 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ +// TILE_W* TILE_H* 2 port = cmap_axi +// #pragma HLS INTERFACE mode = m_axi bundle = mm8 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ +// ac_coef_axiout +// #pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ +// BLOCK8_W* BLOCK8_H port = strategy_all +// #pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ +// BLOCK8_H* BLOCK8_W port = raw_quant_field_i 
+#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_ORDER port = \ + hls_order +// #pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ +// hls_dc8x8 +// #pragma HLS INTERFACE mode = m_axi bundle = mm13 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ +// hls_dc16x16 +// #pragma HLS INTERFACE mode = m_axi bundle = mm14 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ +// hls_dc32x32 +// #pragma HLS INTERFACE mode = m_axi bundle = mm15 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_NUM_CONFIG port = \ +// config +// #pragma HLS INTERFACE mode = m_axi bundle = mm16 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_NUM_CONFIG port = \ +// config_fl +#pragma HLS DATAFLOW + hls::stream count_instrm("count_instrm"); + hls::stream pos_instrm("pos_instrm"); + hls::stream pos_outstrm("pos_outstrm"); + + load_nz2strm(num_zeros, count_instrm, pos_instrm); + + hls_sort_top(count_instrm, pos_instrm, pos_outstrm); + + order_writeout(used_orders_strm, pos_outstrm, hls_order); +} + +//-------------------------- dct --------------------------// +// dct8x8 +void hls_DCT1DImpl_8x8(float in[64], float out[64]) { +#pragma HLS INLINE off + float kMultipliers_N8_c1 = 0.5097955791041592; + float kMultipliers_N8_c2 = 0.6013448869350453; + float 
kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + float sqrt2 = 1.4142135623730951f; + +loop_dct8x8: + for (int i = 0; i < 8; i += 1) { +#pragma HLS DEPENDENCE variable = in inter false +#pragma HLS DEPENDENCE variable = out inter false +#pragma HLS LOOP_FLATTEN off +#pragma HLS pipeline II = 2 + float tmp8_0 = in[i * 8 + 0] + in[i * 8 + 7]; + float tmp8_1 = in[i * 8 + 1] + in[i * 8 + 6]; + float tmp8_2 = in[i * 8 + 2] + in[i * 8 + 5]; + float tmp8_3 = in[i * 8 + 3] + in[i * 8 + 4]; + float tmp8_4 = in[i * 8 + 0] - in[i * 8 + 7]; + float tmp8_5 = in[i * 8 + 1] - in[i * 8 + 6]; + float tmp8_6 = in[i * 8 + 2] - in[i * 8 + 5]; + float tmp8_7 = in[i * 8 + 3] - in[i * 8 + 4]; + + float t00 = tmp8_0 + tmp8_3; + float t01 = tmp8_1 + tmp8_2; + float t02 = tmp8_0 - tmp8_3; + float t03 = tmp8_1 - tmp8_2; + + float t16 = tmp8_4 * kMultipliers_N8_c1; + float t17 = tmp8_5 * kMultipliers_N8_c2; + float t18 = tmp8_6 * kMultipliers_N8_c3; + float t19 = tmp8_7 * kMultipliers_N8_c4; + + // tmp 0~3 + float t04 = t00 + t01; + float t05 = t00 - t01; + float t06 = t02 * kMultipliers_N4_c1; + float t07 = t03 * kMultipliers_N4_c2; + + float t09 = t05; + float t10 = t06 + t07; + float t11 = t06 - t07; + + float t13 = t09; + float t14 = t10 * sqrt2 + t11; + float t15 = t11; + // tmp 4~7 + float t00_a = t16 + t19; + float t01_a = t17 + t18; + float t02_a = t16 - t19; + float t03_a = t17 - t18; + + float t04_a = t00_a + t01_a; + float t05_a = t00_a - t01_a; + float t06_a = t02_a * kMultipliers_N4_c1; + float t07_a = t03_a * kMultipliers_N4_c2; + + float t08_a = t04_a; + float t09_a = t05_a; + float t10_a = t06_a + t07_a; + float t11_a = t06_a - t07_a; + + float t12_a = t08_a; + float t13_a = t09_a; + float t14_a = t10_a * sqrt2 + t11_a; + float t15_a = t11_a; + + float tmp8_out1 = t14; + float tmp8_out2 = t05; + float tmp8_out3 = t15; + float tmp8_out4 = 
t12_a * sqrt2 + t14_a; + float tmp8_out5 = t14_a + t13_a; + float tmp8_out6 = t13_a + t15_a; + float tmp8_out7 = t15_a; + + out[i * 8 + 0] = t04; + out[i * 8 + 1] = tmp8_out4; + out[i * 8 + 2] = tmp8_out1; + out[i * 8 + 3] = tmp8_out5; + out[i * 8 + 4] = t05; + out[i * 8 + 5] = tmp8_out6; + out[i * 8 + 6] = tmp8_out3; + out[i * 8 + 7] = tmp8_out7; + } +} + +void hls_TransposeBlock8(float in[64], float out[64]) { +#pragma HLS INLINE off +loop_transposeBlock8: + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS pipeline II = 1 + float mul = 1.0f / 8.0f; + out[n * 8 + m] = mul * in[m * 8 + n]; + } + } +} + +void dct8_block(float in[1024], float out[1024]) { +#pragma HLS INLINE off + float kMultipliers_N8_c1 = 0.5097955791041592; + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + float sqrt2 = 1.4142135623730951f; + +loop_dct_block: + for (ap_uint<8> by = 0; by < 4; by++) { + for (ap_uint<8> bx = 0; bx < 4; bx++) { + for (ap_uint<8> x = 0; x < 8; x++) { +#pragma HLS DEPENDENCE variable = in inter false +#pragma HLS DEPENDENCE variable = out inter false +#pragma HLS LOOP_FLATTEN off +#pragma HLS pipeline + int addr = 8 * x + bx * 64 + by * 256; + + float mem_0 = in[addr + 0]; + float mem_1 = in[addr + 1]; + float mem_2 = in[addr + 2]; + float mem_3 = in[addr + 3]; + float mem_4 = in[addr + 4]; + float mem_5 = in[addr + 5]; + float mem_6 = in[addr + 6]; + float mem_7 = in[addr + 7]; + + float tmp8_0 = mem_0 + mem_7; + float tmp8_1 = mem_1 + mem_6; + float tmp8_2 = mem_2 + mem_5; + float tmp8_3 = mem_3 + mem_4; + float tmp8_4 = mem_0 - mem_7; + float tmp8_5 = mem_1 - mem_6; + float tmp8_6 = mem_2 - mem_5; + float tmp8_7 = mem_3 - mem_4; + + float t00 = tmp8_0 + tmp8_3; + float t01 = tmp8_1 + tmp8_2; + float t02 = tmp8_0 - tmp8_3; + float t03 = tmp8_1 - 
tmp8_2; + + float t16 = tmp8_4 * kMultipliers_N8_c1; + float t17 = tmp8_5 * kMultipliers_N8_c2; + float t18 = tmp8_6 * kMultipliers_N8_c3; + float t19 = tmp8_7 * kMultipliers_N8_c4; + + // tmp 0~3 + float t04 = t00 + t01; + float t05 = t00 - t01; + float t06 = t02 * kMultipliers_N4_c1; + float t07 = t03 * kMultipliers_N4_c2; + + float t08 = t04; + float t09 = t05; + float t10 = t06 + t07; + float t11 = t06 - t07; + + float t12 = t08; + float t13 = t09; + float t14 = t10 * sqrt2 + t11; + float t15 = t11; + // tmp 4~7 + float t00_a = t16 + t19; + float t01_a = t17 + t18; + float t02_a = t16 - t19; + float t03_a = t17 - t18; + + float t04_a = t00_a + t01_a; + float t05_a = t00_a - t01_a; + float t06_a = t02_a * kMultipliers_N4_c1; + float t07_a = t03_a * kMultipliers_N4_c2; + + float t08_a = t04_a; + float t09_a = t05_a; + float t10_a = t06_a + t07_a; + float t11_a = t06_a - t07_a; + + float t12_a = t08_a; + float t13_a = t09_a; + float t14_a = t10_a * sqrt2 + t11_a; + float t15_a = t11_a; + + float tmp8_out0 = t12; + float tmp8_out1 = t14; + float tmp8_out2 = t13; + float tmp8_out3 = t15; + float tmp8_out4 = t12_a * sqrt2 + t14_a; + float tmp8_out5 = t14_a + t13_a; + float tmp8_out6 = t13_a + t15_a; + float tmp8_out7 = t15_a; + + out[addr + 0] = tmp8_out0; + out[addr + 1] = tmp8_out4; + out[addr + 2] = tmp8_out1; + out[addr + 3] = tmp8_out5; + out[addr + 4] = tmp8_out2; + out[addr + 5] = tmp8_out6; + out[addr + 6] = tmp8_out3; + out[addr + 7] = tmp8_out7; + } + } + } +} + +void hls_TransposeBlock_dct8(float in[1024], float out[1024]) { +#pragma HLS INLINE off + for (ap_uint<8> by = 0; by < 4; by++) { + for (ap_uint<8> bx = 0; bx < 4; bx++) { + for (ap_uint<8> y = 0; y < 8; y++) { + for (ap_uint<8> x = 0; x < 8; x++) { +#pragma HLS pipeline II = 1 + ap_uint<10> addr_i, addr_o; + addr_i(9, 8) = by(1, 0); + addr_i(7, 5) = x(2, 0); + addr_i(4, 3) = bx(1, 0); + addr_i(2, 0) = y(2, 0); + addr_o(9, 8) = by(1, 0); + addr_o(7, 5) = y(2, 0); + addr_o(4, 3) = bx(1, 0); + 
addr_o(2, 0) = x(2, 0); + float mul = 1.0f / 8.0f; + out[addr_o] = mul * in[addr_i]; + } + } + } + } +} + +void split_ac_dc_dct8(float in[64], float to_ac[64], float to_dc[1]) { +#pragma HLS INLINE off + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + to_ac[8 * m + n] = in[8 * m + n]; + if (m == 0 && n == 0) { + to_dc[0] = in[0]; + } + } + } +} + +void feed_ac_dct8(uint32_t x8, + uint32_t y8, + hls::stream& stream_recty, + hls::stream& stream_rectx, + float in[64], + hls::stream& ac_coef8x8_stream) { +#pragma HLS INLINE off + uint8_t rect_xsize; + uint8_t rect_ysize; +hls_feed_b64: + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + if (m == 0 && n == 0) { + rect_xsize = stream_rectx.read(); + rect_ysize = stream_recty.read(); + } + if (x8 < rect_xsize && y8 < rect_ysize) { + ac_coef8x8_stream.write(in[m * 8 + n]); + } + } + } +} + +void feed_dc_dct8(uint32_t x8, + uint32_t y8, + hls::stream& stream_recty, + hls::stream& stream_rectx, + float in[1], + hls::stream& dc_coef8x8_stream) { +#pragma HLS INLINE off + uint8_t rect_xsize; + uint8_t rect_ysize; +hls_feed_b64: + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + if (m == 0 && n == 0) { + rect_xsize = stream_rectx.read(); + rect_ysize = stream_recty.read(); + } + if (x8 < rect_xsize && y8 < rect_ysize) { + if (m == 0 && n == 0) { + dc_coef8x8_stream.write(in[0]); + } + } + } + } +} + +void load_b64(float temp0[64], hls::stream& opsin8x8_stream) { +#pragma HLS INLINE off + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + int addr = n * 8 + m; + temp0[addr] = opsin8x8_stream.read(); + } + } +} + +void hls_dct8x8_module(unsigned ysize, + unsigned xsize, + hls::stream& stream_recty8, + hls::stream& stream_rectx8, + hls::stream& stream_recty8_1, + hls::stream& stream_rectx8_1, + hls::stream& opsin8x8_stream, + hls::stream& ac_coef8x8_stream, + 
hls::stream& dc_coef8x8_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + + float temp0[64]; +#pragma HLS bind_storage variable = temp0 type = ram_2p impl = bram + float temp1[64]; +#pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram + float temp2[64]; +#pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram + float temp3[64]; +#pragma HLS bind_storage variable = temp3 type = ram_2p impl = bram + float temp4[64]; +#pragma HLS bind_storage variable = temp4 type = ram_2p impl = bram + float to_ac[64]; +#pragma HLS bind_storage variable = to_ac type = ram_2p impl = bram + float to_dc[1]; +#pragma HLS bind_storage variable = to_dc type = ram_2p impl = bram + +loop_dct8_all: + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + for (uint32_t y8 = 0; y8 < 8; y8++) { + for (uint32_t x8 = 0; x8 < 8; x8++) { + for (int c = 0; c < 3; c++) { +#pragma HLS DATAFLOW + load_b64(temp0, opsin8x8_stream); + hls_DCT1DImpl_8x8(temp0, temp1); + hls_TransposeBlock8(temp1, temp2); + hls_DCT1DImpl_8x8(temp2, temp3); + hls_TransposeBlock8(temp3, temp4); + split_ac_dc_dct8(temp4, to_ac, to_dc); + feed_ac_dct8(x8, y8, stream_recty8, stream_rectx8, to_ac, ac_coef8x8_stream); + feed_dc_dct8(x8, y8, stream_recty8_1, stream_rectx8_1, to_dc, dc_coef8x8_stream); + } + } + } + } + } +} + +void hls_DCT1DImpl_16(float in[256], float out[256]) { +#pragma HLS INLINE off + float kMultipliers_N8_c1 = 0.5097955791041592; + + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + + float kMultipliers_N16_0 = 0.5024192861881557; + float 
kMultipliers_N16_1 = 0.5224986149396889; + float kMultipliers_N16_2 = 0.5669440348163577; + float kMultipliers_N16_3 = 0.6468217833599901; + float kMultipliers_N16_4 = 0.7881546234512502; + float kMultipliers_N16_5 = 1.060677685990347; + float kMultipliers_N16_6 = 1.7224470982383342; + float kMultipliers_N16_7 = 5.101148618689155; + float sqrt2 = 1.4142135623730951f; + + float tmp16_0 = in[0] + in[15]; + float tmp16_1 = in[1] + in[14]; + float tmp16_2 = in[2] + in[13]; + float tmp16_3 = in[3] + in[12]; + float tmp16_4 = in[4] + in[11]; + float tmp16_5 = in[5] + in[10]; + float tmp16_6 = in[6] + in[9]; + float tmp16_7 = in[7] + in[8]; + float tmp16_8 = in[0] - in[15]; + float tmp16_9 = in[1] - in[14]; + float tmp16_10 = in[2] - in[13]; + float tmp16_11 = in[3] - in[12]; + float tmp16_12 = in[4] - in[11]; + float tmp16_13 = in[5] - in[10]; + float tmp16_14 = in[6] - in[9]; + float tmp16_15 = in[7] - in[8]; + + float tmp8_0 = tmp16_0 + tmp16_7; + float tmp8_1 = tmp16_1 + tmp16_6; + float tmp8_2 = tmp16_2 + tmp16_5; + float tmp8_3 = tmp16_3 + tmp16_4; + float tmp8_4 = tmp16_0 - tmp16_7; + float tmp8_5 = tmp16_1 - tmp16_6; + float tmp8_6 = tmp16_2 - tmp16_5; + float tmp8_7 = tmp16_3 - tmp16_4; + + float t00 = tmp8_0 + tmp8_3; + float t01 = tmp8_1 + tmp8_2; + float t02 = tmp8_0 - tmp8_3; + float t03 = tmp8_1 - tmp8_2; + float t04 = tmp8_4 * kMultipliers_N8_c1; + float t05 = tmp8_5 * kMultipliers_N8_c2; + float t06 = tmp8_6 * kMultipliers_N8_c3; + float t07 = tmp8_7 * kMultipliers_N8_c4; + + float t08 = t02 * kMultipliers_N4_c1; + float t09 = t03 * kMultipliers_N4_c2; + float t10 = t04 + t07; + float t11 = t05 + t06; + float t12 = t04 - t07; + float t13 = t05 - t06; + + float t14 = t08 + t09; + float t15 = t10 + t11; + float t16 = t08 - t09; + float t17 = t10 - t11; + + float t18 = t12 * kMultipliers_N4_c1; + float t19 = t13 * kMultipliers_N4_c2; + float t20 = t14 * sqrt2; + float t21 = t15 * sqrt2; + + float t22 = t18 + t19; + float t23 = t18 - t19; + + float t24 = t22 * 
sqrt2; + + float t25 = t24 + t23; + + float t26 = kMultipliers_N16_0 * tmp16_8; + float t27 = kMultipliers_N16_1 * tmp16_9; + float t28 = kMultipliers_N16_2 * tmp16_10; + float t29 = kMultipliers_N16_3 * tmp16_11; + float t30 = kMultipliers_N16_4 * tmp16_12; + float t31 = kMultipliers_N16_5 * tmp16_13; + float t32 = kMultipliers_N16_6 * tmp16_14; + float t33 = kMultipliers_N16_7 * tmp16_15; + + float dmp8_0 = t26 + t33; + float dmp8_1 = t27 + t32; + float dmp8_2 = t28 + t31; + float dmp8_3 = t29 + t30; + float dmp8_4 = t26 - t33; + float dmp8_5 = t27 - t32; + float dmp8_6 = t28 - t31; + float dmp8_7 = t29 - t30; + + float d00 = dmp8_0 + dmp8_3; + float d01 = dmp8_1 + dmp8_2; + float d02 = dmp8_0 - dmp8_3; + float d03 = dmp8_1 - dmp8_2; + float d04 = dmp8_4 * kMultipliers_N8_c1; + float d05 = dmp8_5 * kMultipliers_N8_c2; + float d06 = dmp8_6 * kMultipliers_N8_c3; + float d07 = dmp8_7 * kMultipliers_N8_c4; + + float d08 = d02 * kMultipliers_N4_c1; + float d09 = d03 * kMultipliers_N4_c2; + float d10 = d04 + d07; + float d11 = d05 + d06; + float d12 = d04 - d07; + float d13 = d05 - d06; + + float d14 = d08 + d09; + float d15 = d10 + d11; + float d16 = d08 - d09; + float d17 = d10 - d11; + + float d18 = d12 * kMultipliers_N4_c1; + float d19 = d13 * kMultipliers_N4_c2; + float d20 = d14 * sqrt2; + float d21 = d15 * sqrt2; + + float d22 = d18 + d19; + float d23 = d18 - d19; + + float d24 = d22 * sqrt2; + + float d25 = d24 + d23; + + float d26 = d00 + d01; + float d27 = d21 + d25; + float d28 = d20 + d16; + float d29 = d25 + d17; + float d30 = d00 - d01; + float d31 = d17 + d23; + float d32 = d26 * sqrt2; + + out[0] = t00 + t01; + out[1] = d32 + d27; + out[2] = t21 + t25; + out[3] = d27 + d28; + out[4] = t20 + t16; + out[5] = d28 + d29; + out[6] = t25 + t17; + out[7] = d29 + d30; + out[8] = t00 - t01; + out[9] = d30 + d31; + out[10] = t17 + t23; + out[11] = d31 + d16; + out[12] = t16; + out[13] = d16 + d23; + out[14] = t23; + out[15] = d23; +} + +void hls_dct16_block(float 
in[256], float out[256]) { +#pragma HLS INLINE off + float kMultipliers_N8_c1 = 0.5097955791041592; + + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + + float kMultipliers_N16_0 = 0.5024192861881557; + float kMultipliers_N16_1 = 0.5224986149396889; + float kMultipliers_N16_2 = 0.5669440348163577; + float kMultipliers_N16_3 = 0.6468217833599901; + float kMultipliers_N16_4 = 0.7881546234512502; + float kMultipliers_N16_5 = 1.060677685990347; + float kMultipliers_N16_6 = 1.7224470982383342; + float kMultipliers_N16_7 = 5.101148618689155; + float sqrt2 = 1.4142135623730951f; + + for (int i = 0; i < 16; i++) { +#pragma HLS DEPENDENCE variable = in inter false +#pragma HLS DEPENDENCE variable = out inter false +#pragma HLS LOOP_FLATTEN off +#pragma HLS pipeline II = 11 + float tmp16_0 = in[16 * i + 0] + in[16 * i + 15]; + float tmp16_1 = in[16 * i + 1] + in[16 * i + 14]; + float tmp16_2 = in[16 * i + 2] + in[16 * i + 13]; + float tmp16_3 = in[16 * i + 3] + in[16 * i + 12]; + float tmp16_4 = in[16 * i + 4] + in[16 * i + 11]; + float tmp16_5 = in[16 * i + 5] + in[16 * i + 10]; + float tmp16_6 = in[16 * i + 6] + in[16 * i + 9]; + float tmp16_7 = in[16 * i + 7] + in[16 * i + 8]; + float tmp16_8 = in[16 * i + 0] - in[16 * i + 15]; + float tmp16_9 = in[16 * i + 1] - in[16 * i + 14]; + float tmp16_10 = in[16 * i + 2] - in[16 * i + 13]; + float tmp16_11 = in[16 * i + 3] - in[16 * i + 12]; + float tmp16_12 = in[16 * i + 4] - in[16 * i + 11]; + float tmp16_13 = in[16 * i + 5] - in[16 * i + 10]; + float tmp16_14 = in[16 * i + 6] - in[16 * i + 9]; + float tmp16_15 = in[16 * i + 7] - in[16 * i + 8]; + + float tmp8_0 = tmp16_0 + tmp16_7; + float tmp8_1 = tmp16_1 + tmp16_6; + float tmp8_2 = tmp16_2 + tmp16_5; + float tmp8_3 = tmp16_3 + tmp16_4; + float tmp8_4 = tmp16_0 - tmp16_7; + float 
tmp8_5 = tmp16_1 - tmp16_6; + float tmp8_6 = tmp16_2 - tmp16_5; + float tmp8_7 = tmp16_3 - tmp16_4; + + float t00 = tmp8_0 + tmp8_3; + float t01 = tmp8_1 + tmp8_2; + float t02 = tmp8_0 - tmp8_3; + float t03 = tmp8_1 - tmp8_2; + float t04 = tmp8_4 * kMultipliers_N8_c1; + float t05 = tmp8_5 * kMultipliers_N8_c2; + float t06 = tmp8_6 * kMultipliers_N8_c3; + float t07 = tmp8_7 * kMultipliers_N8_c4; + + float t08 = t02 * kMultipliers_N4_c1; + float t09 = t03 * kMultipliers_N4_c2; + float t10 = t04 + t07; + float t11 = t05 + t06; + float t12 = t04 - t07; + float t13 = t05 - t06; + + float t14 = t08 + t09; + float t15 = t10 + t11; + float t16 = t08 - t09; + float t17 = t10 - t11; + + float t18 = t12 * kMultipliers_N4_c1; + float t19 = t13 * kMultipliers_N4_c2; + float t20 = t14 * sqrt2; + float t21 = t15 * sqrt2; + + float t22 = t18 + t19; + float t23 = t18 - t19; + + float t24 = t22 * sqrt2; + + float t25 = t24 + t23; + + float t26 = kMultipliers_N16_0 * tmp16_8; + float t27 = kMultipliers_N16_1 * tmp16_9; + float t28 = kMultipliers_N16_2 * tmp16_10; + float t29 = kMultipliers_N16_3 * tmp16_11; + float t30 = kMultipliers_N16_4 * tmp16_12; + float t31 = kMultipliers_N16_5 * tmp16_13; + float t32 = kMultipliers_N16_6 * tmp16_14; + float t33 = kMultipliers_N16_7 * tmp16_15; + + float dmp8_0 = t26 + t33; + float dmp8_1 = t27 + t32; + float dmp8_2 = t28 + t31; + float dmp8_3 = t29 + t30; + float dmp8_4 = t26 - t33; + float dmp8_5 = t27 - t32; + float dmp8_6 = t28 - t31; + float dmp8_7 = t29 - t30; + + float d00 = dmp8_0 + dmp8_3; + float d01 = dmp8_1 + dmp8_2; + float d02 = dmp8_0 - dmp8_3; + float d03 = dmp8_1 - dmp8_2; + float d04 = dmp8_4 * kMultipliers_N8_c1; + float d05 = dmp8_5 * kMultipliers_N8_c2; + float d06 = dmp8_6 * kMultipliers_N8_c3; + float d07 = dmp8_7 * kMultipliers_N8_c4; + + float d08 = d02 * kMultipliers_N4_c1; + float d09 = d03 * kMultipliers_N4_c2; + float d10 = d04 + d07; + float d11 = d05 + d06; + float d12 = d04 - d07; + float d13 = d05 - d06; + + 
float d14 = d08 + d09; + float d15 = d10 + d11; + float d16 = d08 - d09; + float d17 = d10 - d11; + + float d18 = d12 * kMultipliers_N4_c1; + float d19 = d13 * kMultipliers_N4_c2; + float d20 = d14 * sqrt2; + float d21 = d15 * sqrt2; + + float d22 = d18 + d19; + float d23 = d18 - d19; + + float d24 = d22 * sqrt2; + + float d25 = d24 + d23; + + float d26 = d00 + d01; + float d27 = d21 + d25; + float d28 = d20 + d16; + float d29 = d25 + d17; + float d30 = d00 - d01; + float d31 = d17 + d23; + float d32 = d26 * sqrt2; + + out[16 * i + 0] = t00 + t01; + out[16 * i + 1] = d32 + d27; + out[16 * i + 2] = t21 + t25; + out[16 * i + 3] = d27 + d28; + out[16 * i + 4] = t20 + t16; + out[16 * i + 5] = d28 + d29; + out[16 * i + 6] = t25 + t17; + out[16 * i + 7] = d29 + d30; + out[16 * i + 8] = t00 - t01; + out[16 * i + 9] = d30 + d31; + out[16 * i + 10] = t17 + t23; + out[16 * i + 11] = d31 + d16; + out[16 * i + 12] = t16; + out[16 * i + 13] = d16 + d23; + out[16 * i + 14] = t23; + out[16 * i + 15] = d23; + } +} + +void hls_ReinterpretingIDCT16(float input[4], float output[4]) { +#pragma HLS INLINE off + float resample = 0.901764214038848876953125; + + float t0 = input[0]; + float t1 = input[1] * resample; + float t2 = input[2] * resample; + float t3 = input[3] * resample * resample; + + float t4 = t0 + t2; + float t5 = t1 + t3; + float t6 = t0 - t2; + float t7 = t1 - t3; + + float t8 = t4; + float t9 = t6; + float t10 = t5; + float t11 = t7; + + output[0] = t8 + t10; + output[2] = t8 - t10; + output[1] = t9 + t11; + output[3] = t9 - t11; +} + +// dct 16x16 +void load_dct16(float in[256], hls::stream& opsin16x16_stream) { +#pragma HLS INLINE off + for (int y8 = 0; y8 < 2; y8++) { + for (int x8 = 0; x8 < 2; x8++) { + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + int addr = y8 * 16 * 8 + x8 * 8 + m * 16 + n; + in[addr] = opsin16x16_stream.read(); + } + } + } + } +} + +void transposeDct16(float in[256], float out[256]) { +#pragma HLS 
INLINE off + for (int i = 0; i < 16; i++) { + for (int j = 0; j < 16; j++) { +#pragma HLS PIPELINE II = 1 + out[j * 16 + i] = in[i * 16 + j]; + } + } +} + +void transposeDct16_scale(float in[256], float out[256]) { +#pragma HLS INLINE off + for (int i = 0; i < 16; i++) { + for (int j = 0; j < 16; j++) { +#pragma HLS PIPELINE II = 1 + float mul = 1.0f / 16.0f; + out[j * 16 + i] = mul * in[i * 16 + j]; + } + } +} + +void dct16_ac_writeout(float to_ac[256], + hls::stream& stream_recty, + hls::stream& stream_rectx, + hls::stream& ac_coef16x16_stream, + uint32_t x16, + uint32_t y16) { +#pragma HLS INLINE off + uint8_t rect_xsize; + uint8_t rect_ysize; + for (int m = 0; m < 256; m++) { +#pragma HLS PIPELINE II = 1 + if (m == 0) { + rect_xsize = stream_rectx.read(); + rect_ysize = stream_recty.read(); + } + if ((2 * x16 + 1) < rect_xsize && (2 * y16 + 1) < rect_ysize) { + ac_coef16x16_stream.write(to_ac[m]); + } + } +} + +void dct16_dc_writeout(float to_dc[4], + hls::stream& stream_recty, + hls::stream& stream_rectx, + hls::stream& dc_coef16x16_stream, + uint32_t x16, + uint32_t y16) { +#pragma HLS INLINE off + uint8_t rect_xsize; + uint8_t rect_ysize; + for (int m = 0; m < 4; m++) { +#pragma HLS PIPELINE II = 1 + if (m == 0) { + rect_xsize = stream_rectx.read(); + rect_ysize = stream_recty.read(); + } + if ((2 * x16 + 1) < rect_xsize && (2 * y16 + 1) < rect_ysize) { + dc_coef16x16_stream.write(to_dc[m]); + } + } +} + +void dct16_ac_dc_split(float in[256], float ac_out1[256], float dc_out[4]) { +#pragma HLS INLINE off + for (int i = 0; i < 256; i++) { +#pragma HLS PIPELINE II = 1 + ac_out1[i] = in[i]; + if (i == 0) + dc_out[0] = in[i]; + else if (i == 1) + dc_out[1] = in[i]; + else if (i == 16) + dc_out[2] = in[i]; + else if (i == 17) + dc_out[3] = in[i]; + } +} + +void dct16_test_load(float from[256], hls::stream& opsin16x16_stream) { +#pragma HLS INLINE off + for (int m = 0; m < 16; m++) { + for (int n = 0; n < 16; n++) { +#pragma HLS PIPELINE II = 1 + int addr = 16 * m 
+ n; + from[addr] = opsin16x16_stream.read(); + } + } +} + +void hls_dct16x16_module(unsigned ysize, + unsigned xsize, + hls::stream& stream_recty16, + hls::stream& stream_rectx16, + hls::stream& stream_recty16_1, + hls::stream& stream_rectx16_1, + hls::stream& opsin16x16_stream, + hls::stream& ac_coef16x16_stream, + hls::stream& dc_coef16x16_stream) { +#pragma HLS INLINE off + + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + uint32_t ysize16 = tile_ysize / 16; + uint32_t xsize16 = tile_xsize / 16; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + + float from[256]; +#pragma HLS bind_storage variable = from type = ram_2p impl = bram + float temp0[256]; +#pragma HLS bind_storage variable = temp0 type = ram_2p impl = bram + float temp1[256]; +#pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram + float temp2[256]; +#pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram + float temp3[256]; +#pragma HLS bind_storage variable = temp3 type = ram_2p impl = bram + float temp4[256]; +#pragma HLS bind_storage variable = temp4 type = ram_2p impl = bram + float to_ac[256]; +#pragma HLS bind_storage variable = to_ac type = ram_2p impl = bram + float to_dc[4]; +#pragma HLS bind_storage variable = to_dc type = ram_2p impl = bram + float dc_mem[4]; +#pragma HLS bind_storage variable = dc_mem type = ram_2p impl = bram + + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + for (uint32_t y16 = 0; y16 < 4; y16++) { + for (uint32_t x16 = 0; x16 < 4; x16++) { + for (int c = 0; c < 3; c++) { +// #pragma HLS PIPELINE rewind +#pragma HLS DATAFLOW + dct16_test_load(from, opsin16x16_stream); + transposeDct16(from, temp0); + hls_dct16_block(temp0, temp1); + transposeDct16_scale(temp1, temp2); + hls_dct16_block(temp2, temp3); + transposeDct16_scale(temp3, temp4); + dct16_ac_dc_split(temp4, 
to_ac, to_dc); + // output ac_coeff_stream + dct16_ac_writeout(to_ac, stream_recty16, stream_rectx16, ac_coef16x16_stream, x16, y16); + // output dc_coeff_stream + hls_ReinterpretingIDCT16(to_dc, dc_mem); + dct16_dc_writeout(dc_mem, stream_recty16_1, stream_rectx16_1, dc_coef16x16_stream, x16, y16); + } + } + } + } + } +} + +// template +void hls_DCT1DImpl_32(float in[1024], float out[1024]) { +#pragma HLS INLINE off + + float kMultipliers_N32_0 = 0.5006029982351963; + float kMultipliers_N32_1 = 0.5054709598975436; + float kMultipliers_N32_2 = 0.5154473099226246; + float kMultipliers_N32_3 = 0.531042591089784; + float kMultipliers_N32_4 = 0.553103896034444; + float kMultipliers_N32_5 = 0.5829349682061339; + float kMultipliers_N32_6 = 0.622504123035664; + float kMultipliers_N32_7 = 0.674808341455005; + float kMultipliers_N32_8 = 0.7445362710022986; + float kMultipliers_N32_9 = 0.839349645415526; + float kMultipliers_N32_10 = 0.9725682378619608; + float kMultipliers_N32_11 = 1.169439933432884; + float kMultipliers_N32_12 = 1.4841646163141662; + float kMultipliers_N32_13 = 2.057781009953411; + float kMultipliers_N32_14 = 3.407608418468719; + float kMultipliers_N32_15 = 10.19000812354803; + + float sqrt2 = 1.4142135623730951f; + + float kMultipliers_N8_c1 = 0.5097955791041592; + + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + + float kMultipliers_N16_0 = 0.5024192861881557; + float kMultipliers_N16_1 = 0.5224986149396889; + float kMultipliers_N16_2 = 0.5669440348163577; + float kMultipliers_N16_3 = 0.6468217833599901; + float kMultipliers_N16_4 = 0.7881546234512502; + float kMultipliers_N16_5 = 1.060677685990347; + float kMultipliers_N16_6 = 1.7224470982383342; + float kMultipliers_N16_7 = 5.101148618689155; + + for (int i = 0; i < 32; i++) { +#pragma HLS PIPELINE II = 30 + 
float tmp32_b16_0 = in[0 + 32 * i] + in[31 + 32 * i]; + float tmp32_b16_1 = in[1 + 32 * i] + in[30 + 32 * i]; + float tmp32_b16_2 = in[2 + 32 * i] + in[29 + 32 * i]; + float tmp32_b16_3 = in[3 + 32 * i] + in[28 + 32 * i]; + float tmp32_b16_4 = in[4 + 32 * i] + in[27 + 32 * i]; + float tmp32_b16_5 = in[5 + 32 * i] + in[26 + 32 * i]; + float tmp32_b16_6 = in[6 + 32 * i] + in[25 + 32 * i]; + float tmp32_b16_7 = in[7 + 32 * i] + in[24 + 32 * i]; + float tmp32_b16_8 = in[8 + 32 * i] + in[23 + 32 * i]; + float tmp32_b16_9 = in[9 + 32 * i] + in[22 + 32 * i]; + float tmp32_b16_10 = in[10 + 32 * i] + in[21 + 32 * i]; + float tmp32_b16_11 = in[11 + 32 * i] + in[20 + 32 * i]; + float tmp32_b16_12 = in[12 + 32 * i] + in[19 + 32 * i]; + float tmp32_b16_13 = in[13 + 32 * i] + in[18 + 32 * i]; + float tmp32_b16_14 = in[14 + 32 * i] + in[17 + 32 * i]; + float tmp32_b16_15 = in[15 + 32 * i] + in[16 + 32 * i]; + + float tmp16_0_b16 = tmp32_b16_0 + tmp32_b16_15; + float tmp16_1_b16 = tmp32_b16_1 + tmp32_b16_14; + float tmp16_2_b16 = tmp32_b16_2 + tmp32_b16_13; + float tmp16_3_b16 = tmp32_b16_3 + tmp32_b16_12; + float tmp16_4_b16 = tmp32_b16_4 + tmp32_b16_11; + float tmp16_5_b16 = tmp32_b16_5 + tmp32_b16_10; + float tmp16_6_b16 = tmp32_b16_6 + tmp32_b16_9; + float tmp16_7_b16 = tmp32_b16_7 + tmp32_b16_8; + float tmp16_8_b16 = tmp32_b16_0 - tmp32_b16_15; + float tmp16_9_b16 = tmp32_b16_1 - tmp32_b16_14; + float tmp16_10_b16 = tmp32_b16_2 - tmp32_b16_13; + float tmp16_11_b16 = tmp32_b16_3 - tmp32_b16_12; + float tmp16_12_b16 = tmp32_b16_4 - tmp32_b16_11; + float tmp16_13_b16 = tmp32_b16_5 - tmp32_b16_10; + float tmp16_14_b16 = tmp32_b16_6 - tmp32_b16_9; + float tmp16_15_b16 = tmp32_b16_7 - tmp32_b16_8; + + float tmp8_0_b16 = tmp16_0_b16 + tmp16_7_b16; + float tmp8_1_b16 = tmp16_1_b16 + tmp16_6_b16; + float tmp8_2_b16 = tmp16_2_b16 + tmp16_5_b16; + float tmp8_3_b16 = tmp16_3_b16 + tmp16_4_b16; + float tmp8_4_b16 = tmp16_0_b16 - tmp16_7_b16; + float tmp8_5_b16 = tmp16_1_b16 - tmp16_6_b16; 
+ float tmp8_6_b16 = tmp16_2_b16 - tmp16_5_b16; + float tmp8_7_b16 = tmp16_3_b16 - tmp16_4_b16; + + float t00_b16 = tmp8_0_b16 + tmp8_3_b16; + float t01_b16 = tmp8_1_b16 + tmp8_2_b16; + float t02_b16 = tmp8_0_b16 - tmp8_3_b16; + float t03_b16 = tmp8_1_b16 - tmp8_2_b16; + float t04_b16 = tmp8_4_b16 * kMultipliers_N8_c1; + float t05_b16 = tmp8_5_b16 * kMultipliers_N8_c2; + float t06_b16 = tmp8_6_b16 * kMultipliers_N8_c3; + float t07_b16 = tmp8_7_b16 * kMultipliers_N8_c4; + + float t08_b16 = t02_b16 * kMultipliers_N4_c1; + float t09_b16 = t03_b16 * kMultipliers_N4_c2; + float t10_b16 = t04_b16 + t07_b16; + float t11_b16 = t05_b16 + t06_b16; + float t12_b16 = t04_b16 - t07_b16; + float t13_b16 = t05_b16 - t06_b16; + + float t14_b16 = t08_b16 + t09_b16; + float t15_b16 = t10_b16 + t11_b16; + float t16_b16 = t08_b16 - t09_b16; + float t17_b16 = t10_b16 - t11_b16; + + float t18_b16 = t12_b16 * kMultipliers_N4_c1; + float t19_b16 = t13_b16 * kMultipliers_N4_c2; + float t20_b16 = t14_b16 * sqrt2; + float t21_b16 = t15_b16 * sqrt2; + + float t22_b16 = t18_b16 + t19_b16; + float t23_b16 = t18_b16 - t19_b16; + + float t24_b16 = t22_b16 * sqrt2; + + float t25_b16 = t24_b16 + t23_b16; + + float t26_b16 = kMultipliers_N16_0 * tmp16_8_b16; + float t27_b16 = kMultipliers_N16_1 * tmp16_9_b16; + float t28_b16 = kMultipliers_N16_2 * tmp16_10_b16; + float t29_b16 = kMultipliers_N16_3 * tmp16_11_b16; + float t30_b16 = kMultipliers_N16_4 * tmp16_12_b16; + float t31_b16 = kMultipliers_N16_5 * tmp16_13_b16; + float t32_b16 = kMultipliers_N16_6 * tmp16_14_b16; + float t33_b16 = kMultipliers_N16_7 * tmp16_15_b16; + + float dmp8_0_b16 = t26_b16 + t33_b16; + float dmp8_1_b16 = t27_b16 + t32_b16; + float dmp8_2_b16 = t28_b16 + t31_b16; + float dmp8_3_b16 = t29_b16 + t30_b16; + float dmp8_4_b16 = t26_b16 - t33_b16; + float dmp8_5_b16 = t27_b16 - t32_b16; + float dmp8_6_b16 = t28_b16 - t31_b16; + float dmp8_7_b16 = t29_b16 - t30_b16; + + float d00_b16 = dmp8_0_b16 + dmp8_3_b16; + float d01_b16 = 
dmp8_1_b16 + dmp8_2_b16; + float d02_b16 = dmp8_0_b16 - dmp8_3_b16; + float d03_b16 = dmp8_1_b16 - dmp8_2_b16; + float d04_b16 = dmp8_4_b16 * kMultipliers_N8_c1; + float d05_b16 = dmp8_5_b16 * kMultipliers_N8_c2; + float d06_b16 = dmp8_6_b16 * kMultipliers_N8_c3; + float d07_b16 = dmp8_7_b16 * kMultipliers_N8_c4; + + float d08_b16 = d02_b16 * kMultipliers_N4_c1; + float d09_b16 = d03_b16 * kMultipliers_N4_c2; + float d10_b16 = d04_b16 + d07_b16; + float d11_b16 = d05_b16 + d06_b16; + float d12_b16 = d04_b16 - d07_b16; + float d13_b16 = d05_b16 - d06_b16; + + float d14_b16 = d08_b16 + d09_b16; + float d15_b16 = d10_b16 + d11_b16; + float d16_b16 = d08_b16 - d09_b16; + float d17_b16 = d10_b16 - d11_b16; + + float d18_b16 = d12_b16 * kMultipliers_N4_c1; + float d19_b16 = d13_b16 * kMultipliers_N4_c2; + float d20_b16 = d14_b16 * sqrt2; + float d21_b16 = d15_b16 * sqrt2; + + float d22_b16 = d18_b16 + d19_b16; + float d23_b16 = d18_b16 - d19_b16; + + float d24_b16 = d22_b16 * sqrt2; + + float d25_b16 = d24_b16 + d23_b16; + + float d26_b16 = d00_b16 + d01_b16; + float d27_b16 = d21_b16 + d25_b16; + float d28_b16 = d20_b16 + d16_b16; + float d29_b16 = d25_b16 + d17_b16; + float d30_b16 = d00_b16 - d01_b16; + float d31_b16 = d17_b16 + d23_b16; + float d32_b16 = d26_b16 * sqrt2; + + float tmp32_b16_out1_0 = t00_b16 + t01_b16; + float tmp32_b16_out1_1 = d32_b16 + d27_b16; + float tmp32_b16_out1_2 = t21_b16 + t25_b16; + float tmp32_b16_out1_3 = d27_b16 + d28_b16; + float tmp32_b16_out1_4 = t20_b16 + t16_b16; + float tmp32_b16_out1_5 = d28_b16 + d29_b16; + float tmp32_b16_out1_6 = t25_b16 + t17_b16; + float tmp32_b16_out1_7 = d29_b16 + d30_b16; + float tmp32_b16_out1_8 = t00_b16 - t01_b16; + float tmp32_b16_out1_9 = d30_b16 + d31_b16; + float tmp32_b16_out1_10 = t17_b16 + t23_b16; + float tmp32_b16_out1_11 = d31_b16 + d16_b16; + float tmp32_b16_out1_12 = t16_b16; + float tmp32_b16_out1_13 = d16_b16 + d23_b16; + float tmp32_b16_out1_14 = t23_b16; + float tmp32_b16_out1_15 = 
d23_b16; + + float tmp32_b32_add_sub_16 = in[0 + 32 * i] - in[31 + 32 * i]; + float tmp32_b32_add_sub_17 = in[1 + 32 * i] - in[30 + 32 * i]; + float tmp32_b32_add_sub_18 = in[2 + 32 * i] - in[29 + 32 * i]; + float tmp32_b32_add_sub_19 = in[3 + 32 * i] - in[28 + 32 * i]; + float tmp32_b32_add_sub_20 = in[4 + 32 * i] - in[27 + 32 * i]; + float tmp32_b32_add_sub_21 = in[5 + 32 * i] - in[26 + 32 * i]; + float tmp32_b32_add_sub_22 = in[6 + 32 * i] - in[25 + 32 * i]; + float tmp32_b32_add_sub_23 = in[7 + 32 * i] - in[24 + 32 * i]; + float tmp32_b32_add_sub_24 = in[8 + 32 * i] - in[23 + 32 * i]; + float tmp32_b32_add_sub_25 = in[9 + 32 * i] - in[22 + 32 * i]; + float tmp32_b32_add_sub_26 = in[10 + 32 * i] - in[21 + 32 * i]; + float tmp32_b32_add_sub_27 = in[11 + 32 * i] - in[20 + 32 * i]; + float tmp32_b32_add_sub_28 = in[12 + 32 * i] - in[19 + 32 * i]; + float tmp32_b32_add_sub_29 = in[13 + 32 * i] - in[18 + 32 * i]; + float tmp32_b32_add_sub_30 = in[14 + 32 * i] - in[17 + 32 * i]; + float tmp32_b32_add_sub_31 = in[15 + 32 * i] - in[16 + 32 * i]; + + float tmp32_b32_mul_16 = tmp32_b32_add_sub_16 * kMultipliers_N32_0; + float tmp32_b32_mul_17 = tmp32_b32_add_sub_17 * kMultipliers_N32_1; + float tmp32_b32_mul_18 = tmp32_b32_add_sub_18 * kMultipliers_N32_2; + float tmp32_b32_mul_19 = tmp32_b32_add_sub_19 * kMultipliers_N32_3; + float tmp32_b32_mul_20 = tmp32_b32_add_sub_20 * kMultipliers_N32_4; + float tmp32_b32_mul_21 = tmp32_b32_add_sub_21 * kMultipliers_N32_5; + float tmp32_b32_mul_22 = tmp32_b32_add_sub_22 * kMultipliers_N32_6; + float tmp32_b32_mul_23 = tmp32_b32_add_sub_23 * kMultipliers_N32_7; + float tmp32_b32_mul_24 = tmp32_b32_add_sub_24 * kMultipliers_N32_8; + float tmp32_b32_mul_25 = tmp32_b32_add_sub_25 * kMultipliers_N32_9; + float tmp32_b32_mul_26 = tmp32_b32_add_sub_26 * kMultipliers_N32_10; + float tmp32_b32_mul_27 = tmp32_b32_add_sub_27 * kMultipliers_N32_11; + float tmp32_b32_mul_28 = tmp32_b32_add_sub_28 * kMultipliers_N32_12; + float tmp32_b32_mul_29 = 
tmp32_b32_add_sub_29 * kMultipliers_N32_13; + float tmp32_b32_mul_30 = tmp32_b32_add_sub_30 * kMultipliers_N32_14; + float tmp32_b32_mul_31 = tmp32_b32_add_sub_31 * kMultipliers_N32_15; + + float tmp16_0_b32 = tmp32_b32_mul_16 + tmp32_b32_mul_31; + float tmp16_1_b32 = tmp32_b32_mul_17 + tmp32_b32_mul_30; + float tmp16_2_b32 = tmp32_b32_mul_18 + tmp32_b32_mul_29; + float tmp16_3_b32 = tmp32_b32_mul_19 + tmp32_b32_mul_28; + float tmp16_4_b32 = tmp32_b32_mul_20 + tmp32_b32_mul_27; + float tmp16_5_b32 = tmp32_b32_mul_21 + tmp32_b32_mul_26; + float tmp16_6_b32 = tmp32_b32_mul_22 + tmp32_b32_mul_25; + float tmp16_7_b32 = tmp32_b32_mul_23 + tmp32_b32_mul_24; + float tmp16_8_b32 = tmp32_b32_mul_16 - tmp32_b32_mul_31; + float tmp16_9_b32 = tmp32_b32_mul_17 - tmp32_b32_mul_30; + float tmp16_10_b32 = tmp32_b32_mul_18 - tmp32_b32_mul_29; + float tmp16_11_b32 = tmp32_b32_mul_19 - tmp32_b32_mul_28; + float tmp16_12_b32 = tmp32_b32_mul_20 - tmp32_b32_mul_27; + float tmp16_13_b32 = tmp32_b32_mul_21 - tmp32_b32_mul_26; + float tmp16_14_b32 = tmp32_b32_mul_22 - tmp32_b32_mul_25; + float tmp16_15_b32 = tmp32_b32_mul_23 - tmp32_b32_mul_24; + + float tmp8_0_b32 = tmp16_0_b32 + tmp16_7_b32; + float tmp8_1_b32 = tmp16_1_b32 + tmp16_6_b32; + float tmp8_2_b32 = tmp16_2_b32 + tmp16_5_b32; + float tmp8_3_b32 = tmp16_3_b32 + tmp16_4_b32; + float tmp8_4_b32 = tmp16_0_b32 - tmp16_7_b32; + float tmp8_5_b32 = tmp16_1_b32 - tmp16_6_b32; + float tmp8_6_b32 = tmp16_2_b32 - tmp16_5_b32; + float tmp8_7_b32 = tmp16_3_b32 - tmp16_4_b32; + + float t00_b32 = tmp8_0_b32 + tmp8_3_b32; + float t01_b32 = tmp8_1_b32 + tmp8_2_b32; + float t02_b32 = tmp8_0_b32 - tmp8_3_b32; + float t03_b32 = tmp8_1_b32 - tmp8_2_b32; + float t04_b32 = tmp8_4_b32 * kMultipliers_N8_c1; + float t05_b32 = tmp8_5_b32 * kMultipliers_N8_c2; + float t06_b32 = tmp8_6_b32 * kMultipliers_N8_c3; + float t07_b32 = tmp8_7_b32 * kMultipliers_N8_c4; + + float t08_b32 = t02_b32 * kMultipliers_N4_c1; + float t09_b32 = t03_b32 * kMultipliers_N4_c2; 
+ float t10_b32 = t04_b32 + t07_b32; + float t11_b32 = t05_b32 + t06_b32; + float t12_b32 = t04_b32 - t07_b32; + float t13_b32 = t05_b32 - t06_b32; + + float t14_b32 = t08_b32 + t09_b32; + float t15_b32 = t10_b32 + t11_b32; + float t16_b32 = t08_b32 - t09_b32; + float t17_b32 = t10_b32 - t11_b32; + + float t18_b32 = t12_b32 * kMultipliers_N4_c1; + float t19_b32 = t13_b32 * kMultipliers_N4_c2; + float t20_b32 = t14_b32 * sqrt2; + float t21_b32 = t15_b32 * sqrt2; + + float t22_b32 = t18_b32 + t19_b32; + float t23_b32 = t18_b32 - t19_b32; + + float t24_b32 = t22_b32 * sqrt2; + + float t25_b32 = t24_b32 + t23_b32; + + float t26_b32 = kMultipliers_N16_0 * tmp16_8_b32; + float t27_b32 = kMultipliers_N16_1 * tmp16_9_b32; + float t28_b32 = kMultipliers_N16_2 * tmp16_10_b32; + float t29_b32 = kMultipliers_N16_3 * tmp16_11_b32; + float t30_b32 = kMultipliers_N16_4 * tmp16_12_b32; + float t31_b32 = kMultipliers_N16_5 * tmp16_13_b32; + float t32_b32 = kMultipliers_N16_6 * tmp16_14_b32; + float t33_b32 = kMultipliers_N16_7 * tmp16_15_b32; + + float dmp8_0_b32 = t26_b32 + t33_b32; + float dmp8_1_b32 = t27_b32 + t32_b32; + float dmp8_2_b32 = t28_b32 + t31_b32; + float dmp8_3_b32 = t29_b32 + t30_b32; + float dmp8_4_b32 = t26_b32 - t33_b32; + float dmp8_5_b32 = t27_b32 - t32_b32; + float dmp8_6_b32 = t28_b32 - t31_b32; + float dmp8_7_b32 = t29_b32 - t30_b32; + + float d00_b32 = dmp8_0_b32 + dmp8_3_b32; + float d01_b32 = dmp8_1_b32 + dmp8_2_b32; + float d02_b32 = dmp8_0_b32 - dmp8_3_b32; + float d03_b32 = dmp8_1_b32 - dmp8_2_b32; + float d04_b32 = dmp8_4_b32 * kMultipliers_N8_c1; + float d05_b32 = dmp8_5_b32 * kMultipliers_N8_c2; + float d06_b32 = dmp8_6_b32 * kMultipliers_N8_c3; + float d07_b32 = dmp8_7_b32 * kMultipliers_N8_c4; + + float d08_b32 = d02_b32 * kMultipliers_N4_c1; + float d09_b32 = d03_b32 * kMultipliers_N4_c2; + float d10_b32 = d04_b32 + d07_b32; + float d11_b32 = d05_b32 + d06_b32; + float d12_b32 = d04_b32 - d07_b32; + float d13_b32 = d05_b32 - d06_b32; + + float 
d14_b32 = d08_b32 + d09_b32; + float d15_b32 = d10_b32 + d11_b32; + float d16_b32 = d08_b32 - d09_b32; + float d17_b32 = d10_b32 - d11_b32; + + float d18_b32 = d12_b32 * kMultipliers_N4_c1; + float d19_b32 = d13_b32 * kMultipliers_N4_c2; + float d20_b32 = d14_b32 * sqrt2; + float d21_b32 = d15_b32 * sqrt2; + + float d22_b32 = d18_b32 + d19_b32; + float d23_b32 = d18_b32 - d19_b32; + + float d24_b32 = d22_b32 * sqrt2; + + float d25_b32 = d24_b32 + d23_b32; + + float d26_b32 = d00_b32 + d01_b32; + float d27_b32 = d21_b32 + d25_b32; + float d28_b32 = d20_b32 + d16_b32; + float d29_b32 = d25_b32 + d17_b32; + float d30_b32 = d00_b32 - d01_b32; + float d31_b32 = d17_b32 + d23_b32; + float d32_b32 = d26_b32 * sqrt2; + + float tmp32_b32_add_out2_16 = t00_b32 + t01_b32; + float tmp32_b32_add_out2_17 = d32_b32 + d27_b32; + float tmp32_b32_add_out2_18 = t21_b32 + t25_b32; + float tmp32_b32_add_out2_19 = d27_b32 + d28_b32; + float tmp32_b32_add_out2_20 = t20_b32 + t16_b32; + float tmp32_b32_add_out2_21 = d28_b32 + d29_b32; + float tmp32_b32_add_out2_22 = t25_b32 + t17_b32; + float tmp32_b32_add_out2_23 = d29_b32 + d30_b32; + float tmp32_b32_add_out2_24 = t00_b32 - t01_b32; + float tmp32_b32_add_out2_25 = d30_b32 + d31_b32; + float tmp32_b32_add_out2_26 = t17_b32 + t23_b32; + float tmp32_b32_add_out2_27 = d31_b32 + d16_b32; + float tmp32_b32_add_out2_28 = t16_b32; + float tmp32_b32_add_out2_29 = d16_b32 + d23_b32; + float tmp32_b32_add_out2_30 = t23_b32; + float tmp32_b32_add_out2_31 = d23_b32; + + float tmp32_b32_out2_16 = tmp32_b32_add_out2_16 * sqrt2 + tmp32_b32_add_out2_17; + float tmp32_b32_out2_17 = tmp32_b32_add_out2_17 + tmp32_b32_add_out2_18; + float tmp32_b32_out2_18 = tmp32_b32_add_out2_18 + tmp32_b32_add_out2_19; + float tmp32_b32_out2_19 = tmp32_b32_add_out2_19 + tmp32_b32_add_out2_20; + float tmp32_b32_out2_20 = tmp32_b32_add_out2_20 + tmp32_b32_add_out2_21; + float tmp32_b32_out2_21 = tmp32_b32_add_out2_21 + tmp32_b32_add_out2_22; + float tmp32_b32_out2_22 = 
tmp32_b32_add_out2_22 + tmp32_b32_add_out2_23; + float tmp32_b32_out2_23 = tmp32_b32_add_out2_23 + tmp32_b32_add_out2_24; + float tmp32_b32_out2_24 = tmp32_b32_add_out2_24 + tmp32_b32_add_out2_25; + float tmp32_b32_out2_25 = tmp32_b32_add_out2_25 + tmp32_b32_add_out2_26; + float tmp32_b32_out2_26 = tmp32_b32_add_out2_26 + tmp32_b32_add_out2_27; + float tmp32_b32_out2_27 = tmp32_b32_add_out2_27 + tmp32_b32_add_out2_28; + float tmp32_b32_out2_28 = tmp32_b32_add_out2_28 + tmp32_b32_add_out2_29; + float tmp32_b32_out2_29 = tmp32_b32_add_out2_29 + tmp32_b32_add_out2_30; + float tmp32_b32_out2_30 = tmp32_b32_add_out2_30 + tmp32_b32_add_out2_31; + float tmp32_b32_out2_31 = tmp32_b32_add_out2_31; + + out[0 + 32 * i] = tmp32_b16_out1_0; + out[2 + 32 * i] = tmp32_b16_out1_1; + out[4 + 32 * i] = tmp32_b16_out1_2; + out[6 + 32 * i] = tmp32_b16_out1_3; + out[8 + 32 * i] = tmp32_b16_out1_4; + out[10 + 32 * i] = tmp32_b16_out1_5; + out[12 + 32 * i] = tmp32_b16_out1_6; + out[14 + 32 * i] = tmp32_b16_out1_7; + out[16 + 32 * i] = tmp32_b16_out1_8; + out[18 + 32 * i] = tmp32_b16_out1_9; + out[20 + 32 * i] = tmp32_b16_out1_10; + out[22 + 32 * i] = tmp32_b16_out1_11; + out[24 + 32 * i] = tmp32_b16_out1_12; + out[26 + 32 * i] = tmp32_b16_out1_13; + out[28 + 32 * i] = tmp32_b16_out1_14; + out[30 + 32 * i] = tmp32_b16_out1_15; + + out[1 + 32 * i] = tmp32_b32_out2_16; + out[3 + 32 * i] = tmp32_b32_out2_17; + out[5 + 32 * i] = tmp32_b32_out2_18; + out[7 + 32 * i] = tmp32_b32_out2_19; + out[9 + 32 * i] = tmp32_b32_out2_20; + out[11 + 32 * i] = tmp32_b32_out2_21; + out[13 + 32 * i] = tmp32_b32_out2_22; + out[15 + 32 * i] = tmp32_b32_out2_23; + out[17 + 32 * i] = tmp32_b32_out2_24; + out[19 + 32 * i] = tmp32_b32_out2_25; + out[21 + 32 * i] = tmp32_b32_out2_26; + out[23 + 32 * i] = tmp32_b32_out2_27; + out[25 + 32 * i] = tmp32_b32_out2_28; + out[27 + 32 * i] = tmp32_b32_out2_29; + out[29 + 32 * i] = tmp32_b32_out2_30; + out[31 + 32 * i] = tmp32_b32_out2_31; + } +} + +void hls_IDCT1D_32(float 
from[16], float to[16]) { +#pragma HLS INLINE off + float IDCT_kMUltipliers_N4_0 = 0.541196100146197; + float IDCT_kMUltipliers_N4_1 = 1.3065629648763764; + float sqrt2 = 1.4142135623730951f; + + for (int i = 0; i < 4; i++) { +#pragma HLS PIPELINE II = 128 + float* from_addr = &from[i]; + float* to_addr = &to[i]; + + float tmp_IDCT_in_0 = from[i + 0]; + float tmp_IDCT_in_1 = from[i + 8]; + float tmp_IDCT_in_2 = from[i + 4]; + float tmp_IDCT_in_3 = from[i + 12]; + + float tmp_IDCT_add_0 = tmp_IDCT_in_0 + tmp_IDCT_in_1; + float in1_dct = tmp_IDCT_in_2 * sqrt2; + float tmp_IDCT_add_1 = tmp_IDCT_in_0 - tmp_IDCT_in_1; + float in2_dct = tmp_IDCT_in_3 + tmp_IDCT_in_2; + + float tmp_IDCT_add_2 = in1_dct + in2_dct; + float tmp_IDCT_add_3 = in1_dct - in2_dct; + + to[i + 0] = IDCT_kMUltipliers_N4_0 * tmp_IDCT_add_2 + tmp_IDCT_add_0; + to[i + 4] = IDCT_kMUltipliers_N4_1 * tmp_IDCT_add_3 + tmp_IDCT_add_1; + to[i + 8] = tmp_IDCT_add_1 - IDCT_kMUltipliers_N4_1 * tmp_IDCT_add_3; + to[i + 12] = tmp_IDCT_add_0 - IDCT_kMUltipliers_N4_0 * tmp_IDCT_add_2; + } +} + +void hls_idct32_scale_2d(float in[16], float out[16]) { +#pragma HLS INLINE off + +Loop_idct32_1: + for (int y = 0; y < 4; y++) { + Loop_idct32_2: + for (int x = 0; x < 4; x++) { +#pragma HLS PIPELINE + float resampley; + float resamplex; + if (x == 0) { + resamplex = 1; + } else if (x == 1) { + resamplex = 0.974886834621429443359375; + } else if (x == 2) { + resamplex = 0.901764214038848876953125; + } else if (x == 3) { + resamplex = 0.78705489635467529296875; + } + if (y == 0) { + resampley = 1; + } else if (y == 1) { + resampley = 0.974886834621429443359375; + } else if (y == 2) { + resampley = 0.901764214038848876953125; + } else if (y == 3) { + resampley = 0.78705489635467529296875; + } + out[y * 4 + x] = in[y * 4 + x] * resampley * resamplex; + } + } +} + +void hls_idct_transpose4x4(float in[16], float out[16]) { +#pragma HLS INLINE off +Loop_idct_transpose: + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) 
{ +#pragma HLS PIPELINE + out[i * 4 + j] = in[j * 4 + i]; + } + } +} + +void load_dct32(hls::stream& opsin32x32_stream, float from[1024]) { +#pragma HLS INLINE off +load_dct32: + for (int m = 0; m < 32; m++) { + for (int n = 0; n < 32; n++) { +#pragma HLS PIPELINE II = 1 + int addr = n * 32 + m; // m * 32 + n; + from[addr] = opsin32x32_stream.read(); + } + } +} + +void transpose_dct32(float in[1024], float out[1024]) { +#pragma HLS INLINE off + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { +#pragma HLS PIPELINE II = 1 + out[32 * j + i] = in[i * 32 + j]; + } + } +} + +void transpose_scaled_dct32(float in[1024], float out[1024]) { +#pragma HLS INLINE off + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { +#pragma HLS PIPELINE II = 1 + float mul = 1.0f / 32.0f; + float temp = mul * in[i * 32 + j]; + out[32 * j + i] = temp; + } + } +} + +void split_ac_dc_dct32(float in[1024], float to_ac[1024], float to_dc[16]) { +#pragma HLS INLINE off + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { +#pragma HLS PIPELINE II = 1 + to_ac[32 * i + j] = in[32 * i + j]; + + if (j < 4 && i < 4) { + to_dc[i * 4 + j] = in[32 * i + j]; + } + } + } +} + +void scaled_dct32(float in[1024], float out[1024]) { +#pragma HLS INLINE off + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { +#pragma HLS PIPELINE II = 1 + float mul = 1.0f / 32.0f; + out[32 * i + j] = mul * in[i * 32 + j]; + } + } +} + +void feed_dct32_ac(uint32_t x32, + uint32_t y32, + hls::stream& stream_rectx32, + hls::stream& stream_recty32, + float in[1024], + hls::stream& ac_coef32x32_stream) { +#pragma HLS INLINE off + uint32_t rect_xsize; + uint32_t rect_ysize; +loop_feed_dct32_ac: + for (int m = 0; m < 1024; m++) { +#pragma HLS PIPELINE II = 1 + if (m == 0) { + rect_xsize = stream_rectx32.read(); + rect_ysize = stream_recty32.read(); + } + if ((4 * x32 + 3) < rect_xsize && (4 * y32 + 3) < rect_ysize) { + // printf("feed ac %d %d %d\n", in[m], rect_xsize, rect_ysize); 
+ ac_coef32x32_stream.write(in[m]); + } + } +} + +void feed_dct32_dc(uint32_t x32, + uint32_t y32, + hls::stream& stream_rectx32, + hls::stream& stream_recty32, + float dc_mem[16], + hls::stream& dc_coef32x32_stream) { +#pragma HLS INLINE off + uint32_t rect_xsize; + uint32_t rect_ysize; +loop_feed_dct32_dc: + for (int m = 0; m < 16; m++) { +#pragma HLS PIPELINE II = 1 + if (m == 0) { + rect_xsize = stream_rectx32.read(); + rect_ysize = stream_recty32.read(); + } + if ((4 * x32 + 3) < rect_xsize && (4 * y32 + 3) < rect_ysize) { + dc_coef32x32_stream.write(dc_mem[m]); + } + } +} + +void hls_dct32x32_module(unsigned ysize, + unsigned xsize, + hls::stream& stream_recty32, + hls::stream& stream_rectx32, + hls::stream& stream_recty32_1, + hls::stream& stream_rectx32_1, + hls::stream& opsin32x32_stream, + hls::stream& ac_coef32x32_stream, + hls::stream& dc_coef32x32_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + + int ysize64 = tile_ysize / 64; + int xsize64 = tile_xsize / 64; + + float from[1024]; +#pragma HLS bind_storage variable = from type = ram_2p impl = bram + float temp0[1024]; +#pragma HLS bind_storage variable = temp0 type = ram_2p impl = bram + float temp1[1024]; +#pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram + float temp2[1024]; +#pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram + float temp3[1024]; +#pragma HLS bind_storage variable = temp3 type = ram_2p impl = bram + float temp4[1024]; +#pragma HLS bind_storage variable = temp4 type = ram_2p impl = bram + float temp5[1024]; +#pragma HLS bind_storage variable = temp4 type = ram_2p impl = bram + float to_ac[1024]; +#pragma HLS bind_storage variable = to_ac type = ram_2p impl = bram + float to_dc[16]; +#pragma HLS bind_storage variable = to_dc type = ram_2p impl = bram + float dc_mem[16]; +#pragma HLS bind_storage variable = to_dc type = ram_2p impl = bram + float dc_temp0[16]; +#pragma HLS 
bind_storage variable = temp0 type = ram_2p impl = bram + float dc_temp1[16]; +#pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram + float dc_temp2[16]; +#pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram + +loop_dct32_tile_y: + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + for (uint32_t y32 = 0; y32 < 2; y32++) { + for (uint32_t x32 = 0; x32 < 2; x32++) { + for (int c = 0; c < 3; c++) { +#pragma HLS DATAFLOW + load_dct32(opsin32x32_stream, from); + hls_DCT1DImpl_32(from, temp1); + scaled_dct32(temp1, temp2); + transpose_dct32(temp2, temp3); + hls_DCT1DImpl_32(temp3, temp4); + transpose_scaled_dct32(temp4, temp5); + split_ac_dc_dct32(temp5, to_ac, to_dc); + // output ac_coeff dct32 + feed_dct32_ac(x32, y32, stream_rectx32, stream_recty32, to_ac, ac_coef32x32_stream); + // ouput dc_coeff dct32 + hls_idct32_scale_2d(to_dc, dc_temp0); + hls_IDCT1D_32(dc_temp0, dc_temp1); + hls_idct_transpose4x4(dc_temp1, dc_temp2); + hls_IDCT1D_32(dc_temp2, dc_mem); + feed_dct32_dc(x32, y32, stream_rectx32_1, stream_recty32_1, dc_mem, dc_coef32x32_stream); + } + } + } + } + } +} + +int Div_Ceil2(int a, int b) { +#pragma HLS inline + return (a + b - 1) / b; +} + +void GetRectSizeDCT(short xsize, + short ysize, + hls::stream& stream_rectx_dct, + hls::stream& stream_recty_dct, + hls::stream& stream_rectx32, + hls::stream& stream_recty32, + hls::stream& stream_rectx32_1, + hls::stream& stream_recty32_1, + hls::stream& stream_rectx16, + hls::stream& stream_recty16, + hls::stream& stream_rectx16_1, + hls::stream& stream_recty16_1, + hls::stream& stream_rectx8, + hls::stream& stream_recty8, + hls::stream& stream_rectx8_1, + hls::stream& stream_recty8_1) { + uint16_t xsize_blocks = xsize / 8; + uint16_t ysize_blocks = ysize / 8; +LOOP_0: + for (uint16_t y = 0; y < Div_Ceil2(ysize_blocks, 8); y++) { + LOOP_1: + for (uint16_t x = 0; x < Div_Ceil2(xsize_blocks, 8); x++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max 
= 64 + // uint16_t by = y * 8; + // uint16_t by1 = ((y + 1) * 8) < ysize_blocks ? ((y + 1) * 8) : ysize_blocks; + // uint16_t bx = x * 8; + // uint16_t bx1 = ((x + 1) * 8) < xsize_blocks ? ((x + 1) * 8) : xsize_blocks; + // uint8_t rect_ysize = by1 - by; + // uint8_t rect_xsize = bx1 - bx; + uint8_t rect_ysize = stream_recty_dct.read(); + uint8_t rect_xsize = stream_rectx_dct.read(); + // printf("rect_xsize=%d, rect_ysize=%d\n", rect_xsize, rect_ysize); + for (int i = 0; i < 192; i++) { + if (i < 12) { + stream_rectx32.write(rect_xsize); + stream_recty32.write(rect_ysize); + stream_rectx32_1.write(rect_xsize); + stream_recty32_1.write(rect_ysize); + } + if (i < 48) { + stream_rectx16.write(rect_xsize); + stream_recty16.write(rect_ysize); + stream_rectx16_1.write(rect_xsize); + stream_recty16_1.write(rect_ysize); + } + stream_rectx8.write(rect_xsize); + stream_recty8.write(rect_ysize); + stream_rectx8_1.write(rect_xsize); + stream_recty8_1.write(rect_ysize); + } + } + } +} + +void hls_dct_top(unsigned ysize, + unsigned xsize, + hls::stream& stream_rectx_dct, + hls::stream& stream_recty_dct, + hls::stream& opsin8x8_stream, + hls::stream& opsin16x16_stream, + hls::stream& opsin32x32_stream, + hls::stream& ac_coef8x8_stream, + hls::stream& ac_coef16x16_stream, + hls::stream& ac_coef32x32_stream, + hls::stream& dc_coef8x8_stream, + hls::stream& dc_coef16x16_stream, + hls::stream& dc_coef32x32_stream) { + // #pragma HLS INLINE + hls::stream stream_rectx32; + hls::stream stream_recty32; + hls::stream stream_rectx32_1; + hls::stream stream_recty32_1; + hls::stream stream_rectx16; + hls::stream stream_recty16; + hls::stream stream_rectx16_1; + hls::stream stream_recty16_1; + hls::stream stream_rectx8; + hls::stream stream_recty8; + hls::stream stream_rectx8_1; + hls::stream stream_recty8_1; +// #pragma HLS DATAFLOW +#pragma HLS INLINE + GetRectSizeDCT(xsize, ysize, stream_rectx_dct, stream_recty_dct, stream_rectx32, stream_recty32, stream_rectx32_1, + stream_recty32_1, 
stream_rectx16, stream_recty16, stream_rectx16_1, stream_recty16_1, stream_rectx8, + stream_recty8, stream_rectx8_1, stream_recty8_1); + hls_dct8x8_module(ysize, xsize, stream_recty8, stream_rectx8, stream_recty8_1, stream_rectx8_1, opsin8x8_stream, + ac_coef8x8_stream, dc_coef8x8_stream); + hls_dct16x16_module(ysize, xsize, stream_recty16, stream_rectx16, stream_recty16_1, stream_rectx16_1, + opsin16x16_stream, ac_coef16x16_stream, dc_coef16x16_stream); + hls_dct32x32_module(ysize, xsize, stream_recty32, stream_rectx32, stream_recty32_1, stream_rectx32_1, + opsin32x32_stream, ac_coef32x32_stream, dc_coef32x32_stream); +} + +//-----------------------acs_heuristic---------------------// + +int Div_Ceil(int a, int b) { +#pragma HLS inline + return (a + b - 1) / b; +} + +float EvalRationalPolynomial3_2(float x, float p[3], float q[3]) { + float yp = p[2]; + float yq = q[2]; + yp = (yp * x) + p[1]; + yq = (yq * x) + q[1]; + yp = (yp * x) + p[0]; + yq = (yq * x) + q[0]; + return yp / yq; +} + +float FastLog2f_HLS2(float x) { + union { + float x_f; + int x_i; + } u = {x}; + float p[3] = {-1.8503833400518310E-06f, 1.4287160470083755E+00f, 7.4245873327820566E-01f}; + float q[3] = {9.9032814277590719E-01f, 1.0096718572241148E+00f, 1.7409343003366853E-01f}; + int x_bits = u.x_i; + int exp_bits = x_bits - 0x3f2aaaab; // = 2/3 + int exp_shifted = exp_bits >> 23; + int result0 = exp_shifted << 23; + int result = x_bits - result0; + u.x_i = result; + float mantissa = u.x_f; + float exp_val = static_cast(exp_shifted); + float output = EvalRationalPolynomial3_2(mantissa - 1.0f, p, q) + exp_val; + return output; +} + +float FastPow2f_HLS(float x) { + int floorx = floor(x); + int tmp = ((floorx + 127) << 23); + union { + float x_f; + int x_i; + } u; + u.x_i = tmp; + float exp = u.x_f; + float frac = x - floorx; + float num = frac + 1.01749063e+01; + num = num * frac + 4.88687798e+01; + num = num * frac + 9.85506591e+01; + num = num * exp; + float den = frac * 2.10242958e-01 - 
2.22328856e-02; + den = den * frac - 1.94414990e+01; + den = den * frac + 9.85506633e+01; + return num / den; +} + +float FastPowf_HLS(float base, float exponent) { + return FastPow2f_HLS(FastLog2f_HLS2(base) * exponent); +} + +int CeilLog2NonzeroHLS(ap_int<32> x) { + int leading_zeros = x.countLeadingZeros(); + int floor_log2 = 63 ^ (leading_zeros + 32); + if ((x & (x - 1)) != 0) { + floor_log2 = floor_log2 + 1; + } + return floor_log2; +} + +void GetACSSize(short xsize, + short ysize, + hls::stream& stream_rectx_acs, + hls::stream& stream_recty_acs, + hls::stream& stream_rectx0, + hls::stream& stream_recty0, + hls::stream& stream_rectx1, + hls::stream& stream_recty1, + hls::stream& stream_rectx2, + hls::stream& stream_recty2, + hls::stream& stream_rectx3, + hls::stream& stream_recty3, + hls::stream& stream_rectx10, + hls::stream& stream_recty10) { + uint16_t xsize_blocks = xsize / 8; + uint16_t ysize_blocks = ysize / 8; +LOOP_0: + for (uint16_t y = 0; y < Div_Ceil(ysize_blocks, 8); y++) { + LOOP_1: + for (uint16_t x = 0; x < Div_Ceil(xsize_blocks, 8); x++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + // uint16_t by = y * 8; + // uint16_t by1 = ((y + 1) * 8) < ysize_blocks ? ((y + 1) * 8) : ysize_blocks; + // uint16_t bx = x * 8; + // uint16_t bx1 = ((x + 1) * 8) < xsize_blocks ? 
((x + 1) * 8) : xsize_blocks; + // uint8_t rect_ysize = by1 - by; + // uint8_t rect_xsize = bx1 - bx; + uint8_t rect_ysize = stream_recty_acs.read(); + uint8_t rect_xsize = stream_rectx_acs.read(); + stream_rectx0.write(rect_xsize); + stream_recty0.write(rect_ysize); + stream_rectx1.write(rect_xsize); + stream_recty1.write(rect_ysize); + stream_rectx2.write(rect_xsize); + stream_recty2.write(rect_ysize); + stream_rectx3.write(rect_xsize); + stream_recty3.write(rect_ysize); + stream_rectx10.write(rect_xsize); + stream_recty10.write(rect_ysize); + } + } +} + +void DupQuantAndMask(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_q_org_8, + hls::stream& stream_mask_org_8, + hls::stream& stream_q_org_16, + hls::stream& stream_mask_org_16, + hls::stream& stream_q_org_32, + hls::stream& stream_mask_org_32) { +DUP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + DUP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + DUP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + // do computation once for 16 and 32 + float tmp0 = stream_q_org.read(); + stream_q_org_8.write(tmp0); + tmp0 *= tmp0; + tmp0 *= tmp0; + tmp0 *= tmp0; + stream_q_org_16.write(tmp0); + stream_q_org_32.write(tmp0); + float tmp1 = stream_mask_org.read(); + stream_mask_org_8.write(tmp1); + stream_mask_org_16.write(tmp1); + stream_mask_org_32.write(tmp1); + } + } + } +} + +void GetQAndMask_8(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_q, + hls::stream& stream_mask) { +LOOP_0: + for (uint16_t tid = 
0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS pipeline II = 64 + float quant_norm8 = 0; + float masking = 0; + quant_norm8 = stream_q_org.read(); + stream_q.write(quant_norm8); + masking = 2.0f * stream_mask_org.read(); + stream_mask.write(masking); + } + } + } +} + +template +void GetQAndMask_16_32(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_q, + hls::stream& stream_mask) { + uint8_t block_n = N * N; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + float q_array[64]; +#pragma HLS BIND_STORAGE variable = q_array type = RAM_1P impl = bram + float mask_array[64]; +#pragma HLS BIND_STORAGE variable = mask_array type = RAM_1P impl = bram + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS loop_flatten off + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS pipeline II = 1 +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS loop_flatten off + int index = iy * 8 + ix; + q_array[index] = stream_q_org.read(); + mask_array[index] = stream_mask_org.read(); + } + } + LOOP_3: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS 
loop_flatten off + LOOP_4: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS loop_flatten off + float quant_norm8 = 0; + float masking = 0; + float masking_norm2 = 0; + float masking_max = 0; + LOOP_5: + for (uint8_t dy = 0; dy < N; dy++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS loop_flatten off + LOOP_6: + for (uint8_t dx = 0; dx < N; dx++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS loop_flatten off +#pragma HLS pipeline + uint8_t idx = (iy + dy) * 8 + ix + dx; + float qval = q_array[idx]; + quant_norm8 += qval; + float maskval = mask_array[idx]; + masking_max = fmax(masking_max, maskval); + masking_norm2 += maskval * maskval; + } + } + quant_norm8 /= block_n; + // Change: use 3 sqrtf to replace FastPowf_HLS, and try to only use on sqrtf to do all things + // float tmp = quant_norm8; + // quant_norm8 = sqrtf(quant_norm8); + // quant_norm8 = sqrtf(quant_norm8); + // quant_norm8 = sqrtf(quant_norm8); + LOOP_7: + for (int dx = 0; dx < 3; dx++) { +#pragma HLS pipeline + quant_norm8 = sqrtf(quant_norm8); + } + // quant_norm8 = FastPowf_HLS(quant_norm8, 1.0f / 8.0f); + masking_norm2 = sqrtf(masking_norm2 / block_n); + masking = masking_norm2 + masking_max; + stream_q.write(quant_norm8); + stream_mask.write(masking); + } + } + } +} + +template +void ComputeEntropy1(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_q, + hls::stream& stream_dctin, +#ifdef FIX + hls::stream >& stream_loss, + hls::stream >& stream_loss2, + hls::stream >& stream_entropy, + hls::stream >& stream_nzeros +#else + hls::stream& stream_loss, + hls::stream& stream_loss2, + hls::stream& stream_entropy, + hls::stream& stream_nzeros +#endif + ) { + uint8_t block_n = N * N; + int count_array; + float info_loss = 0.0; + float info_loss2 = 0.0; + float entropy = 0.0; + float zeros_mul = 
7.565053364251793f; + float cost2 = 4.4628149885273363f; + float cost_delta = 5.3359184934516337f; + float cmap_factor; + float q; + float entropy_v[3] = {0.0, 0.0, 0.0}; + float nzeros_v[3] = {0.0, 0.0, 0.0}; + float entropy_array[8]; + float info_loss_array[8]; + float info_loss2_array[8]; + float nzeros_array[8]; + float y_ram[1024]; + float cmap_factors_init[3] = {0.0f, 0.0f, 1.0f}; +#ifdef FIX + ap_int<23> info_loss_fix[8]; + ap_int<45> info_loss2_fix[8]; + ap_int<11> nzeros_fix[8]; + ap_int<32> y_fix_ram[1024]; + ap_int<32> cost2_fix = (int)(cost2 * 1024); + ap_int<32> cost_delta_fix = (int)(cost_delta * 1024); + ap_int<28> info_loss_sum; + ap_int<44> info_loss2_sum; + ap_int<11> nzeros_sum; + ap_int<42> entropy_sum; +#endif +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint16_t rect_ysize = stream_recty.read(); + uint16_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + float q_tmp[64]; + LOOP_1: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_3: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 + LOOP_4: + for (uint16_t i = 0; i < block_n * 64; i += 1) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 +#pragma HLS pipeline II = 1 + float in = stream_dctin.read(); +#ifdef FIX + if (i == 0) { + nzeros_sum = 0; + entropy_sum = 0; + if (c == 0) { + q = stream_q.read(); + info_loss_sum = 0; + info_loss2_sum = 0; + } + } + ap_int<30> in_fix = in * 0x1fffffff; // exp=29 + float in_fix_y_tmp; + if (c == 0) { + y_fix_ram[i] = in_fix; + in_fix_y_tmp = in_fix; + } else { + in_fix_y_tmp = y_fix_ram[i]; + } + ap_int<30> in_fix_y = (c == 2) ? 
in_fix_y_tmp : 0; + ap_int<31> in_fix_m = in_fix - in_fix_y; + + ap_uint<24> im_fix; + if (N == 1) { + im_fix = inv_matrix_8_fix[c][i]; // exp=10 + } + if (N == 2) { + im_fix = inv_matrix_16_fix[c][i]; + } + if (N == 4) { + im_fix = inv_matrix_32_fix[c][i]; + } + + ap_uint<15> rqf_fix = q * 32768; // exp=15 + ap_int<55> val_tmp0 = in_fix_m * im_fix; // exp=29+10=39 + ap_int<28> val_tmp1 = val_tmp0 >> 27; // exp=39-27=12 + ap_int<43> val_tmp2 = val_tmp1 * rqf_fix; // exp=12+15=27 + ap_int<35> val_fix = val_tmp2 >> 11; // exp=27-11=16 + + // actual value is not that large, so just reduce bitwidth + ap_int<11> val_shift0 = val_fix >> 15; + ap_int<10> val_shift1 = val_fix >> 16; + if (val_shift0.range(0, 0) == 1) { + val_shift1 += 1; + } + ap_int<10> rval_fix = val_shift1; // exp=0 + ap_int<32> val_shift_back = val_shift1 * 65536; // exp=16 + ap_uint<16> diff_fix = hls::abs(val_shift_back - val_fix); // exp=-16 hls_abs? + ap_uint<32> diff_fix_square = diff_fix * diff_fix; // exp=-32 + ap_uint<10> q_fix = hls::abs(rval_fix); // hls_abs? + bool q_fix_is_zero = q_fix == 0; + float entropy_tmp = (q_fix > 1 ? cost2 : 0.0f) + sqrtf(q_fix) * cost_delta; + ap_uint<32> entropy_fix = (uint32_t)(entropy_tmp * 65536); + + info_loss_sum += diff_fix; + info_loss2_sum += diff_fix_square; + nzeros_sum += q_fix_is_zero ? 
0 : 1; + entropy_sum += entropy_fix; + + if (i == block_n * 64 - 1) { + stream_entropy.write(entropy_sum); + stream_nzeros.write(nzeros_sum); + } + if (i == block_n * 64 - 1 && c == 2) { + stream_loss.write(info_loss_sum); + stream_loss2.write(info_loss2_sum); + } +#else + if (c == 0 && i == 0) { + q = stream_q.read(); + count_array = 0; + } + cmap_factor = cmap_factors_init[c]; + float in_y_tmp; + if (c == 0) { + y_ram[i] = in; + in_y_tmp = in; + } else { + in_y_tmp = y_ram[i]; + } + float in_y = in_y_tmp * cmap_factor; + float im; + if (N == 1) { + im = inv_matrix_8[c][i]; + } + if (N == 2) { + im = inv_matrix_16[c][i]; + } + if (N == 4) { + im = inv_matrix_32[c][i]; + } + const float val = (in - in_y) * im * q; + const int rval = roundf(val); + const float diff = fabs(val - rval); + + info_loss_array[count_array] = diff; + info_loss2_array[count_array] = diff * diff; + + const int q = abs(rval); + const bool q_is_zero = q == 0; + float tmp = (q >= 1.5f ? cost2 : 0.0f) + sqrtf(q) * cost_delta; + entropy_array[count_array] = tmp; + nzeros_array[count_array] = q_is_zero ? 
0.0f : 1.0f; + count_array++; + if (count_array == 8) { + float sum0 = entropy_array[0] + entropy_array[1] + entropy_array[2] + entropy_array[3] + + entropy_array[4] + entropy_array[5] + entropy_array[6] + entropy_array[7]; + stream_entropy.write(sum0); + float sum1 = nzeros_array[0] + nzeros_array[1] + nzeros_array[2] + nzeros_array[3] + + nzeros_array[4] + nzeros_array[5] + nzeros_array[6] + nzeros_array[7]; + stream_nzeros.write(sum1); + float sum2 = info_loss_array[0] + info_loss_array[1] + info_loss_array[2] + + info_loss_array[3] + info_loss_array[4] + info_loss_array[5] + + info_loss_array[6] + info_loss_array[7]; + stream_loss.write(sum2); + float sum3 = info_loss2_array[0] + info_loss2_array[1] + info_loss2_array[2] + + info_loss2_array[3] + info_loss2_array[4] + info_loss2_array[5] + + info_loss2_array[6] + info_loss2_array[7]; + stream_loss2.write(sum3); + count_array = 0; + } +#endif + } // loop i + } + } + } + } +} + +template +void ComputeEntropy2(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_loss, + hls::stream& stream_loss2, + hls::stream& stream_entropy, + hls::stream& stream_nzeros, + hls::stream& stream_loss_sum, + hls::stream& stream_loss2_sum, + hls::stream& stream_entropy_sum, + hls::stream& stream_nzeros_sum) { + float entropy_v[3]; + float nzeros_v[3]; + float info_loss; + float info_loss2; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + LOOP_1: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_3: + for (uint8_t c = 0; c < 3; c++) { 
+#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 + LOOP_4: + for (uint8_t i = 0; i < 64 * N * N / 8; i += 1) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS pipeline II = 8 + if (c == 0 && i == 0) { + info_loss = 0.0; + info_loss2 = 0.0; + } + if (i == 0) { + entropy_v[c] = 0; + nzeros_v[c] = 0; + } + entropy_v[c] += stream_entropy.read(); + nzeros_v[c] += stream_nzeros.read(); + info_loss += stream_loss.read(); + info_loss2 += stream_loss2.read(); + if (i == 64 * N * N / 8 - 1) { + stream_entropy_sum.write(entropy_v[c]); + stream_nzeros_sum.write(nzeros_v[c]); + } + if (c == 2 && i == 64 * N * N / 8 - 1) { + stream_loss_sum.write(info_loss); + stream_loss2_sum.write(info_loss2); + } + } + } + } + } + } +} + +template +void ComputeEntropy3(uint16_t num_tile, + float cost1, + float mul, + hls::stream& stream_rectx, + hls::stream& stream_recty, +#ifdef FIX + hls::stream >& stream_loss, + hls::stream >& stream_loss2, + hls::stream >& stream_entropy, + hls::stream >& stream_nzeros, +#else + hls::stream& stream_loss_sum, + hls::stream& stream_loss2_sum, + hls::stream& stream_entropy_sum, + hls::stream& stream_nzeros_sum, +#endif + hls::stream& stream_mask, + hls::stream& stream_entropy_final) { +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + LOOP_1: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#ifdef FIX + float entropy = 0.0; + float zeros_mul = 7.565053364251793f; + float entropy_v[3]; + ap_int<11> nzeros_v[3]; + float entropy_bits[3] = {0.0, 0.0, 0.0}; + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 +#pragma HLS pipeline + ap_int<42> entropy_tmp = stream_entropy.read(); + entropy_v[c] = entropy_tmp / 65536.0; + 
nzeros_v[c] = stream_nzeros.read(); + entropy_v[c] += nzeros_v[c] * cost1; + uint8_t nbits = LUTCeilLog2Nonzero[(nzeros_v[c] + 1)] + 1; + entropy_bits[c] = zeros_mul * (LUTCeilLog2Nonzero[nbits + 17] + nbits); + } + entropy = entropy_v[0] + entropy_v[1] + entropy_v[2]; + entropy += entropy_bits[0] + entropy_bits[1] + entropy_bits[2]; + ap_int<28> tmp_loss = stream_loss.read(); + float loss_f = tmp_loss / 65536.0; + ap_int<44> tmp_loss2 = stream_loss2.read(); + float loss2_f = tmp_loss2 / 65536.0 / 65536.0; + float info_loss_multiplier = 138.0f; + float info_loss_multiplier2 = 50.46839691767866; + float loss = ((info_loss_multiplier * loss_f) + (info_loss_multiplier2 * N * sqrtf(loss2_f))); + float loss_mask = stream_mask.read() * loss; + float ret = entropy + loss_mask; + if (N == 1) { + ret = 3.0f + 0.745f * ret; + } + ret = ret * mul; + stream_entropy_final.write(ret); +#else + float entropy = 0.0; + float zeros_mul = 7.565053364251793f; + float entropy_v[3] = {0.0, 0.0, 0.0}; + float nzeros_v[3] = {0.0, 0.0, 0.0}; + float entropy_bits[3] = {0.0, 0.0, 0.0}; + for (int c = 0; c < 3; c++) { +#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 +#pragma HLS pipeline + + entropy_v[c] = stream_entropy_sum.read(); + nzeros_v[c] = stream_nzeros_sum.read(); + entropy_v[c] += nzeros_v[c] * cost1; + // TODO: Integer to integer, can we use look up table to implement this? 
+ // int nbits = CeilLog2NonzeroHLS(nzeros_v[c] + 1) + 1; + // entropy_bits[c] = zeros_mul * (CeilLog2NonzeroHLS(nbits + 17) + nbits); + int nbits = LUTCeilLog2Nonzero[(short)(nzeros_v[c] + 1)] + 1; + entropy_bits[c] = zeros_mul * (LUTCeilLog2Nonzero[nbits + 17] + nbits); + } + entropy = entropy_v[0] + entropy_v[1] + entropy_v[2]; + entropy += entropy_bits[0] + entropy_bits[1] + entropy_bits[2]; + float tmp_loss = stream_loss_sum.read(); + float tmp_loss2 = stream_loss2_sum.read(); + float info_loss_multiplier = 138.0f; + float info_loss_multiplier2 = 50.46839691767866; + float ret = entropy + + stream_mask.read() * ((info_loss_multiplier * tmp_loss) + + (info_loss_multiplier2 * sqrtf((float)(N * N * tmp_loss2)))); + if (N == 1) { + ret = 3.0f + 0.745f * ret; + } + stream_entropy_final.write(ret * mul); +#endif + } + } + } +} + +template +void BufferN(uint16_t num_tile, + float* ping, + float* pang, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_dctin, + hls::stream& stream_con, + hls::stream& stream_ok) { + uint8_t block = N; + uint8_t block_n = N * N; + bool flag = true; + uint16_t size = 4096; + uint8_t w = 64; + uint16_t total_size = 4096 * 3; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_con.write(1); + LOOP_1: + for (uint8_t jy = 0; jy < 8 / N; jy += 1) { + LOOP_2: + for (uint8_t jx = 0; jx < 8 / N; jx += 1) { + LOOP_3: + for (uint8_t c = 0; c < 3; c++) { + LOOP_4: + for (uint8_t iy = 0; iy < 8 * N; iy += 1) { + LOOP_5: + for (uint8_t ix = 0; ix < 8 * N; ix += 1) { + uint8_t y = jy * 8 * N + iy; + uint8_t x = jx * 8 * N + ix; + bool read = false; + if (N == 1 && (jy < rect_ysize) && (jx < rect_xsize)) { + read = true; + } + if (N == 2 && (jy * 2 + 1) < rect_ysize && (jx * 2 + 1) < rect_xsize) { + read = true; + } + if (N == 4 && (jy * 4 + 3) < rect_ysize && (jx * 4 + 3) < 
rect_xsize) { + read = true; + } + if (read) { + float tmp = stream_dctin.read(); +#ifdef __SYNTHESIS__ + if (flag) { + ping[c * size + y * w + x] = tmp; + } else { + pang[c * size + y * w + x] = tmp; + } +#else + if (flag) { + ping[total_size * tid + c * size + y * w + x] = tmp; + } else { + pang[total_size * tid + c * size + y * w + x] = tmp; + } +#endif + } + } + } + } + } + } + flag = !flag; + stream_ok.write(1); + } +} + +void EstimateEntropy8(uint16_t num_tile, + float cost1, + float mul8x8, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_dctin, + hls::stream& stream_entropy_8) { +#pragma HLS inline + + hls::stream stream_rectx80("rectx80"); +#pragma HLS stream variable = stream_rectx80 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx80 type = fifo + hls::stream stream_recty80("recty80"); +#pragma HLS stream variable = stream_recty80 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty80 type = fifo + hls::stream stream_rectx81("rectx81"); +#pragma HLS stream variable = stream_rectx81 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx81 type = fifo + hls::stream stream_recty81("recty81"); +#pragma HLS stream variable = stream_recty81 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty81 type = fifo + hls::stream stream_rectx82("rectx82"); +#pragma HLS stream variable = stream_rectx82 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx82 type = fifo + hls::stream stream_recty82("recty82"); +#pragma HLS stream variable = stream_recty82 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty82 type = fifo + +#ifdef FIX + hls::stream > stream_loss("loss_8"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream > stream_loss2("loss2_8"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = 
stream_loss2 type = fifo + hls::stream > stream_entropy("entropy_8"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream > stream_nzeros("nzeros_8"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#else + hls::stream stream_loss("loss_8"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream stream_loss2("loss2_8"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream stream_entropy("entropy_8"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream stream_nzeros("nzeros_8"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#endif + hls::stream stream_loss_sum("loss_8_sum"); + ; +#pragma HLS stream variable = stream_loss_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss_sum type = fifo + hls::stream stream_loss2_sum("loss2_8_sum"); +#pragma HLS stream variable = stream_loss2_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2_sum type = fifo + hls::stream stream_entropy_sum("entropy_8_sum"); +#pragma HLS stream variable = stream_entropy_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_sum type = fifo + hls::stream stream_nzeros_sum("nzeros_8_sum"); +#pragma HLS stream variable = stream_nzeros_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros_sum type = fifo + hls::stream stream_q("q_8"); +#pragma HLS stream variable = stream_q depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q type = fifo + hls::stream stream_mask("mask_8"); +#pragma HLS stream variable = stream_mask depth = 1024 +#pragma HLS BIND_STORAGE variable = 
stream_mask type = fifo + + // #pragma HLS dataflow + GetQAndMask_8(num_tile, stream_rectx, stream_recty, stream_rectx80, stream_recty80, stream_q_org, stream_mask_org, + stream_q, stream_mask); + +#ifdef FIX + ComputeEntropy1<1>(num_tile, stream_rectx80, stream_recty80, stream_rectx81, stream_recty81, stream_q, stream_dctin, + stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy3<1>(num_tile, cost1, mul8x8, stream_rectx81, stream_recty81, stream_loss, stream_loss2, + stream_entropy, stream_nzeros, stream_mask, stream_entropy_8); + +#else + + ComputeEntropy1<1>(num_tile, stream_rectx80, stream_recty80, stream_rectx81, stream_recty81, stream_q, stream_dctin, + stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy2<1>(num_tile, stream_rectx81, stream_recty81, stream_rectx82, stream_recty82, stream_loss, + stream_loss2, stream_entropy, stream_nzeros, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum); + + ComputeEntropy3<1>(num_tile, cost1, mul8x8, stream_rectx82, stream_recty82, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum, stream_mask, stream_entropy_8); +#endif +} + +void EstimateEntropy16(uint16_t num_tile, + float cost1, + float mul16x16, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_dctin, + hls::stream& stream_entropy_16) { +#pragma HLS inline + + hls::stream stream_rectx160("rectx160"); +#pragma HLS stream variable = stream_rectx160 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx160 type = fifo + hls::stream stream_recty160("recty160"); +#pragma HLS stream variable = stream_recty160 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty160 type = fifo + hls::stream stream_rectx161("rectx161"); +#pragma HLS stream variable = stream_rectx161 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx161 type = fifo + hls::stream 
stream_recty161("recty161"); +#pragma HLS stream variable = stream_recty161 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty161 type = fifo + hls::stream stream_rectx162("rectx162"); +#pragma HLS stream variable = stream_rectx162 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx162 type = fifo + hls::stream stream_recty162("recty162"); +#pragma HLS stream variable = stream_recty162 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty162 type = fifo + +#ifdef FIX + hls::stream > stream_loss("loss_16"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream > stream_loss2("loss2_16"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream > stream_entropy("entropy_16"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream > stream_nzeros("nzeros_16"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#else + hls::stream stream_loss("loss_16"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream stream_loss2("loss2_16"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream stream_entropy("entropy_16"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream stream_nzeros("nzeros_16"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#endif + hls::stream stream_loss_sum("loss_16_sum"); + ; +#pragma HLS stream variable = stream_loss_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss_sum type = fifo + 
hls::stream stream_loss2_sum("loss2_16_sum"); +#pragma HLS stream variable = stream_loss2_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2_sum type = fifo + hls::stream stream_entropy_sum("entropy_16_sum"); +#pragma HLS stream variable = stream_entropy_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_sum type = fifo + hls::stream stream_nzeros_sum("nzeros_16_sum"); +#pragma HLS stream variable = stream_nzeros_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros_sum type = fifo + hls::stream stream_q("q_16"); +#pragma HLS stream variable = stream_q depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q type = fifo + hls::stream stream_mask("mask_16"); +#pragma HLS stream variable = stream_mask depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask type = fifo + + // #pragma HLS dataflow + GetQAndMask_16_32<2>(num_tile, stream_rectx, stream_recty, stream_rectx160, stream_recty160, stream_q_org, + stream_mask_org, stream_q, stream_mask); + +#ifdef FIX + + ComputeEntropy1<2>(num_tile, stream_rectx160, stream_recty160, stream_rectx161, stream_recty161, stream_q, + stream_dctin, stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy3<2>(num_tile, cost1, mul16x16, stream_rectx161, stream_recty161, stream_loss, stream_loss2, + stream_entropy, stream_nzeros, stream_mask, stream_entropy_16); + +#else + + ComputeEntropy1<2>(num_tile, stream_rectx160, stream_recty160, stream_rectx161, stream_recty161, stream_q, + stream_dctin, stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy2<2>(num_tile, stream_rectx161, stream_recty161, stream_rectx162, stream_recty162, stream_loss, + stream_loss2, stream_entropy, stream_nzeros, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum); + + ComputeEntropy3<2>(num_tile, cost1, mul16x16, stream_rectx162, stream_recty162, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum, stream_mask, 
stream_entropy_16); +#endif +} + +void EstimateEntropy32(uint16_t num_tile, + float cost1, + float mul32x32, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_dctin, + hls::stream& stream_entropy_32) { +#pragma HLS inline + + hls::stream stream_rectx320("rectx320"); +#pragma HLS stream variable = stream_rectx320 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx320 type = fifo + hls::stream stream_recty320("recty320"); +#pragma HLS stream variable = stream_recty320 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty320 type = fifo + hls::stream stream_rectx321("rectx321"); +#pragma HLS stream variable = stream_rectx321 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx321 type = fifo + hls::stream stream_recty321("recty321"); +#pragma HLS stream variable = stream_recty321 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty321 type = fifo + hls::stream stream_rectx322("rectx322"); +#pragma HLS stream variable = stream_rectx322 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx322 type = fifo + hls::stream stream_recty322("recty322"); +#pragma HLS stream variable = stream_recty322 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty322 type = fifo + +#ifdef FIX + hls::stream > stream_loss("loss_32"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream > stream_loss2("loss2_32"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream > stream_entropy("entropy_32"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream > stream_nzeros("nzeros_32"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#else + 
hls::stream stream_loss("loss_32"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream stream_loss2("loss2_32"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream stream_entropy("entropy_32"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream stream_nzeros("nzeros_32"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#endif + hls::stream stream_loss_sum("loss_32_sum"); + ; +#pragma HLS stream variable = stream_loss_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss_sum type = fifo + hls::stream stream_loss2_sum("loss2_32_sum"); +#pragma HLS stream variable = stream_loss2_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2_sum type = fifo + hls::stream stream_entropy_sum("entropy_32_sum"); +#pragma HLS stream variable = stream_entropy_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_sum type = fifo + hls::stream stream_nzeros_sum("nzeros_32_sum"); +#pragma HLS stream variable = stream_nzeros_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros_sum type = fifo + hls::stream stream_q("q_32"); +#pragma HLS stream variable = stream_q depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q type = fifo + hls::stream stream_mask("mask_32"); +#pragma HLS stream variable = stream_mask depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask type = fifo + + // #pragma HLS dataflow + GetQAndMask_16_32<4>(num_tile, stream_rectx, stream_recty, stream_rectx320, stream_recty320, stream_q_org, + stream_mask_org, stream_q, stream_mask); + +#ifdef FIX + ComputeEntropy1<4>(num_tile, stream_rectx320, stream_recty320, stream_rectx321, stream_recty321, stream_q, + stream_dctin, stream_loss, 
stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy3<4>(num_tile, cost1, mul32x32, stream_rectx321, stream_recty321, stream_loss, stream_loss2, + stream_entropy, stream_nzeros, stream_mask, stream_entropy_32); + +#else + + ComputeEntropy1<4>(num_tile, stream_rectx320, stream_recty320, stream_rectx321, stream_recty321, stream_q, + stream_dctin, stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy2<4>(num_tile, stream_rectx321, stream_recty321, stream_rectx322, stream_recty322, stream_loss, + stream_loss2, stream_entropy, stream_nzeros, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum); + + ComputeEntropy3<4>(num_tile, cost1, mul32x32, stream_rectx322, stream_recty322, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum, stream_mask, stream_entropy_32); +#endif +} + +void CompareEntropy(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_entropy_8, + hls::stream& stream_entropy_16, + hls::stream& stream_entropy_32, + uint8_t* strategy_ping, + uint8_t* strategy_pang, + hls::stream& stream_con, + hls::stream& stream_ok) { + bool flag = true; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + float entropy_32; + float entropy_16; + float entropy_8; + float entropy_sum[16] = {0}; + stream_con.write(1); + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS pipeline + uint8_t idx = iy * 8 + ix; + uint8_t idx_8_sum = (iy / 2) * 4 + ix / 2; + entropy_sum[idx_8_sum] += stream_entropy_8.read(); +#ifdef 
__SYNTHESIS__ + if (flag) { + strategy_ping[idx] = 0; + } else { + strategy_pang[idx] = 0; + } +#else + int idx1 = 64 * tid + idx; + if (flag) { + strategy_ping[idx1] = 0; + } else { + strategy_pang[idx1] = 0; + } +#endif + if (iy % 2 == 1 && ix % 2 == 1) { + entropy_16 = stream_entropy_16.read(); + entropy_8 = entropy_sum[(iy / 2) * 4 + ix / 2]; + if (entropy_16 < entropy_8) { + LOOP_3: + for (uint8_t y = iy - 1; y < iy + 1; y++) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + LOOP_4: + for (uint8_t x = ix - 1; x < ix + 1; x++) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + uint8_t idx = y * 8 + x; +#ifdef __SYNTHESIS__ + if (flag) { + strategy_ping[idx] = 4; + } else { + strategy_pang[idx] = 4; + } +#else + int idx1 = 64 * tid + idx; + if (flag) { + strategy_ping[idx1] = 4; + } else { + strategy_pang[idx1] = 4; + } +#endif + entropy_sum[(y / 2) * 4 + x / 2] = entropy_16; + } + } + } + if (iy % 4 == 3 && ix % 4 == 3) { + entropy_32 = stream_entropy_32.read(); + entropy_16 = 0; + LOOP_5: + for (uint8_t y = iy - 3; y < iy + 1; y += 2) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + LOOP_6: + for (uint8_t x = ix - 3; x < ix + 1; x += 2) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + uint8_t idx_16 = y * 8 + x; + entropy_16 += entropy_sum[(y / 2) * 4 + x / 2]; + } + } + if (entropy_32 < entropy_16) { + LOOP_7: + for (uint8_t y = iy - 3; y < iy + 1; y++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS unroll + LOOP_8: + for (uint8_t x = ix - 3; x < ix + 1; x++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS unroll + uint8_t idx = y * 8 + x; +#ifdef __SYNTHESIS__ + if (flag) { + strategy_ping[idx] = 5; + } else { + strategy_pang[idx] = 5; + } +#else + int idx1 = 64 * tid + idx; + if (flag) { + strategy_ping[idx1] = 5; + } else { + strategy_pang[idx1] = 5; + } +#endif + } + } + } + } + } + } + } + flag = !flag; + stream_ok.write(1); + } +} + +void Reorder(uint16_t 
num_tile, + float* ping8, + float* pang8, + float* ping16, + float* pang16, + float* ping32, + float* pang32, + uint8_t* strategy_ping, + uint8_t* strategy_pang, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_con, + hls::stream& stream_ok, + hls::stream& stream_con8, + hls::stream& stream_ok8, + hls::stream& stream_con16, + hls::stream& stream_ok16, + hls::stream& stream_con32, + hls::stream& stream_ok32, + hls::stream& stream_strategy, + hls::stream& stream_strategy1, + hls::stream& stream_select) { + bool flag = true; + uint16_t size = 4096; + uint8_t w = 64; + uint16_t total_size = 4096 * 3; + ap_uint<64> visited; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + float entropy_32; + float entropy_16; + float entropy_8; + float entropy_sum[16] = {0}; + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + stream_ok.read(); + stream_ok8.read(); + stream_ok16.read(); + stream_ok32.read(); + visited = 0; + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + uint8_t strategy; + uint8_t idx = iy * 8 + ix; + if (visited.range(idx, idx) == 0) { +#ifdef __SYNTHESIS__ + if (flag) { + strategy = strategy_ping[idx]; + } else { + strategy = strategy_pang[idx]; + } +#else + if (flag) { + strategy = strategy_ping[tid * 64 + idx]; + } else { + strategy = strategy_pang[tid * 64 + idx]; + } +#endif + stream_strategy.write(strategy); + stream_strategy1.write(strategy); + if (strategy == 4) { + LOOP_3: + for (uint8_t y = 0; y < 2; y++) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + LOOP_4: + for (uint8_t x = 0; x < 2; x++) { +#pragma HLS 
LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + uint8_t idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + } + } + LOOP_5: + for (uint8_t y = 0; y < 16; y++) { + LOOP_6: + for (uint8_t x = 0; x < 16; x++) { + LOOP_7: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS pipeline + uint16_t sy = iy * 8 + y; + uint16_t sx = ix * 8 + x; +#ifdef __SYNTHESIS__ + uint16_t idx = c * size + sy * w + sx; + if (flag) { + float tmp = ping16[idx]; + stream_select.write(tmp); + } else { + float tmp = pang16[idx]; + stream_select.write(tmp); + } +#else + int idx = total_size * tid + c * size + sy * w + sx; + if (flag) { + float tmp = ping16[idx]; + stream_select.write(tmp); + } else { + float tmp = pang16[idx]; + stream_select.write(tmp); + } +#endif + } + } + } + } else if (strategy == 5) { + LOOP_8: + for (uint8_t y = 0; y < 4; y++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS unroll + LOOP_9: + for (uint8_t x = 0; x < 4; x++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS unroll + uint8_t idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + } + } + LOOP_10: + for (uint8_t y = 0; y < 32; y++) { + LOOP_11: + for (uint8_t x = 0; x < 32; x++) { + LOOP_12: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS pipeline + uint16_t sy = iy * 8 + y; + uint16_t sx = ix * 8 + x; +#ifdef __SYNTHESIS__ + uint16_t idx = c * size + sy * w + sx; + if (flag) { + float tmp = ping32[idx]; + stream_select.write(tmp); + } else { + float tmp = pang32[idx]; + stream_select.write(tmp); + } +#else + int idx = total_size * tid + c * size + sy * w + sx; + if (flag) { + float tmp = ping32[idx]; + stream_select.write(tmp); + } else { + float tmp = pang32[idx]; + stream_select.write(tmp); + } +#endif + } + } + } + } else { + visited.range(idx, idx) = 1; + LOOP_13: + for (uint8_t y = 0; y < 8; y++) { + LOOP_14: + for (uint8_t x = 0; x < 8; x++) { + LOOP_15: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS pipeline + uint16_t sy = iy * 8 + y; + uint16_t sx = ix * 
8 + x; +#ifdef __SYNTHESIS__ + uint16_t idx = c * size + sy * w + sx; + if (flag) { + float tmp = ping8[idx]; + stream_select.write(tmp); + } else { + float tmp = pang8[idx]; + stream_select.write(tmp); + } +#else + int idx = total_size * tid + c * size + sy * w + sx; + float tmp; + if (flag) { + tmp = ping8[idx]; + stream_select.write(tmp); + } else { + tmp = pang8[idx]; + stream_select.write(tmp); + } +#endif + } + } + } + } + } + } + } + flag = !flag; + stream_con.read(); + stream_con8.read(); + stream_con16.read(); + stream_con32.read(); + } +} + +void ConsumeStrategyDCT(int xsize, + int ysize, + hls::stream& stream_strategy, + hls::stream& stream_select, + float* dctx_8x8, + float* dcty_8x8, + float* dctb_8x8, + float* dctx_16x16, + float* dcty_16x16, + float* dctb_16x16, + float* dctx_32x32, + float* dcty_32x32, + float* dctb_32x32) { + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = Div_Ceil(xsize_blocks, 8); + int count = 0; + int count_s = 0; + ap_uint<64> visited; +LOOP_0: + for (int tid = 0; tid < Div_Ceil(xsize_blocks, 8) * Div_Ceil(ysize_blocks, 8); tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + int tx1 = tid % n_enc_tiles; + int ty1 = tid / n_enc_tiles; + int by = ty1 * 8; + int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + int bx = tx1 * 8; + int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + int rect_ysize = by1 - by; + int rect_xsize = bx1 - bx; + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + visited = 0; + for (int iy = 0; iy < rect_ysize; iy++) { + for (int ix = 0; ix < rect_xsize; ix++) { + char strategy; + int idx = iy * 8 + ix; + if (visited.range(idx, idx) == 0) { + strategy = stream_strategy.read(); + if (strategy == 0) { + visited.range(idx, idx) = 1; + for (int y = 0; y < 8; y++) { + for (int x = 0; x < 8; x++) { + for (int c = 0; c < 3; c++) { + float tmp = stream_select.read(); + int idx = ((ty1 * 64 + iy * 8 + y) * xsize) + (tx1 * 64 + ix * 8 + 
x); + if (c == 0) { + dcty_8x8[idx] = tmp; + } else if (c == 1) { + dctx_8x8[idx] = tmp; + } else if (c == 2) { + dctb_8x8[idx] = tmp; + } + } + } + } + } else if (strategy == 4) { + for (int y = 0; y < 2; y++) { + for (int x = 0; x < 2; x++) { + int idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + } + } + for (int y = 0; y < 16; y++) { + for (int x = 0; x < 16; x++) { + for (int c = 0; c < 3; c++) { + float tmp = stream_select.read(); + int idx = ((ty1 * 64 + iy * 8 + y) * xsize) + (tx1 * 64 + ix * 8 + x); + if (c == 0) { + dcty_16x16[idx] = tmp; + } else if (c == 1) { + dctx_16x16[idx] = tmp; + } else if (c == 2) { + dctb_16x16[idx] = tmp; + } + } + } + } + } else if (strategy == 5) { + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + int idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + } + } + for (int y = 0; y < 32; y++) { + for (int x = 0; x < 32; x++) { + for (int c = 0; c < 3; c++) { + float tmp = stream_select.read(); + int idx = ((ty1 * 64 + iy * 8 + y) * xsize) + (tx1 * 64 + ix * 8 + x); + if (c == 0) { + dcty_32x32[idx] = tmp; + } else if (c == 1) { + dctx_32x32[idx] = tmp; + } else if (c == 2) { + dctb_32x32[idx] = tmp; + } + } + } + } + } + } + } + } + } + // std::cout << "use count_s=" << count_s << ", count=" << count << std::endl; +} + +void SetQuantField(uint16_t num_tile, + float inv_global_scale, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rqf_org, + hls::stream& stream_strategy1, + hls::stream& stream_rqf) { + ap_uint<64> visited; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + visited = 0; + float rqf_array[64]; + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma 
HLS pipeline + uint16_t index = iy * 8 + ix; + rqf_array[index] = stream_rqf_org.read(); + } + } + LOOP_3: + for (uint8_t y = 0; y < rect_ysize; ++y) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_4: + for (uint8_t x = 0; x < rect_xsize; ++x) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS pipeline + float max = -3.40282e+038; + uint8_t idx = y * 8 + x; + if (visited.range(idx, idx) == 0) { + uint8_t strategy = stream_strategy1.read(); + uint8_t b = strategy_block[strategy]; + LOOP_5: + for (uint8_t iy = 0; iy < b; iy++) { + LOOP_6: + for (uint8_t ix = 0; ix < b; ix++) { +#pragma HLS pipeline + uint16_t idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + float tmp = rqf_array[idx]; + max = fmax(tmp, max); + } + } + float tmp = max; + tmp = tmp * inv_global_scale + 0.5f; + int16_t tmp_i = (int16_t)tmp; + tmp_i = tmp_i > 256 ? 256 : tmp_i; + int16_t val = tmp_i > 1 ? tmp_i : 1; + stream_rqf.write(val); + } + } + } + } +} + +template +void DupDCT(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out0, + hls::stream& stream_recty_out0, + hls::stream& stream_rectx_out1, + hls::stream& stream_recty_out1, + hls::stream& stream_dctin, + hls::stream& stream_dctout0, + hls::stream& stream_dctout1) { + uint8_t block_n = N * N; + const uint16_t size = 64 * block_n; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out0.write(rect_ysize); + stream_rectx_out0.write(rect_xsize); + stream_recty_out1.write(rect_ysize); + stream_rectx_out1.write(rect_xsize); + LOOP_1: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 + LOOP_2: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 + LOOP_3: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS 
LOOP_TRIPCOUNT min = 3 max = 3 + LOOP_4: + for (uint16_t i = 0; i < size; i += 1) { +#pragma HLS LOOP_TRIPCOUNT min = 1024 max = 1024 +#pragma HLS pipeline II = 1 + float tmp = stream_dctin.read(); + stream_dctout0.write(tmp); + stream_dctout1.write(tmp); + } + } + } + } + } +} + +void ComputeTileACSHLS(uint16_t num_tile, + short ysize, + short xsize, + float cost1, + float butteraugli_target, + float inv_global_scale, + float mul8x8, + float mul16x16, + float mul32x32, + hls::stream& stream_rectx_acs, + hls::stream& stream_recty_acs, + hls::stream& stream_rqf_org, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_dctin8, + hls::stream& stream_dctin16, + hls::stream& stream_dctin32, + hls::stream& stream_strategy, + hls::stream& stream_select, + hls::stream& stream_rqf) { +#pragma HLS INLINE + hls::stream stream_rectx0("rectx0"); +#pragma HLS stream variable = stream_rectx0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx0 type = fifo + hls::stream stream_recty0("recty0"); +#pragma HLS stream variable = stream_recty0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty0 type = fifo + + hls::stream stream_rectx1("rectx1"); +#pragma HLS stream variable = stream_rectx1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx1 type = fifo + hls::stream stream_recty1("recty1"); +#pragma HLS stream variable = stream_recty1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty1 type = fifo + + hls::stream stream_rectx2("rectx2"); +#pragma HLS stream variable = stream_rectx2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx2 type = fifo + hls::stream stream_recty2("recty2"); +#pragma HLS stream variable = stream_recty2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty2 type = fifo + + hls::stream stream_rectx3("rectx3"); +#pragma HLS stream variable = stream_rectx3 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx3 type = fifo + hls::stream 
stream_recty3("recty3"); +#pragma HLS stream variable = stream_recty3 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty3 type = fifo + + hls::stream stream_rectx4("rectx4"); +#pragma HLS stream variable = stream_rectx4 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx4 type = fifo + hls::stream stream_recty4("recty4"); +#pragma HLS stream variable = stream_recty4 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty4 type = fifo + + hls::stream stream_rectx5("rectx5"); +#pragma HLS stream variable = stream_rectx5 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx5 type = fifo + hls::stream stream_recty5("recty5"); +#pragma HLS stream variable = stream_recty5 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty5 type = fifo + + hls::stream stream_rectx6("rectx6"); +#pragma HLS stream variable = stream_rectx6 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx6 type = fifo + hls::stream stream_recty6("recty6"); +#pragma HLS stream variable = stream_recty6 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty6 type = fifo + + hls::stream stream_rectx7("rectx7"); +#pragma HLS stream variable = stream_rectx7 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx7 type = fifo + hls::stream stream_recty7("recty7"); +#pragma HLS stream variable = stream_recty7 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty7 type = fifo + + hls::stream stream_rectx8("rectx8"); +#pragma HLS stream variable = stream_rectx8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx8 type = fifo + hls::stream stream_recty8("recty8"); +#pragma HLS stream variable = stream_recty8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty8 type = fifo + + hls::stream stream_rectx9("rectx9"); +#pragma HLS stream variable = stream_rectx9 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx9 type = fifo + hls::stream stream_recty9("recty9"); +#pragma HLS stream variable = 
stream_recty9 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty9 type = fifo + + hls::stream stream_rectx10("rectx10"); +#pragma HLS stream variable = stream_rectx10 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx10 type = fifo + hls::stream stream_recty10("recty10"); +#pragma HLS stream variable = stream_recty10 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty10 type = fifo + + hls::stream stream_rectx11("rectx11"); +#pragma HLS stream variable = stream_rectx11 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx11 type = fifo + hls::stream stream_recty11("recty11"); +#pragma HLS stream variable = stream_recty11 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty11 type = fifo + + hls::stream stream_rectx12("rectx12"); +#pragma HLS stream variable = stream_rectx12 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx12 type = fifo + hls::stream stream_recty12("recty12"); +#pragma HLS stream variable = stream_recty12 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty12 type = fifo + + hls::stream stream_dctin8_0("dctin8_0"); +#pragma HLS stream variable = stream_dctin8_0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin8_0 type = fifo + hls::stream stream_dctin16_0("dctin16_0"); +#pragma HLS stream variable = stream_dctin16_0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin16_0 type = fifo + hls::stream stream_dctin32_0("dctin32_0"); +#pragma HLS stream variable = stream_dctin32_0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin32_0 type = fifo + hls::stream stream_dctin8_1("dctin8_1"); +#pragma HLS stream variable = stream_dctin8_1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin8_1 type = fifo + hls::stream stream_dctin16_1("dctin16_1"); +#pragma HLS stream variable = stream_dctin16_1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin16_1 type = fifo + hls::stream stream_dctin32_1("dctin32_1"); +#pragma HLS stream 
variable = stream_dctin32_1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin32_1 type = fifo + + hls::stream stream_entropy_8("entropy_8"); +#pragma HLS stream variable = stream_entropy_8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_8 type = fifo + hls::stream stream_entropy_16("entropy_16"); +#pragma HLS stream variable = stream_entropy_16 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_16 type = fifo + hls::stream stream_entropy_32("entropy_32"); +#pragma HLS stream variable = stream_entropy_32 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_32 type = fifo + + hls::stream stream_con("con"); +#pragma HLS stream variable = stream_con depth = 2 +#pragma HLS BIND_STORAGE variable = stream_con type = fifo + hls::stream stream_ok("ok"); +#pragma HLS stream variable = stream_ok depth = 2 +#pragma HLS BIND_STORAGE variable = stream_ok type = fifo + hls::stream stream_con8("con8"); +#pragma HLS stream variable = stream_con8 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_con8 type = fifo + hls::stream stream_ok8("ok"); +#pragma HLS stream variable = stream_ok8 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_ok8 type = fifo + hls::stream stream_con16("con16"); +#pragma HLS stream variable = stream_con16 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_con16 type = fifo + hls::stream stream_ok16("ok16"); +#pragma HLS stream variable = stream_ok16 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_ok16 type = fifo + hls::stream stream_con32("con32"); +#pragma HLS stream variable = stream_con32 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_con32 type = fifo + hls::stream stream_ok32("ok32"); +#pragma HLS stream variable = stream_ok32 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_ok32 type = fifo + + hls::stream stream_strategy1("strategy1"); +#pragma HLS stream variable = stream_strategy1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_strategy1 type = fifo + + 
hls::stream stream_q_org_8("q_org_8"); +#pragma HLS stream variable = stream_q_org_8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q_org_8 type = fifo + hls::stream stream_mask_org_8("mask_org_8"); +#pragma HLS stream variable = stream_mask_org_8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask_org_8 type = fifo + hls::stream stream_q_org_16("q_org_8"); +#pragma HLS stream variable = stream_q_org_16 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q_org_16 type = fifo + hls::stream stream_mask_org_16("mask_org_8"); +#pragma HLS stream variable = stream_mask_org_16 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask_org_16 type = fifo + hls::stream stream_q_org_32("q_org_8"); +#pragma HLS stream variable = stream_q_org_32 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q_org_32 type = fifo + hls::stream stream_mask_org_32("mask_org_8"); +#pragma HLS stream variable = stream_mask_org_32 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask_org_32 type = fifo + +// #pragma HLS dataflow + +#ifdef __SYNTHESIS__ + uint8_t strategy_ping[64]; + uint8_t strategy_pang[64]; +#pragma HLS bind_storage variable = strategy_ping type = RAM_T2P impl = bram +#pragma HLS shared variable = strategy_ping +#pragma HLS stable variable = strategy_ping +#pragma HLS bind_storage variable = strategy_pang type = RAM_T2P impl = bram +#pragma HLS shared variable = strategy_pang +#pragma HLS stable variable = strategy_pang +#else + uint8_t* strategy_ping = (uint8_t*)malloc(sizeof(uint8_t) * 64 * 32 * 32); + uint8_t* strategy_pang = (uint8_t*)malloc(sizeof(uint8_t) * 64 * 32 * 32); +#endif + +#ifdef __SYNTHESIS__ + float ping8[3 * 64 * 64]; + float pang8[3 * 64 * 64]; + float ping16[3 * 64 * 64]; + float pang16[3 * 64 * 64]; + float ping32[3 * 64 * 64]; + float pang32[3 * 64 * 64]; +#pragma HLS bind_storage variable = ping8 type = RAM_T2P impl = uram +#pragma HLS shared variable = ping8 +#pragma HLS stable variable = ping8 +#pragma HLS 
bind_storage variable = pang8 type = RAM_T2P impl = uram +#pragma HLS shared variable = pang8 +#pragma HLS stable variable = pang8 +#pragma HLS bind_storage variable = ping16 type = RAM_T2P impl = uram +#pragma HLS shared variable = ping16 +#pragma HLS stable variable = ping16 +#pragma HLS bind_storage variable = pang16 type = RAM_T2P impl = uram +#pragma HLS shared variable = pang16 +#pragma HLS stable variable = pang16 +#pragma HLS bind_storage variable = ping32 type = RAM_T2P impl = uram +#pragma HLS shared variable = ping32 +#pragma HLS stable variable = ping32 +#pragma HLS bind_storage variable = pang32 type = RAM_T2P impl = uram +#pragma HLS shared variable = pang32 +#pragma HLS stable variable = pang32 +#else + float* ping8 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* pang8 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* ping16 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* pang16 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* ping32 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* pang32 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); +#endif + + // #pragma HLS dataflow + GetACSSize(xsize, ysize, stream_rectx_acs, stream_recty_acs, stream_rectx0, stream_recty0, stream_rectx1, + stream_recty1, stream_rectx2, stream_recty2, stream_rectx3, stream_recty3, stream_rectx10, + stream_recty10); + + DupQuantAndMask(num_tile, stream_rectx0, stream_recty0, stream_q_org, stream_mask_org, stream_q_org_8, + stream_mask_org_8, stream_q_org_16, stream_mask_org_16, stream_q_org_32, stream_mask_org_32); + + DupDCT<1>(num_tile, stream_rectx1, stream_recty1, stream_rectx4, stream_recty4, stream_rectx7, stream_recty7, + stream_dctin8, stream_dctin8_0, stream_dctin8_1); + + DupDCT<2>(num_tile, stream_rectx2, stream_recty2, stream_rectx5, stream_recty5, stream_rectx8, stream_recty8, + stream_dctin16, stream_dctin16_0, stream_dctin16_1); + + DupDCT<4>(num_tile, stream_rectx3, stream_recty3, stream_rectx6, 
stream_recty6, stream_rectx9, stream_recty9, + stream_dctin32, stream_dctin32_0, stream_dctin32_1); + + EstimateEntropy8(num_tile, cost1, mul8x8, stream_rectx4, stream_recty4, stream_q_org_8, stream_mask_org_8, + stream_dctin8_0, stream_entropy_8); + + EstimateEntropy16(num_tile, cost1, mul16x16, stream_rectx5, stream_recty5, stream_q_org_16, stream_mask_org_16, + stream_dctin16_0, stream_entropy_16); + + EstimateEntropy32(num_tile, cost1, mul32x32, stream_rectx6, stream_recty6, stream_q_org_32, stream_mask_org_32, + stream_dctin32_0, stream_entropy_32); + + CompareEntropy(num_tile, stream_rectx10, stream_recty10, stream_rectx11, stream_recty11, stream_entropy_8, + stream_entropy_16, stream_entropy_32, strategy_ping, strategy_pang, stream_con, stream_ok); + + BufferN<1>(num_tile, ping8, pang8, stream_rectx7, stream_recty7, stream_dctin8_1, stream_con8, stream_ok8); + + BufferN<2>(num_tile, ping16, pang16, stream_rectx8, stream_recty8, stream_dctin16_1, stream_con16, stream_ok16); + + BufferN<4>(num_tile, ping32, pang32, stream_rectx9, stream_recty9, stream_dctin32_1, stream_con32, stream_ok32); + + Reorder(num_tile, ping8, pang8, ping16, pang16, ping32, pang32, strategy_ping, strategy_pang, stream_rectx11, + stream_recty11, stream_rectx12, stream_recty12, stream_con, stream_ok, stream_con8, stream_ok8, + stream_con16, stream_ok16, stream_con32, stream_ok32, stream_strategy, stream_strategy1, stream_select); + + SetQuantField(num_tile, inv_global_scale, stream_rectx12, stream_recty12, stream_rqf_org, stream_strategy1, + stream_rqf); +} + +void GetSourceSize(short xsize, + short ysize, + hls::stream& stream_rectx_dct, + hls::stream& stream_recty_dct, + hls::stream& stream_rectx_acs, + hls::stream& stream_recty_acs, + hls::stream& stream_rectx_dc, + hls::stream& stream_recty_dc) { + uint16_t xsize_blocks = xsize / 8; + uint16_t ysize_blocks = ysize / 8; +LOOP_0: + for (uint16_t y = 0; y < Div_Ceil(ysize_blocks, 8); y++) { + LOOP_1: + for (uint16_t x = 0; x < 
Div_Ceil(xsize_blocks, 8); x++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint16_t by = y * 8; + uint16_t by1 = ((y + 1) * 8) < ysize_blocks ? ((y + 1) * 8) : ysize_blocks; + uint16_t bx = x * 8; + uint16_t bx1 = ((x + 1) * 8) < xsize_blocks ? ((x + 1) * 8) : xsize_blocks; + uint8_t rect_ysize = by1 - by; + uint8_t rect_xsize = bx1 - bx; + stream_rectx_dct.write(rect_xsize); + stream_recty_dct.write(rect_ysize); + stream_rectx_acs.write(rect_xsize); + stream_recty_acs.write(rect_ysize); + stream_rectx_dc.write(rect_xsize); + stream_recty_dc.write(rect_ysize); + } + } +} + +//=========================================================// +// data flow region +//=========================================================// +void hls_lossy_enc_compute_dataflow( + // config + uint32_t ysize, + uint32_t xsize, + int masking_field_stride, + int quant_field_stride, + float butteraugli_target, + float cost1, + float inv_global_scale, + float* hls_opsin_1, // mm1, input + float* hls_opsin_2, // mm2, input + float* hls_opsin_3, // mm3, input + float* quant_field_row, // mm4, input + float* masking_field_row, // mm5, input + float* aq_map_f, // mm6, input + int8_t* cmap_axi, // mm7, output + int* ac_coef_axiout, // mm8, output + // unsigned char* strategy_all, // mm9, output + uint8_t* strategy_all, // mm9, output + int* raw_quant_field_i, // mm10, output + float* hls_dc8x8, // mm11, output + float* hls_dc16x16, // mm12, output + float* hls_dc32x32, // mm13, output + int32_t num_zeros[3][320], + hls::stream, 2>& used_orders_strm) { +#pragma HLS INTERFACE mode = m_axi bundle = mm1 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_opsin_1 +#pragma HLS INTERFACE mode = m_axi bundle = mm2 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + 
hls_opsin_2 +#pragma HLS INTERFACE mode = m_axi bundle = mm3 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_opsin_3 +#pragma HLS INTERFACE mode = m_axi bundle = mm4 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = quant_field_row +#pragma HLS INTERFACE mode = m_axi bundle = mm5 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = masking_field_row +#pragma HLS INTERFACE mode = m_axi bundle = mm6 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = aq_map_f +#pragma HLS INTERFACE mode = m_axi bundle = mm7 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + TILE_W* TILE_H* 2 port = cmap_axi +#pragma HLS INTERFACE mode = m_axi bundle = mm8 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + ac_coef_axiout +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_W* BLOCK8_H port = strategy_all +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = raw_quant_field_i +// #pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave 
num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_ORDER port = \ +// hls_order +#pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_dc8x8 +#pragma HLS INTERFACE mode = m_axi bundle = mm13 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_dc16x16 +#pragma HLS INTERFACE mode = m_axi bundle = mm14 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_dc32x32 +// #pragma HLS INTERFACE mode = m_axi bundle = mm15 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_NUM_CONFIG port = \ +// config +// #pragma HLS INTERFACE mode = m_axi bundle = mm16 latency = 32 offset = slave num_write_outstanding = \ +// 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_NUM_CONFIG port = \ +// config_fl +#pragma HLS DATAFLOW + + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + int xnum_tile = (xsize + 63) / 64; + int ynum_tile = (ysize + 63) / 64; + int num_tile = xnum_tile * ynum_tile; + hls::stream stream_rectx_dct; + hls::stream stream_recty_dct; + hls::stream stream_rectx_acs; + hls::stream stream_recty_acs; + hls::stream stream_rectx_dc; + hls::stream stream_recty_dc; + GetSourceSize(xsize, ysize, stream_rectx_dct, stream_recty_dct, stream_rectx_acs, stream_recty_acs, stream_rectx_dc, + stream_recty_dc); + + // load data + hls::stream stream_q_org("q_org"); + hls::stream stream_mask_org("mask_org"); + hls::stream 
stream_rqf_org("rqf_org"); + load_rqf_mask(xsize, ysize, aq_map_f, masking_field_row, quant_field_row, quant_field_stride, stream_q_org, + stream_mask_org, stream_rqf_org); + + // load pixel + hls::stream opsin8x8_stream; + hls::stream opsin16x16_stream; + hls::stream opsin32x32_stream; + loadPixel(ysize, xsize, hls_opsin_1, hls_opsin_2, hls_opsin_3, opsin8x8_stream, opsin16x16_stream, + opsin32x32_stream); + + // 1. dct8x8, dct16x16, dct32x32 + hls::stream ac_coef8x8_stream("ac_coef8"); + hls::stream ac_coef16x16_stream("ac_coef16"); + hls::stream ac_coef32x32_stream("ac_coef32"); + + hls::stream dc_coef8x8_stream("dc_coef8"); + hls::stream dc_coef16x16_stream("dc_coef16"); + hls::stream dc_coef32x32_stream("dc_coef32"); + hls_dct_top(ysize, xsize, stream_rectx_dct, stream_recty_dct, opsin8x8_stream, opsin16x16_stream, opsin32x32_stream, + ac_coef8x8_stream, ac_coef16x16_stream, ac_coef32x32_stream, dc_coef8x8_stream, dc_coef16x16_stream, + dc_coef32x32_stream); + + // 2. ac strategy + float k8x8mul1 = -0.55; + float k8x8mul2 = 1.0735757687292623f; + float k8x8base = 1.4; + float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); + float k16X16mul1 = -0.35; + float k16X16mul2 = 0.82098067020252011; + float k16X16base = 2.0; + float entropy_mul16X16 = k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base); + float entropy_mul32X32 = 0.9188333021616017f; + hls::stream acs_stream; + hls::stream dct_select_stream; + hls::stream acs_out_stream("acs_out_stream"); + hls::stream rqf_out_stream("rqf_out_stream"); + hls::stream rqf_out_stream2("rqf_out_stream2"); + ComputeTileACSHLS((uint16_t)num_tile, (short)ysize, (short)xsize, cost1, butteraugli_target, inv_global_scale, + mul8x8, entropy_mul16X16, entropy_mul32X32, stream_rectx_acs, stream_recty_acs, stream_rqf_org, + stream_q_org, stream_mask_org, ac_coef8x8_stream, ac_coef16x16_stream, ac_coef32x32_stream, + acs_stream, dct_select_stream, rqf_out_stream); + + // 3. 
cfl heuristic + hls::stream cmapx_stream("cmapx_stream"); + hls::stream cmapb_stream("cmapb_stream"); + hls::stream cmapx_axi_stream("cmpax_axi_stream"); + hls::stream cmapb_axi_stream("cmapb_axi_stream"); + + hls::stream rqf_cfl_stream("rqf_cfl_stream"); +#pragma HLS stream variable = rqf_cfl_stream depth = 4096 * 6 +#pragma HLS BIND_STORAGE variable = rqf_cfl_stream type = fifo impl = uram + hls::stream acs_cfl_stream("acs_cfl_stream"); +#pragma HLS stream variable = acs_cfl_stream depth = 4096 * 6 +#pragma HLS BIND_STORAGE variable = acs_cfl_stream type = fifo impl = uram + hls::stream ac_coef_cfl_stream("ac_coef_cfl_stream"); +#pragma HLS stream variable = ac_coef_cfl_stream depth = 4096 * 6 +#pragma HLS BIND_STORAGE variable = ac_coef_cfl_stream type = fifo impl = uram + + hls_CFLComputeTile(xsize, ysize, dct_select_stream, rqf_out_stream, acs_stream, cmapx_stream, cmapb_stream, + cmapx_axi_stream, cmapb_axi_stream, ac_coef_cfl_stream, rqf_cfl_stream, acs_cfl_stream); + + // 4. ComputeCoefficients + hls::stream acs_coeff_stream1("acs_coeff_stream1"); + hls::stream ac_quant_coeff_stream("ac_quant_coeff_stream"); + hls::stream coeff_axi_stream("coeff_axi_stream"); + hls::stream acs_axi_stream("acs_axi_stream"); + hls::stream qf_axi_stream("qf_axi_stream"); + hls_ComputeCoefficients(xsize, ysize, acs_cfl_stream, ac_coef_cfl_stream, rqf_cfl_stream, cmapx_stream, + cmapb_stream, acs_coeff_stream1, ac_quant_coeff_stream, coeff_axi_stream, acs_axi_stream, + qf_axi_stream); + + // 5. ComputeAllCoeffOrders + count_numzeros(xsize, ysize, acs_coeff_stream1, ac_quant_coeff_stream, used_orders_strm, num_zeros); + + // 6. 
axi writeout + dc_writeout(ysize, xsize, hls_dc8x8, hls_dc16x16, hls_dc32x32, stream_rectx_dc, stream_recty_dc, dc_coef8x8_stream, + dc_coef16x16_stream, dc_coef32x32_stream); + cfl_writeout(xsize, ysize, cmapx_axi_stream, cmapb_axi_stream, cmap_axi); + ac_coeff_writeout(xsize, ysize, coeff_axi_stream, ac_coef_axiout); + acs_rqf_writeout(xsize, ysize, strategy_all, raw_quant_field_i, acs_axi_stream, qf_axi_stream); +} + +namespace xf { +namespace codec { + +// ------------------------------------------------------------ +/** + * @brief Level 2 : kernel implement for JXL lossy frame encode computing + * + * @param config the int config signal, such as image size, field stride and etc. + * @param config_fl the floating config signal, such as cost, inv_global_scale and etc. + * @param hls_opsin_1 the input RGB image data for channnel-1. + * @param hls_opsin_2 the input RGB image data for channnel-2. + * @param hls_opsin_3 the input RGB image data for channnel-3. + * @param quant_field_row the initial quant_filed data. + * @param masking_filed_row the initial masking_filed data. + * @param aq_map_f the initial adjust quant map data. + * @param cmap_axi the output of color correlation map. + * @param ac_coef_axiout the output of quanted AC coefficients. 
+ * @param strategy_all the output of strategy for each block in image + * @param raw_quant_field_i the output of computed raw_quant_field + * @param hls_order the output of orders for each block in image + * @param hls_dc8x8 the DC coefficients output for 8x8 blocks + * @param hls_dc16x16 the DC coefficients output for 16x16 blocks + * @param hls_dc32x32 the DC coefficients output for 32x32 blocks + */ +// ------------------------------------------------------------ + +extern "C" void JxlEnc_lossy_enc_compute(int config[MAX_NUM_CONFIG], // mm15, input + float config_fl[MAX_NUM_CONFIG], // mm16, input + float* hls_opsin_1, // mm1, input + float* hls_opsin_2, // mm2, input + float* hls_opsin_3, // mm3, input + float* quant_field_row, // mm4, input + float* masking_field_row, // mm5, input + float* aq_map_f, // mm6, input + int8_t* cmap_axi, // mm7, output + int* ac_coef_axiout, // mm8, output + // unsigned char* strategy_all, // mm9, output + uint8_t* strategy_all, // mm9, output + int* raw_quant_field_i, // mm10, output + uint32_t* hls_order, // mm11, output + float* hls_dc8x8, // mm12, output + float* hls_dc16x16, // mm13, output + float* hls_dc32x32 // mm14, output + ) { +#pragma HLS INTERFACE mode = m_axi bundle = mm1 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_opsin_1 +#pragma HLS INTERFACE mode = m_axi bundle = mm2 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_opsin_2 +#pragma HLS INTERFACE mode = m_axi bundle = mm3 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_opsin_3 +#pragma HLS INTERFACE mode = m_axi bundle = mm4 latency = 32 offset = slave num_write_outstanding = \ + 1 
num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = quant_field_row +#pragma HLS INTERFACE mode = m_axi bundle = mm5 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = masking_field_row +#pragma HLS INTERFACE mode = m_axi bundle = mm6 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = aq_map_f +#pragma HLS INTERFACE mode = m_axi bundle = mm7 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + TILE_W* TILE_H* 2 port = cmap_axi +#pragma HLS INTERFACE mode = m_axi bundle = mm8 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + ac_coef_axiout +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_W* BLOCK8_H port = strategy_all +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = raw_quant_field_i +#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_ORDER port = \ + hls_order +#pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL 
port = \ + hls_dc8x8 +#pragma HLS INTERFACE mode = m_axi bundle = mm13 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_dc16x16 +#pragma HLS INTERFACE mode = m_axi bundle = mm14 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_dc32x32 +#pragma HLS INTERFACE mode = m_axi bundle = mm15 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_NUM_CONFIG port = \ + config +#pragma HLS INTERFACE mode = m_axi bundle = mm16 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_NUM_CONFIG port = \ + config_fl + + // global config + uint32_t ysize = config[0]; + uint32_t xsize = config[1]; + int masking_field_stride = config[2]; + int quant_field_stride = config[3]; + float butteraugli_target = config_fl[0]; + float cost1 = config_fl[1]; + float inv_global_scale = config_fl[2]; + int32_t num_zeros[3][320]; +#pragma HLS BIND_STORAGE type = ram_2p variable = num_zeros impl = BRAM + + // Non-Dataflow region: initialization zeros + init_numzeros(num_zeros); + + // Dataflow region: enc_compute + hls::stream, 2> used_orders_strm; + hls_lossy_enc_compute_dataflow(ysize, xsize, masking_field_stride, quant_field_stride, butteraugli_target, cost1, + inv_global_scale, hls_opsin_1, hls_opsin_2, hls_opsin_3, quant_field_row, + masking_field_row, aq_map_f, cmap_axi, ac_coef_axiout, strategy_all, + raw_quant_field_i, hls_dc8x8, hls_dc16x16, hls_dc32x32, num_zeros, used_orders_strm); + + // Non-Dataflow region: compute orders + order_finalize_dataflow(used_orders_strm, num_zeros, hls_order); // 8us +} + +} // namespace codec +} // namespace xf +#endif diff 
--git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/postSysLink.tcl b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/postSysLink.tcl new file mode 100644 index 0000000000..2dc2f67034 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/postSysLink.tcl @@ -0,0 +1 @@ +set_property -dict [list CONFIG.ECC_EN {false} CONFIG.ECC_SCRUB_EN {false}] [get_bd_cells hmss_0] diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/utils.mk b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/utils.mk new file mode 100644 index 0000000000..0ee80e90da --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute/utils.mk @@ -0,0 +1,270 @@ +# +# Copyright 2019-2022 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# vitis makefile-generator v2.0.6 +# +#+------------------------------------------------------------------------------- +# The following parameters are assigned with default values. 
These parameters can +# be overridden through the make command line +#+------------------------------------------------------------------------------- + +REPORT := no +PROFILE := no +DEBUG := no + +#'estimate' for estimate report generation +#'system' for system report generation +ifneq ($(REPORT), no) +VPP_LDFLAGS += --report estimate +VPP_LDFLAGS += --report system +endif + +#Generates profile summary report +ifeq ($(PROFILE), yes) +VPP_LDFLAGS += --profile_kernel data:all:all:all +endif + +#Generates debug summary report +ifeq ($(DEBUG), yes) +VPP_LDFLAGS += --dk protocol:all:all:all +endif + +#Check environment setup +ifndef XILINX_VITIS + XILINX_VITIS = /opt/xilinx/Vitis/$(TOOL_VERSION) + export XILINX_VITIS +endif +ifndef XILINX_XRT + XILINX_XRT = /opt/xilinx/xrt + export XILINX_XRT +endif + +check_device: + @set -eu; \ + inallowlist=False; \ + inblocklist=False; \ + for dev in $(PLATFORM_ALLOWLIST); \ + do if [[ $$(echo $(PLATFORM_NAME) | grep $$dev) != "" ]]; \ + then inallowlist=True; fi; \ + done ;\ + for dev in $(PLATFORM_BLOCKLIST); \ + do if [[ $$(echo $(PLATFORM_NAME) | grep $$dev) != "" ]]; \ + then inblocklist=True; fi; \ + done ;\ + if [[ $$inallowlist == False ]]; \ + then echo "[Warning]: The device $(PLATFORM_NAME) not in allowlist."; \ + fi; \ + if [[ $$inblocklist == True ]]; \ + then echo "[ERROR]: The device $(PLATFORM_NAME) in blocklist."; exit 1;\ + fi; + +#get HOST_ARCH by PLATFORM +ifneq (,$(PLATFORM)) +HOST_ARCH_temp = $(shell platforminfo -p $(PLATFORM) | grep 'CPU Type' | sed 's/.*://' | sed '/ai_engine/d' | sed 's/^[[:space:]]*//') +ifeq ($(HOST_ARCH_temp), x86) +HOST_ARCH := x86 +else ifeq ($(HOST_ARCH_temp), cortex-a9) +HOST_ARCH := aarch32 +else ifneq (,$(findstring cortex-a, $(HOST_ARCH_temp))) +HOST_ARCH := aarch64 +endif +endif + + +#get suffix of kernel by PLATFORM +VITIS_VER = $(shell v++ --version | grep 'v++' | sed 's/^[[:space:]]*//' | sed -e 's/^[*]* v++ v//g' | cut -d " " -f1) +DEVICE_TYPE = $(shell platforminfo -p 
$(PLATFORM) | grep 'FPGA Family' | sed 's/.*://' | sed '/ai_engine/d' | sed 's/^[[:space:]]*//') +ifeq ($(DEVICE_TYPE), versal) +ifeq ($(shell expr $(VITIS_VER) \>= 2022.1), 1) +LINK_TARGET_FMT := xsa +else +LINK_TARGET_FMT := xclbin +endif +else +LINK_TARGET_FMT := xclbin +endif + +#Checks for Device Family +ifeq ($(HOST_ARCH), aarch32) + DEV_FAM = 7Series +else ifeq ($(HOST_ARCH), aarch64) + DEV_FAM = Ultrascale +endif + +#Checks for Correct architecture +ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) +$(error HOST_ARCH variable not set, please set correctly and rerun) +endif + +check_version: +ifneq (, $(shell which git)) +ifneq (,$(wildcard $(XFLIB_DIR)/.git)) + @cd $(XFLIB_DIR) && git log --graph --pretty=format:'%Cred%h%Creset -%C(yellow)%d%Creset %s %Cgreen(%cr) %C(bold blue)<%an>%Creset' --abbrev-commit -n 1 && cd - +endif +endif + +#Checks for SYSROOT +check_sysroot: +ifneq ($(HOST_ARCH), x86) +ifndef SYSROOT + $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif + +#Checks for g++ +CXX := g++ +ifeq ($(HOST_ARCH), x86) +ifeq ($(shell expr $(VITIS_VER) \>= 2022.1), 1) +CXX_VER := 8.3.0 +else +CXX_VER := 6.2.0 +endif +CXX_V := $(shell echo $(CXX_VER) | awk -F. '{print tolower($$1)}') +ifneq ($(shell expr $(shell echo "__GNUG__" | g++ -E -x c++ - | tail -1) \>= $(CXX_V)), 1) +ifndef XILINX_VIVADO +$(error [ERROR]: g++ version too old. Please use $(CXX_VER) or above) +else +CXX := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/bin/g++ +ifeq ($(LD_LIBRARY_PATH),) +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/lib64 +else +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/lib64:$(LD_LIBRARY_PATH) +endif +$(warning [WARNING]: g++ version too old. 
Using g++ provided by the tool: $(CXX)) +endif +endif +else ifeq ($(HOST_ARCH), aarch64) +CXX := $(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++ +else ifeq ($(HOST_ARCH), aarch32) +CXX := $(XILINX_VITIS)/gnu/aarch32/lin/gcc-arm-linux-gnueabi/bin/arm-linux-gnueabihf-g++ +endif + +#Check OS and setting env for xrt c++ api +OSDIST = $(shell lsb_release -i |awk -F: '{print tolower($$2)}' | tr -d ' \t' ) +OSREL = $(shell lsb_release -r |awk -F: '{print tolower($$2)}' |tr -d ' \t') + +# for centos and redhat +ifneq ($(findstring centos,$(OSDIST)),) +ifeq (7,$(shell echo $(OSREL) | awk -F. '{print tolower($$1)}' )) +ifeq ($(HOST_ARCH), x86) +XRT_CXXFLAGS += -D_GLIBCXX_USE_CXX11_ABI=0 +endif +endif +else ifneq ($(findstring redhat,$(OSDIST)),) +ifeq (7,$(shell echo $(OSREL) | awk -F. '{print tolower($$1)}' )) +ifeq ($(HOST_ARCH), x86) +XRT_CXXFLAGS += -D_GLIBCXX_USE_CXX11_ABI=0 +endif +endif +endif + +#Setting VPP +VPP := v++ + +#Cheks for aiecompiler +AIECXX := aiecompiler +AIESIMULATOR := aiesimulator +X86SIMULATOR := x86simulator + +.PHONY: check_vivado +check_vivado: +ifeq (,$(wildcard $(XILINX_VIVADO)/bin/vivado)) + @echo "Cannot locate Vivado installation. Please set XILINX_VIVADO variable." && false +endif + +.PHONY: check_vpp +check_vpp: +ifeq (,$(wildcard $(XILINX_VITIS)/bin/v++)) + @echo "Cannot locate Vitis installation. Please set XILINX_VITIS variable." && false +endif + +.PHONY: check_xrt +check_xrt: +ifeq (,$(wildcard $(XILINX_XRT)/lib/libxilinxopencl.so)) + @echo "Cannot locate XRT installation. Please set XILINX_XRT variable." && false +endif + +export PATH := $(XILINX_VITIS)/bin:$(XILINX_XRT)/bin:$(PATH) +ifeq ($(HOST_ARCH), x86) +ifeq (,$(LD_LIBRARY_PATH)) +LD_LIBRARY_PATH := $(XILINX_XRT)/lib +else +LD_LIBRARY_PATH := $(XILINX_XRT)/lib:$(LD_LIBRARY_PATH) +endif +endif + +ifneq (,$(wildcard $(PLATFORM))) +# Use PLATFORM as a file path +XPLATFORM := $(PLATFORM) +else +# Use PLATFORM as a file name pattern +# 1. 
search paths specified by variable
+ifneq (,$(PLATFORM_REPO_PATHS))
+# 1.1 as exact name
+XPLATFORM := $(strip $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/$(PLATFORM)/$(PLATFORM).xpfm)))
+# 1.2 as a pattern
+ifeq (,$(XPLATFORM))
+XPLATFORMS := $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/*/*.xpfm))
+XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/')))
+endif # 1.2
+endif # 1
+# 2. search Vitis installation
+ifeq (,$(XPLATFORM))
+# 2.1 as exact name
+XPLATFORM := $(strip $(wildcard $(XILINX_VITIS)/platforms/$(PLATFORM)/$(PLATFORM).xpfm))
+# 2.2 as a pattern
+ifeq (,$(XPLATFORM))
+XPLATFORMS := $(wildcard $(XILINX_VITIS)/platforms/*/*.xpfm)
+XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/')))
+endif # 2.2
+endif # 2
+# 3. search default locations
+ifeq (,$(XPLATFORM))
+# 3.1 as exact name
+XPLATFORM := $(strip $(wildcard /opt/xilinx/platforms/$(PLATFORM)/$(PLATFORM).xpfm))
+# 3.2 as a pattern
+ifeq (,$(XPLATFORM))
+XPLATFORMS := $(wildcard /opt/xilinx/platforms/*/*.xpfm)
+XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/')))
+endif # 3.2
+endif # 3
+endif
+
+define MSG_PLATFORM
+No platform matched pattern '$(PLATFORM)'.
+Available platforms are: $(XPLATFORMS)
+To add more platform directories, set the PLATFORM_REPO_PATHS variable or point PLATFORM variable to the full path of platform .xpfm file.
+endef
+export MSG_PLATFORM
+
+
+.PHONY: check_platform
+check_platform:
+ifeq (,$(XPLATFORM))
+	@echo "$${MSG_PLATFORM}" && false
+endif
+#Check ends
+
+# PLATFORM_NAME - file name of PLATFORM with the directory part and the .xpfm suffix removed
+# (plain variable, takes no arguments; notdir avoids forking a shell on every expansion)
+PLATFORM_NAME = $(strip $(patsubst %.xpfm, % , $(notdir $(PLATFORM))))
+
+
+# Cleaning stuff
+RM = rm -f
+RMDIR = rm -rf
+
+MV = mv -f
+CP = cp -rf
+ECHO:= @echo
diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/Makefile b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/Makefile
new file mode 100644
index 0000000000..1788ef2638
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/Makefile
@@ -0,0 +1,281 @@
+# Copyright 2019-2021 Xilinx, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# sc makefile-generator v1.0.0
+
+############################## Help Section ##############################
+.PHONY: help
+
+help::
+	$(ECHO) "Makefile Usage:"
+	$(ECHO) " make all TARGET= PLATFORM= HOST_ARCH="
+	$(ECHO) " Command to generate the design for specified Target and Shell."
+	$(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells"
+	$(ECHO) ""
+	$(ECHO) " make run TARGET= PLATFORM= HOST_ARCH="
+	$(ECHO) " Command to run application in emulation."
+	$(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH required for SoC shells"
+	$(ECHO) ""
+	$(ECHO) " make host HOST_ARCH="
+	$(ECHO) " Command to build host application."
+	$(ECHO) " By default, HOST_ARCH=x86. 
HOST_ARCH is required for SoC shells"
+	$(ECHO) ""
+	$(ECHO) " make clean "
+	$(ECHO) " Command to remove the generated non-hardware files."
+	$(ECHO) ""
+	$(ECHO) " make cleanall"
+	$(ECHO) " Command to remove all the generated files."
+	$(ECHO) ""
+
+############################## Setting up Project Variables ##############################
+
+MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+XF_PROJ_ROOT ?= $(shell bash -c 'export MK_PATH=$(MK_PATH); echo $${MK_PATH%/L2/*}')
+CUR_DIR := $(patsubst %/,%,$(dir $(MK_PATH)))
+XFLIB_DIR := $(XF_PROJ_ROOT)
+
+# setting default values
+TARGET ?= sw_emu
+HOST_ARCH ?= x86
+
+#setting PLATFORM
+ifeq ($(PLATFORM),)
+PLATFORM := $(DEVICE)
+endif
+ifeq ($(PLATFORM),)
+PLATFORM := xilinx_u50_gen3x16_xdma_5_202210_1
+endif
+
+# #################### Checking if PLATFORM in allowlist ############################
+PLATFORM_ALLOWLIST += u50
+PLATFORM_BLOCKLIST += zc
+
+GCC_INTOOL := 8.3.0
+BINUTILS_INTOOL := 2.37
+include ./utils.mk
+TEMP_DIR := _x_temp.$(TARGET).$(PLATFORM_NAME)
+TEMP_REPORT_DIR := $(CUR_DIR)/reports/_x.$(TARGET).$(PLATFORM_NAME)
+BUILD_DIR := build_dir.$(TARGET).$(PLATFORM_NAME)
+BUILD_REPORT_DIR := $(CUR_DIR)/reports/_build.$(TARGET).$(PLATFORM_NAME)
+EMCONFIG := $(BUILD_DIR)/emconfig.json
+XCLBIN_DIR := $(CUR_DIR)/$(BUILD_DIR)
+export XCL_BINDIR = $(XCLBIN_DIR)
+
+EXE_FILE_DEPS :=
+BINARY_CONTAINERS_DEPS :=
+RUN_DEPS :=
+
+# set debug switch
+ifneq ($(debug),yes)
+CXXFLAGS += -O3
+endif
+
+# get global setting
+ifdef XILINX_SC_PFM_CONFIG
+CXXFLAGS += -DXILINX_SC_PFM_CONFIG=$(XILINX_SC_PFM_CONFIG)
+endif
+ifdef XILINX_SC_PFM_EXT
+CXXFLAGS += -DXILINX_SC_PFM_EXT=$(XILINX_SC_PFM_EXT)
+endif
+ifeq ($(HOST_ARCH), x86)
+CXXFLAGS += -I $(XILINX_VITIS)/system_compiler/include -I $(XILINX_HLS)/include
+LDFLAGS += -L$(XILINX_XRT)/lib -L$(XILINX_VITIS)/system_compiler/lib/x86 -lvpp_acc -l$(LIB_XRT) -lxrt_coreutil -Wl,-rpath=$(XILINX_VITIS)/system_compiler/lib/x86:$(XILINX_XRT)/lib:$(GCC_HOME)/lib64 -Wl,--enable-new-dtags -lpthread
+VPP_FLAGS += -t $(TARGET) --platform $(XPLATFORM) --temp_dir $(TEMP_DIR) --save-temps -g -I $(XILINX_VITIS)/system_compiler/include
+VPP_LDFLAGS +=
+else ifeq ($(HOST_ARCH), aarch64)
+CXXFLAGS +=
+LDFLAGS +=
+VPP_FLAGS +=
+VPP_LDFLAGS +=
+endif
+CXXFLAGS += $(EXTRA_CXXFLAGS)
+VPP_FLAGS += $(EXTRA_VPP_FLAGS)
+
+ifeq ($(TARGET),sw)
+  $(error Error: The sw target is not supported anymore. Please use sw_emu instead)
+else ifeq ($(TARGET),sw_emu)
+  LIB_XRT := xrt_swemu
+  HOST_PREAMBLE := XCL_EMULATION_MODE=sw_emu
+else ifeq ($(TARGET),hw_emu)
+  LIB_XRT := xrt_hwemu
+  HOST_PREAMBLE := XCL_EMULATION_MODE=hw_emu
+  ifneq (,$(findstring -g,$(EXTRA_VPPFLAGS) $(EXTRA_VPP_FLAGS) $(CXXFLAGS)))
+    # for sourcing pre/post xsim scripts (EXTRA_VPP_FLAGS is the name appended to VPP_FLAGS above; EXTRA_VPPFLAGS kept for compatibility)
+    ifneq ($(XILINX_SC_HW_EMU),0)
+      HOST_PREAMBLE += XILINX_SC_HW_EMU=1 XILINX_SC_BUILD_DIR=$(PWD)/$(BUILD_DIR)
+    endif
+  endif
+else ifeq ($(TARGET),hw)
+  LIB_XRT := xrt_core
+endif
+
+########################## Setting up Host Variables ##########################
+
+#Include Required Host Source Files
+HOST_SRCS += $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cjxl.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cjxl_main.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cmdline.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/codec_config.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/speed_stats.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cpu/cpu.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cpu/os_specific.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/box/box.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/time.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/memory.c 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references_hq.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/brotli_bit_stream.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/block_splitter.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/metablock.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment_two_pass.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encoder_dict.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/utf8_util.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/decode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/static_dict.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/literal_cost.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/entropy_encode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/bit_cost.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/cluster.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/dictionary_hash.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/histogram.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/bit_reader.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/huffman.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/state.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/transform.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmslut.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsnamed.cpp 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspack.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscnvrt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio1.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgmt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsopt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsalpha.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmstypes.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsintrp.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgamma.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscam02.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscgats.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmshalf.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsmtrx.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsps2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssamp.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssm.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsxform.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio0.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsplugin.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmserr.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspcs.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmswtpnt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsvirt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lodepng/lodepng.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/aligned_allocator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/targets.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_host.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase1.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase3.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host/host_lossy_enc_compute.cpp
+CXXFLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/ -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/build/lib/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lodepng -I $(XFLIB_DIR)/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute -I $(XFLIB_DIR)/L2/demos/jxlEnc/others/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host
+CXXFLAGS += $(if $(filter yes,$(debug)),,-O3)
+
+EXE_NAME := host.exe
+EXE_OBJS := $(addprefix $(TEMP_DIR)/, $(addsuffix .o,$(basename $(HOST_SRCS))))
+EXE_FILE := $(BUILD_DIR)/$(EXE_NAME)
+EXE_FILE_DEPS := $(EXE_OBJS)
+MAKEDEPEND = $(CXX) $< -MM -MP -MF $(basename $@).d -MT $@ $(CXXFLAGS)
+
+HOST_ARGS := --xclbin $(BUILD_DIR)/jxlEnc.xclbin $(XFLIB_DIR)/L2/demos/jxlEnc/images/small32x32.png small32x32.jxl
+ifneq ($(HOST_ARCH), x86)
+PKG_HOST_ARGS = $(foreach args,$(HOST_ARGS),$(subst $(dir $(patsubst %/,%,$(args))),,$(args)))
+endif
+
+########################## Kernel compiler global settings (include paths added to every v++ compile) ##########################
+VPP_FLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel
+
+######################### binary container global settings ##########################
+VPP_FLAGS_hls_lossy_enc_compute += -D KERNEL_NAME=hls_lossy_enc_compute
+VPP_FLAGS_hls_lossy_enc_compute += --hls.clock 300000000:hls_lossy_enc_compute
+ifneq ($(HOST_ARCH), x86)
+VPP_LDFLAGS_hls_lossy_enc_compute += --clock.defaultFreqHz 300000000
+else
+VPP_LDFLAGS_hls_lossy_enc_compute += --kernel_frequency 300
+endif
+VPP_LDFLAGS_hls_lossy_enc_compute_temp := --advanced.param compiler.userPostSysLinkOverlayTcl=postSysLink.tcl
+VPP_LDFLAGS_hls_lossy_enc_compute += $(VPP_LDFLAGS_hls_lossy_enc_compute_temp)
+
+ifeq ($(HOST_ARCH), x86)
+BINARY_CONTAINERS_TMP := $(BUILD_DIR)/$(TARGET).o
+BINARY_CONTAINERS := $(BUILD_DIR)/$(TARGET).xclbin
+ifeq ($(TARGET),sw_emu)
+  BINARY_CONTAINERS_TMP :=
+endif
+else
+# placeholder for non_x86
+endif
+
+.SECONDEXPANSION:
+# ################ Setting Rules for Binary Containers (Building Kernels) ################
+ACC_SRCS_hls_lossy_enc_compute += $(XFLIB_DIR)/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel/hls_lossy_enc_compute.cpp
+ACC_OBJS_hls_lossy_enc_compute := $(addprefix $(TEMP_DIR)/, $(addsuffix .o,$(basename $(ACC_SRCS_hls_lossy_enc_compute))))
+$(ACC_OBJS_hls_lossy_enc_compute): $(TEMP_DIR)/%.o : %.cpp $$(@D)/.f
+	@echo "--> Making $@ from: $?"
+	$(MAKEDEPEND)
+	$(VPP) $(VPP_FLAGS) $(VPP_FLAGS_hls_lossy_enc_compute) -o $@ -c $<
+BINARY_CONTAINERS_DEPS += $(ACC_OBJS_hls_lossy_enc_compute)
+$(BINARY_CONTAINERS_TMP) : $(BINARY_CONTAINERS_DEPS)
+	@echo "--> Making $@ from: $?"
+	$(VPP) $(VPP_FLAGS) $(VPP_LDFLAGS) $(VPP_LDFLAGS_hls_lossy_enc_compute) -o $(BINARY_CONTAINERS) -l $^
+EXE_FILE_DEPS += $(BINARY_CONTAINERS_TMP)
+EXE_FILE_DEPS += $(BINARY_CONTAINERS_DEPS)
+
+############################## Setting Rules for Host (Building Host Executable) ##############################
+ifeq ($(HOST_ARCH), x86)
+$(TEMP_DIR)/%.o : %.cpp $$(@D)/.f
+	@echo "--> Making $@ from: $?"
+	mkdir -p $(BUILD_DIR)
+	$(MAKEDEPEND)
+	$(CXX) -o $@ $(CXXFLAGS) -I . -c $<
+$(TEMP_DIR)/%.o : %.cc $$(@D)/.f
+	@echo "--> Making $@ from: $?"
+	mkdir -p $(BUILD_DIR)
+	$(MAKEDEPEND)
+	$(CXX) -o $@ $(CXXFLAGS) -I . -c $<
+$(TEMP_DIR)/%.o : %.c $$(@D)/.f
+	@echo "--> Making $@ from: $?"
+	mkdir -p $(BUILD_DIR)
+	$(MAKEDEPEND)
+	$(CXX) -o $@ $(CXXFLAGS) -I . -c $<
+$(EXE_FILE): $(EXE_FILE_DEPS)
+	mkdir -p $(BUILD_DIR)
+	$(CXX) -o $@ $^ $(CXXFLAGS) $(LDFLAGS)
+else
+# placeholder for non-x86 hosts
+endif
+
+$(EMCONFIG):
+	emconfigutil --platform $(XPLATFORM) --od $(BUILD_DIR)
+
+%/.f:
+	mkdir -p $(dir $@)
+	touch $@
+
+.PRECIOUS: %/.f
+
+RUN_DEPS += $(EXE_FILE) $(EMCONFIG)
+
+run: check_device $(RUN_DEPS)
+#sw_emu
+ifneq (,$(filter sw_emu, $(TARGET)))
+ifeq ($(HOST_ARCH), x86)
+	LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \
+	$(HOST_PREAMBLE) $(EXE_FILE) $(HOST_ARGS)
+	./check.sh
+else
+# placeholder for non-x86 hosts
+endif
+endif
+
+#hw_emu
+ifneq (,$(filter hw_emu, $(TARGET)))
+ifeq ($(HOST_ARCH), x86)
+	LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \
+	$(HOST_PREAMBLE) $(EXE_FILE) $(HOST_ARGS)
+	./check.sh
+else
+# placeholder for non-x86 hosts
+endif
+endif
+
+#hw
+ifeq ($(TARGET), hw)
+ifeq ($(HOST_ARCH), x86)
+	LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \
+	$(HOST_PREAMBLE) $(EXE_FILE) $(HOST_ARGS)
+	./check.sh
+else
+# placeholder for non-x86 hosts
+endif
+endif
+
+############################## Setting Targets ##############################
+
+.PHONY: all clean cleanall cleanh cleank emconfig run
+emconfig: $(EMCONFIG)
+ifeq ($(HOST_ARCH), x86)
+all: check_vpp check_platform check_xrt $(EXE_FILE) $(BINARY_CONTAINERS) emconfig
+else
+all: check_vpp check_platform check_sysroot $(EXE_FILE) $(BINARY_CONTAINERS) emconfig sd_card
+endif
+
+.PHONY: host xclbin
+ifeq ($(HOST_ARCH), x86)
+host: check_xrt $(EXE_FILE)
+else
+host: check_sysroot $(EXE_FILE)
+endif
+xclbin: $(BINARY_CONTAINERS_TMP)
+
+############################## Cleaning Rules (cleanh: host outputs, cleank: kernel outputs) ##############################
+cleanh:
+	-$(RMDIR) $(EXE_FILE) vitis_* TempConfig system_estimate.xtxt *.rpt .run/
+	-$(RMDIR) src/*.ll _xocc_* .Xil dltmp* xmltmp* *.log *.jou *.wcfg *.wdb sample_link.ini sample_compile.ini obj* bin* *.csv *.jpg *.jpeg *.png
+
+cleank:
+	-$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl*start_simulation.sh *.xclbin
+	-$(RMDIR) _x_temp.*/_x.* _x_temp.*/.Xil _x_temp.*/profile_summary.* xo_* _x*
+	-$(RMDIR) _x_temp.*/dltmp* _x_temp.*/kernel_info.dat _x_temp.*/*.log
+	-$(RMDIR) _x_temp.*
+
+cleanall: cleanh cleank
+	-$(RMDIR) $(BUILD_DIR) build_dir.* emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str
+	-$(RMDIR) $(XFLIB_DIR)/common/data/*.xe2xd* $(XFLIB_DIR)/common/data/*.orig*
+
+clean: cleanh
diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/check.sh b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/check.sh
new file mode 100755
index 0000000000..d9450ab8d2
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/check.sh
@@ -0,0 +1 @@
+echo "bcf0915760ea2ffbfd33a1bb2abe028a  small32x32.jxl" | md5sum -c -
diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/conn_u50.cfg b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/conn_u50.cfg
new file mode 100644
index 0000000000..9324a2c545
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/conn_u50.cfg
@@ -0,0 +1,21 @@
+[hls]
+#pre_tcl=hls_pre.tcl
+
+[connectivity]
+sp=hls_lossy_enc_compute_1.config:HBM[14]
+sp=hls_lossy_enc_compute_1.config_fl:HBM[15]
+sp=hls_lossy_enc_compute_1.hls_opsin_1:HBM[0]
+sp=hls_lossy_enc_compute_1.hls_opsin_2:HBM[1]
+sp=hls_lossy_enc_compute_1.hls_opsin_3:HBM[2]
+sp=hls_lossy_enc_compute_1.quant_field_row:HBM[3]
+sp=hls_lossy_enc_compute_1.masking_field_row:HBM[4]
+sp=hls_lossy_enc_compute_1.aq_map_f:HBM[5]
+sp=hls_lossy_enc_compute_1.cmap_axi:HBM[6]
+sp=hls_lossy_enc_compute_1.ac_coef_axiout:HBM[7]
+sp=hls_lossy_enc_compute_1.strategy_all:HBM[8]
+sp=hls_lossy_enc_compute_1.raw_quant_field_i:HBM[9]
+sp=hls_lossy_enc_compute_1.hls_order:HBM[10] +sp=hls_lossy_enc_compute_1.hls_dc8x8:HBM[11] +sp=hls_lossy_enc_compute_1.hls_dc16x16:HBM[12] +sp=hls_lossy_enc_compute_1.hls_dc32x32:HBM[13] + diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/description.json b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/description.json new file mode 100644 index 0000000000..7ca4133772 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/description.json @@ -0,0 +1,285 @@ +{ + "gui": false, + "name": "Xilinx JXL ACC LOSSY ENC Demo (SC)", + "description": "", + "flow": "vitis", + "platform_allowlist": [ + "u50" + ], + "platform_blocklist": [ + "zc" + ], + "launch": [ + { + "cmd_args": " --xclbin BUILD/jxlEnc.xclbin LIB_DIR/L2/demos/jxlEnc/images/small32x32.png small32x32.jxl", + "name": "generic launch for all flows" + } + ], + "post_launch": [ + { + "launch_cmd": [ + "./check.sh" + ] + } + ], + "host": { + "host_exe": "host.exe", + "compiler": { + "sources": [ + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cjxl.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cjxl_main.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cmdline.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/codec_config.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/speed_stats.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cpu/cpu.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cpu/os_specific.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/box/box.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/time.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/memory.c", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references_hq.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/brotli_bit_stream.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/block_splitter.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/metablock.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment_two_pass.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encoder_dict.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/utf8_util.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/decode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/static_dict.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/literal_cost.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/entropy_encode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/bit_cost.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/cluster.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/dictionary_hash.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/histogram.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/bit_reader.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/huffman.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/state.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/transform.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmslut.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsnamed.cpp", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspack.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscnvrt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio1.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgmt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsopt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsalpha.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmstypes.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsintrp.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgamma.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscam02.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscgats.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmshalf.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsmtrx.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsps2.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssamp.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssm.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsxform.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio0.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsplugin.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmserr.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspcs.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmswtpnt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsvirt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lodepng/lodepng.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/aligned_allocator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/targets.cc", + 
"LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_host.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase1.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase2.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase3.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp", + "LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host/host_lossy_enc_compute.cpp" + ], + "includepaths": [ + "LIB_DIR/../utils/L1/include/", + "LIB_DIR/L2/demos/jxlEnc/third_partys/", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/build/lib/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lodepng", + "LIB_DIR/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute", + "LIB_DIR/L2/demos/jxlEnc/others/include", + "LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel", + "LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host" + ], + "options": "-O3 " + } + }, + "v++": { + "compiler": { + "includepaths": [ + "LIB_DIR/../utils/L1/include/", + "LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel" + ] + } + }, + "containers": [ + { + "accelerators": [ + { + "location": "LIB_DIR/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel/hls_lossy_enc_compute.cpp", + "frequency": 300.0, + "clflags": " -D KERNEL_NAME=hls_lossy_enc_compute", + "name": "hls_lossy_enc_compute" + } + ], + "frequency": 300.0, + "name": "hls_lossy_enc_compute", + "ldclflags": "--advanced.param compiler.userPostSysLinkOverlayTcl=postSysLink.tcl" + } + ], + "testinfo": { + "disable": false, + "jobs": [ + { + "index": 0, + "dependency": [], + "env": "", + "cmd": "", + "max_memory_MB": { + "vitis_hw_build": 81920, 
+ "vitis_hw_emu": 40960, + "vitis_sw_emu": 10240, + "vitis_hw_run": 10240 + }, + "max_time_min": { + "vitis_hw_build": 3200, + "vitis_hw_emu": 1600, + "vitis_sw_emu": 120, + "vitis_hw_run": 10 + } + } + ], + "targets": [ + "vitis_sw_emu", + "vitis_hw_emu", + "vitis_hw" + ], + "category": "canary" + } +} \ No newline at end of file diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host/host_lossy_enc_compute.cpp b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host/host_lossy_enc_compute.cpp new file mode 100644 index 0000000000..fa27f3e2f8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host/host_lossy_enc_compute.cpp @@ -0,0 +1,483 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HOST_LOSSY_ENC_COMPUTE_SC_CPP +#define HOST_LOSSY_ENC_COMPUTE_SC_CPP + +#include +#include +#include "ap_int.h" + +#include "host_lossy_enc_compute.hpp" + +#ifndef HLS_TEST +#include "xf_utils_sw/logger.hpp" +#endif + +unsigned long diff(const struct timeval* newTime, const struct timeval* oldTime) { + return (newTime->tv_sec - oldTime->tv_sec) * 1000000 + (newTime->tv_usec - oldTime->tv_usec); +} + +template +T* aligned_alloc(std::size_t num) { + void* ptr = NULL; + if (posix_memalign(&ptr, 4096, num * sizeof(T))) throw std::bad_alloc(); + return reinterpret_cast(ptr); +} + +void hls_lossy_enc_compute_wrapper(std::string xclbinPath, // xclbin + int config[32], // mm15, input + float config_fl[32], // mm16, input + float* hls_opsin_1, // mm1, input + float* hls_opsin_2, // mm2, input + float* hls_opsin_3, // mm3, input + float* hls_quant_field, // mm4, input + float* hls_masking_field, // mm5, input + float* aq_map_f, // mm6, input + int8_t* cmap_axi, // mm7, output + int* ac_coef_axiout, // mm8, output + unsigned char* strategy_all, // mm9, output + int* raw_quant_field_i, // mm10, output + uint32_t* hls_order, // mm11, output + float* hls_dc8x8, // mm12, output + float* hls_dc16x16, // mm13, output + float* hls_dc32x32 // mm14, output + ) { +#ifndef HLS_TEST + + auto config_pool = lossy_acc::create_bufpool(vpp::input); + auto config_fl_pool = lossy_acc::create_bufpool(vpp::input); + auto hls_opsin_1_pool = lossy_acc::create_bufpool(vpp::input); + auto hls_opsin_2_pool = lossy_acc::create_bufpool(vpp::input); + auto hls_opsin_3_pool = lossy_acc::create_bufpool(vpp::input); + auto quant_field_row_pool = lossy_acc::create_bufpool(vpp::input); + auto masking_field_row_pool = lossy_acc::create_bufpool(vpp::input); + auto aq_map_f_pool = lossy_acc::create_bufpool(vpp::input); + auto cmap_axi_pool = lossy_acc::create_bufpool(vpp::output); + auto ac_coef_axiout_pool = lossy_acc::create_bufpool(vpp::output); + auto strategy_all_pool = 
lossy_acc::create_bufpool(vpp::output); + auto raw_quant_field_i_pool = lossy_acc::create_bufpool(vpp::output); + auto hls_order_pool = lossy_acc::create_bufpool(vpp::output); + auto hls_dc8x8_pool = lossy_acc::create_bufpool(vpp::output); + auto hls_dc16x16_pool = lossy_acc::create_bufpool(vpp::output); + auto hls_dc32x32_pool = lossy_acc::create_bufpool(vpp::output); + + lossy_acc::send_while([&]() -> bool { + int* acc_config = (int*)lossy_acc::alloc_buf(config_pool, sizeof(int) * MAX_NUM_CONFIG); + float* acc_config_fl = (float*)lossy_acc::alloc_buf(config_fl_pool, sizeof(float) * MAX_NUM_CONFIG); + float* acc_hls_opsin_1 = (float*)lossy_acc::alloc_buf(hls_opsin_1_pool, sizeof(float) * ALL_PIXEL); + float* acc_hls_opsin_2 = (float*)lossy_acc::alloc_buf(hls_opsin_2_pool, sizeof(float) * ALL_PIXEL); + float* acc_hls_opsin_3 = (float*)lossy_acc::alloc_buf(hls_opsin_3_pool, sizeof(float) * ALL_PIXEL); + float* acc_quant_field_row = + (float*)lossy_acc::alloc_buf(quant_field_row_pool, sizeof(float) * BLOCK8_H * BLOCK8_W); + float* acc_masking_field_row = + (float*)lossy_acc::alloc_buf(masking_field_row_pool, sizeof(float) * BLOCK8_H * BLOCK8_W); + float* acc_aq_map_f = (float*)lossy_acc::alloc_buf(aq_map_f_pool, sizeof(float) * BLOCK8_H * BLOCK8_W); + int8_t* acc_cmap_axi = (int8_t*)lossy_acc::alloc_buf(cmap_axi_pool, sizeof(int8_t) * TILE_W * TILE_H * 2); + int* acc_ac_coef_axiout = (int*)lossy_acc::alloc_buf(ac_coef_axiout_pool, sizeof(int) * ALL_PIXEL); + unsigned char* acc_strategy_all = + (unsigned char*)lossy_acc::alloc_buf(strategy_all_pool, sizeof(float) * BLOCK8_H * BLOCK8_W); + int* acc_raw_quant_field_i = + (int*)lossy_acc::alloc_buf(raw_quant_field_i_pool, sizeof(int) * BLOCK8_H * BLOCK8_W); + uint32_t* acc_hls_order = (uint32_t*)lossy_acc::alloc_buf(hls_order_pool, sizeof(uint32_t) * MAX_ORDER); + float* acc_hls_dc8x8 = (float*)lossy_acc::alloc_buf(hls_dc8x8_pool, sizeof(float) * ALL_PIXEL); + float* acc_hls_dc16x16 = 
(float*)lossy_acc::alloc_buf(hls_dc16x16_pool, sizeof(float) * ALL_PIXEL); + float* acc_hls_dc32x32 = (float*)lossy_acc::alloc_buf(hls_dc32x32_pool, sizeof(float) * ALL_PIXEL); + + memcpy(acc_config, config, sizeof(int) * MAX_NUM_CONFIG); + memcpy(acc_config_fl, config_fl, sizeof(float) * MAX_NUM_CONFIG); + memcpy(acc_hls_opsin_1, hls_opsin_1, sizeof(float) * ALL_PIXEL); + memcpy(acc_hls_opsin_2, hls_opsin_2, sizeof(float) * ALL_PIXEL); + memcpy(acc_hls_opsin_3, hls_opsin_3, sizeof(float) * ALL_PIXEL); + memcpy(acc_quant_field_row, hls_quant_field, sizeof(float) * BLOCK8_H * BLOCK8_W); + memcpy(acc_masking_field_row, hls_masking_field, sizeof(float) * BLOCK8_H * BLOCK8_W); + memcpy(acc_aq_map_f, aq_map_f, sizeof(float) * BLOCK8_H * BLOCK8_W); + // memcpy(acc_cmap_axi, cmap_axi, sizeof(int8_t) * TILE_W * TILE_H * 2); + // memcpy(acc_ac_coef_axiout, ac_coef_axiout, sizeof(int) * ALL_PIXEL); + // memcpy(acc_strategy_all, strategy_all, sizeof(unsigned char) * BLOCK8_H * BLOCK8_W); + // memcpy(acc_raw_quant_field_i, raw_quant_field_i, sizeof(int) * BLOCK8_H * BLOCK8_W); + // memcpy(acc_hls_order, hls_order, sizeof(uint32_t) * MAX_ORDER); + // memcpy(acc_hls_dc8x8, hls_dc8x8, sizeof(float) * ALL_PIXEL); + // memcpy(acc_hls_dc16x16, hls_dc16x16, sizeof(float) * ALL_PIXEL); + // memcpy(acc_hls_dc32x32, hls_dc32x32, sizeof(float) * ALL_PIXEL); + + lossy_acc::compute(acc_config, acc_config_fl, acc_hls_opsin_1, acc_hls_opsin_2, acc_hls_opsin_3, + acc_quant_field_row, acc_masking_field_row, acc_aq_map_f, acc_cmap_axi, acc_ac_coef_axiout, + acc_strategy_all, acc_raw_quant_field_i, acc_hls_order, acc_hls_dc8x8, acc_hls_dc16x16, + acc_hls_dc32x32); + return 0; + }); + + lossy_acc::receive_all_in_order([&]() { + int* acc_config = (int*)lossy_acc::get_buf(config_pool); + float* acc_config_fl = (float*)lossy_acc::get_buf(config_fl_pool); + float* acc_hls_opsin_1 = (float*)lossy_acc::get_buf(hls_opsin_1_pool); + float* acc_hls_opsin_2 = (float*)lossy_acc::get_buf(hls_opsin_2_pool); + 
float* acc_hls_opsin_3 = (float*)lossy_acc::get_buf(hls_opsin_3_pool); + float* acc_quant_field_row = (float*)lossy_acc::get_buf(quant_field_row_pool); + float* acc_masking_field_row = (float*)lossy_acc::get_buf(masking_field_row_pool); + float* acc_aq_map_f = (float*)lossy_acc::get_buf(aq_map_f_pool); + int8_t* acc_cmap_axi = (int8_t*)lossy_acc::get_buf(cmap_axi_pool); + int* acc_ac_coef_axiout = (int*)lossy_acc::get_buf(ac_coef_axiout_pool); + unsigned char* acc_strategy_all = (unsigned char*)lossy_acc::get_buf(strategy_all_pool); + int* acc_raw_quant_field_i = (int*)lossy_acc::get_buf(raw_quant_field_i_pool); + uint32_t* acc_hls_order = (uint32_t*)lossy_acc::get_buf(hls_order_pool); + float* acc_hls_dc8x8 = (float*)lossy_acc::get_buf(hls_dc8x8_pool); + float* acc_hls_dc16x16 = (float*)lossy_acc::get_buf(hls_dc16x16_pool); + float* acc_hls_dc32x32 = (float*)lossy_acc::get_buf(hls_dc32x32_pool); + + // memcpy(config, acc_config, sizeof(int) * MAX_NUM_CONFIG); + // memcpy(config_fl, acc_config_fl, sizeof(float) * MAX_NUM_CONFIG); + // memcpy(hls_opsin_1, acc_hls_opsin_1, sizeof(float) * ALL_PIXEL); + // memcpy(hls_opsin_2, acc_hls_opsin_2, sizeof(float) * ALL_PIXEL); + // memcpy(hls_opsin_3, acc_hls_opsin_3, sizeof(float) * ALL_PIXEL); + // memcpy(hls_quant_field, acc_quant_field_row, sizeof(float) * BLOCK8_H * BLOCK8_W); + // memcpy(hls_masking_field, acc_masking_field_row, sizeof(float) * BLOCK8_H * BLOCK8_W); + // memcpy(aq_map_f, acc_aq_map_f, sizeof(float) * BLOCK8_H * BLOCK8_W); + memcpy(cmap_axi, acc_cmap_axi, sizeof(int8_t) * TILE_W * TILE_H * 2); + memcpy(ac_coef_axiout, acc_ac_coef_axiout, sizeof(int) * ALL_PIXEL); + memcpy(strategy_all, acc_strategy_all, sizeof(unsigned char) * BLOCK8_H * BLOCK8_W); + memcpy(raw_quant_field_i, acc_raw_quant_field_i, sizeof(int) * BLOCK8_H * BLOCK8_W); + memcpy(hls_order, acc_hls_order, sizeof(uint32_t) * MAX_ORDER); + memcpy(hls_dc8x8, acc_hls_dc8x8, sizeof(float) * ALL_PIXEL); + memcpy(hls_dc16x16, acc_hls_dc16x16, 
sizeof(float) * ALL_PIXEL); + memcpy(hls_dc32x32, acc_hls_dc32x32, sizeof(float) * ALL_PIXEL); + }); + + lossy_acc::join(); + +// xf::common::utils_sw::Logger logger(std::cout, std::cerr); +// cl_int fail; + +// struct timeval start_time; // End to end time clock start +// gettimeofday(&start_time, 0); + +// // platform related operations +// std::vector devices = xcl::get_xil_devices(); +// cl::Device device = devices[0]; + +// // Creating Context and Command Queue for selected Device +// cl::Context context(device, NULL, NULL, NULL, &fail); +// logger.logCreateContext(fail); +// cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &fail); +// logger.logCreateCommandQueue(fail); +// std::string devName = device.getInfo(); +// printf("INFO: Found Device=%s\n", devName.c_str()); +// cl::Program::Binaries xclBins = xcl::import_binary_file(xclbinPath); + +// devices.resize(1); +// cl::Program program(context, devices, xclBins, NULL, &fail); +// logger.logCreateProgram(fail); + +// int repInt = 1; +// // create kernels +// std::vector hls_lossy_enc_compute(repInt); +// for (int i = 0; i < repInt; i++) { +// hls_lossy_enc_compute[i] = cl::Kernel(program, "hls_lossy_enc_compute", &fail); +// logger.logCreateKernel(fail); +// } +// std::cout << "INFO: kernel has been created" << std::endl; + +// // 1. 
create all I/O Buffer +// float* hb_hls_opsin_1 = aligned_alloc(ALL_PIXEL); +// float* hb_hls_opsin_2 = aligned_alloc(ALL_PIXEL); +// float* hb_hls_opsin_3 = aligned_alloc(ALL_PIXEL); +// float* hb_hls_quant_field = aligned_alloc(BLOCK8_H * BLOCK8_W); +// float* hb_hls_masking_field = aligned_alloc(BLOCK8_H * BLOCK8_W); +// float* hb_aq_map_f = aligned_alloc(BLOCK8_H * BLOCK8_W); +// int8_t* hb_cmap_axi = aligned_alloc(TILE_W * TILE_H * 2); +// int32_t* hb_ac_coef_axiout = aligned_alloc(ALL_PIXEL); +// unsigned char* hb_strategy_all = aligned_alloc(BLOCK8_W * BLOCK8_H); +// int32_t* hb_raw_quant_field_i = aligned_alloc(BLOCK8_H * BLOCK8_W); +// uint32_t* hb_hls_order = aligned_alloc(MAX_ORDER); +// float* hb_hls_dc8x8 = aligned_alloc(ALL_PIXEL); +// float* hb_hls_dc16x16 = aligned_alloc(ALL_PIXEL); +// float* hb_hls_dc32x32 = aligned_alloc(ALL_PIXEL); +// int32_t* hb_config = aligned_alloc(MAX_NUM_CONFIG); +// float* hb_config_fl = aligned_alloc(MAX_NUM_CONFIG); + +// //================================================== +// // 2. 
init all the host Buffers +// //================================================== + +// // input port +// for (int j = 0; j < MAX_NUM_CONFIG; j++) { +// hb_config[j] = config[j]; +// } + +// for (int j = 0; j < MAX_NUM_CONFIG; j++) { +// hb_config_fl[j] = config_fl[j]; +// } + +// for (int j = 0; j < ALL_PIXEL; j++) { +// hb_hls_opsin_1[j] = hls_opsin_1[j]; +// } + +// for (int j = 0; j < ALL_PIXEL; j++) { +// hb_hls_opsin_2[j] = hls_opsin_2[j]; +// } + +// for (int j = 0; j < ALL_PIXEL; j++) { +// hb_hls_opsin_3[j] = hls_opsin_3[j]; +// } + +// for (int j = 0; j < BLOCK8_H * BLOCK8_W; j++) { +// hb_hls_quant_field[j] = hls_quant_field[j]; +// } + +// for (int j = 0; j < BLOCK8_H * BLOCK8_W; j++) { +// hb_hls_masking_field[j] = hls_masking_field[j]; +// } + +// for (int j = 0; j < BLOCK8_H * BLOCK8_W; j++) { +// hb_aq_map_f[j] = aq_map_f[j]; +// } + +// // mapping to HBM banks +// std::vector mext_o(33); +// mext_o[0] = {(((unsigned int)(0)) | XCL_MEM_TOPOLOGY), hb_hls_opsin_1, 0}; +// mext_o[1] = {(((unsigned int)(1)) | XCL_MEM_TOPOLOGY), hb_hls_opsin_2, 0}; +// mext_o[2] = {(((unsigned int)(2)) | XCL_MEM_TOPOLOGY), hb_hls_opsin_3, 0}; +// mext_o[3] = {(((unsigned int)(3)) | XCL_MEM_TOPOLOGY), hb_hls_quant_field, 0}; +// mext_o[4] = {(((unsigned int)(4)) | XCL_MEM_TOPOLOGY), hb_hls_masking_field, 0}; +// mext_o[5] = {(((unsigned int)(5)) | XCL_MEM_TOPOLOGY), hb_aq_map_f, 0}; +// mext_o[6] = {(((unsigned int)(6)) | XCL_MEM_TOPOLOGY), hb_cmap_axi, 0}; +// mext_o[7] = {(((unsigned int)(7)) | XCL_MEM_TOPOLOGY), hb_ac_coef_axiout, 0}; +// mext_o[8] = {(((unsigned int)(8)) | XCL_MEM_TOPOLOGY), hb_strategy_all, 0}; +// mext_o[9] = {(((unsigned int)(9)) | XCL_MEM_TOPOLOGY), hb_raw_quant_field_i, 0}; +// mext_o[10] = {(((unsigned int)(10)) | XCL_MEM_TOPOLOGY), hb_hls_order, 0}; +// mext_o[11] = {(((unsigned int)(11)) | XCL_MEM_TOPOLOGY), hb_hls_dc8x8, 0}; +// mext_o[12] = {(((unsigned int)(12)) | XCL_MEM_TOPOLOGY), hb_hls_dc16x16, 0}; +// mext_o[13] = {(((unsigned 
int)(13)) | XCL_MEM_TOPOLOGY), hb_hls_dc32x32, 0}; +// mext_o[14] = {(((unsigned int)(14)) | XCL_MEM_TOPOLOGY), hb_config, 0}; +// mext_o[15] = {(((unsigned int)(15)) | XCL_MEM_TOPOLOGY), hb_config_fl, 0}; + +// //=================================================== +// // 3. create device Buffer and map dev buf to host buf, +// //=================================================== +// cl::Buffer db_hls_opsin_1; // mm1, input +// cl::Buffer db_hls_opsin_2; // mm2, input +// cl::Buffer db_hls_opsin_3; // mm3, input +// cl::Buffer db_hls_quant_field; // mm4, input +// cl::Buffer db_hls_masking_field; // mm5, input +// cl::Buffer db_aq_map_f; // mm6, input +// cl::Buffer db_cmap_axi; // mm7, output +// cl::Buffer db_ac_coef_axiout; // mm8, output +// cl::Buffer db_strategy_all; // mm9, output +// cl::Buffer db_raw_quant_field_i; // mm10, output +// cl::Buffer db_hls_order; // mm11, output +// cl::Buffer db_hls_dc8x8; // mm12, output +// cl::Buffer db_hls_dc16x16; // mm13, output +// cl::Buffer db_hls_dc32x32; // mm14, output +// cl::Buffer db_config; // mm15, input +// cl::Buffer db_config_fl; // mm16, input + +// // init cl Buffer +// db_hls_opsin_1 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * ALL_PIXEL, &mext_o[0]); + +// db_hls_opsin_2 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * ALL_PIXEL, &mext_o[1]); + +// db_hls_opsin_3 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * ALL_PIXEL, &mext_o[2]); + +// db_hls_quant_field = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * (BLOCK8_H * BLOCK8_W), &mext_o[3]); + +// db_hls_masking_field = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * (BLOCK8_H * BLOCK8_W), &mext_o[4]); + +// db_aq_map_f = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX 
| CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * (BLOCK8_H * BLOCK8_W), &mext_o[5]); + +// db_cmap_axi = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(int8_t) * (TILE_W * TILE_H * 2), &mext_o[6]); + +// db_ac_coef_axiout = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(int32_t) * ALL_PIXEL, &mext_o[7]); + +// db_strategy_all = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(u_char) * (BLOCK8_H * BLOCK8_W), &mext_o[8]); + +// db_raw_quant_field_i = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * (BLOCK8_H * BLOCK8_W), &mext_o[9]); + +// db_hls_order = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * MAX_ORDER, &mext_o[10]); + +// db_hls_dc8x8 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * ALL_PIXEL, &mext_o[11]); + +// db_hls_dc16x16 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * ALL_PIXEL, &mext_o[12]); + +// db_hls_dc32x32 = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * ALL_PIXEL, &mext_o[13]); + +// db_config = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * MAX_NUM_CONFIG, &mext_o[14]); + +// db_config_fl = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, +// sizeof(float) * MAX_NUM_CONFIG, &mext_o[15]); + +// //================================== +// // add Buffers to migrate +// std::vector ob_in; +// std::vector ob_out; + +// ob_in.push_back(db_config); +// ob_in.push_back(db_config_fl); +// ob_in.push_back(db_hls_opsin_1); +// ob_in.push_back(db_hls_opsin_2); +// ob_in.push_back(db_hls_opsin_3); +// 
ob_in.push_back(db_hls_quant_field); +// ob_in.push_back(db_hls_masking_field); +// ob_in.push_back(db_aq_map_f); + +// ob_out.push_back(db_cmap_axi); +// ob_out.push_back(db_ac_coef_axiout); +// ob_out.push_back(db_strategy_all); +// ob_out.push_back(db_raw_quant_field_i); +// ob_out.push_back(db_hls_order); +// ob_out.push_back(db_hls_dc8x8); +// ob_out.push_back(db_hls_dc16x16); +// ob_out.push_back(db_hls_dc32x32); + +// // set kernel args +// for (int i = 0; i < repInt; i++) { +// hls_lossy_enc_compute[i].setArg(0, db_config); +// hls_lossy_enc_compute[i].setArg(1, db_config_fl); +// hls_lossy_enc_compute[i].setArg(2, db_hls_opsin_1); +// hls_lossy_enc_compute[i].setArg(3, db_hls_opsin_2); +// hls_lossy_enc_compute[i].setArg(4, db_hls_opsin_3); +// hls_lossy_enc_compute[i].setArg(5, db_hls_quant_field); +// hls_lossy_enc_compute[i].setArg(6, db_hls_masking_field); +// hls_lossy_enc_compute[i].setArg(7, db_aq_map_f); +// hls_lossy_enc_compute[i].setArg(8, db_cmap_axi); +// hls_lossy_enc_compute[i].setArg(9, db_ac_coef_axiout); +// hls_lossy_enc_compute[i].setArg(10, db_strategy_all); +// hls_lossy_enc_compute[i].setArg(11, db_raw_quant_field_i); +// hls_lossy_enc_compute[i].setArg(12, db_hls_order); +// hls_lossy_enc_compute[i].setArg(13, db_hls_dc8x8); +// hls_lossy_enc_compute[i].setArg(14, db_hls_dc16x16); +// hls_lossy_enc_compute[i].setArg(15, db_hls_dc32x32); +// } + +// // launch kernel and calculate kernel execution time +// std::cout << "INFO: Kernel Start" << std::endl; +// // declare events +// std::vector events_write(1); +// std::vector events_kernel(1); +// std::vector events_read(1); + +// // migrate, +// q.enqueueMigrateMemObjects(ob_in, 0, nullptr, &events_write[0]); +// q.enqueueTask(hls_lossy_enc_compute[0], &events_write, &events_kernel[0]); +// q.enqueueMigrateMemObjects(ob_out, 1, &events_kernel, &events_read[0]); +// q.finish(); + +// struct timeval end_time; +// gettimeofday(&end_time, 0); +// std::cout << "INFO: Finish kernel execution" 
<< std::endl; +// std::cout << "INFO: Finish E2E execution" << std::endl; + +// // print related times +// unsigned long timeStart, timeEnd, exec_time0; +// std::cout << "-------------------------------------------------------" << std::endl; +// events_write[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); +// events_write[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); +// exec_time0 = (timeEnd - timeStart) / 1000.0; +// std::cout << "INFO: Data transfer from host to device: " << exec_time0 << " us\n"; +// std::cout << "-------------------------------------------------------" << std::endl; +// events_read[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); +// events_read[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); +// exec_time0 = (timeEnd - timeStart) / 1000.0; +// std::cout << "INFO: Kernel1 Data transfer from device to host: " << exec_time0 << " us\n"; +// std::cout << "-------------------------------------------------------" << std::endl; +// exec_time0 = 0; +// for (int i = 0; i < 1; ++i) { +// events_kernel[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); +// events_kernel[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); +// exec_time0 += (timeEnd - timeStart) / 1000.0; + +// std::cout << "INFO: Kernel" << i + 1 << " execution: " << (timeEnd - timeStart) / 1000.0 << " us\n"; +// std::cout << "-------------------------------------------------------" << std::endl; +// } +// std::cout << "INFO: kernel total execution: " << exec_time0 << " us\n"; +// std::cout << "-------------------------------------------------------" << std::endl; +// unsigned long exec_timeE2E = diff(&end_time, &start_time); +// std::cout << "INFO: FPGA execution time:" << exec_timeE2E << " us\n"; +// std::cout << "-------------------------------------------------------" << std::endl; + +// // output +// for (int j = 0; j < TILE_W * TILE_H * 2; j++) { +// cmap_axi[j] = hb_cmap_axi[j]; +// } + +// for (int j = 0; j < ALL_PIXEL; 
j++) { +// ac_coef_axiout[j] = hb_ac_coef_axiout[j]; +// } + +// for (int j = 0; j < BLOCK8_W * BLOCK8_H; j++) { +// strategy_all[j] = hb_strategy_all[j]; +// } + +// for (int j = 0; j < BLOCK8_H * BLOCK8_W; j++) { +// raw_quant_field_i[j] = hb_raw_quant_field_i[j]; +// } + +// for (int j = 0; j < MAX_ORDER; j++) { +// hls_order[j] = hb_hls_order[j]; +// } + +// for (int j = 0; j < ALL_PIXEL; j++) { +// hls_dc8x8[j] = hb_hls_dc8x8[j]; +// } + +// for (int j = 0; j < ALL_PIXEL; j++) { +// hls_dc16x16[j] = hb_hls_dc16x16[j]; +// } + +// for (int j = 0; j < ALL_PIXEL; j++) { +// hls_dc32x32[j] = hb_hls_dc32x32[j]; +// } + +// // free mem +// free(hb_hls_opsin_1); +// free(hb_hls_opsin_2); +// free(hb_hls_opsin_3); +// free(hb_hls_quant_field); +// free(hb_hls_masking_field); +// free(hb_aq_map_f); +// free(hb_cmap_axi); +// free(hb_ac_coef_axiout); +// free(hb_strategy_all); +// free(hb_raw_quant_field_i); +// free(hb_hls_order); +// free(hb_hls_dc8x8); +// free(hb_hls_dc16x16); +// free(hb_hls_dc32x32); +// free(hb_config); +// free(hb_config_fl); +#else + hls_lossy_enc_compute(config, config_fl, hls_opsin_1, hls_opsin_2, hls_opsin_3, hls_quant_field, hls_masking_field, + aq_map_f, cmap_axi, ac_coef_axiout, strategy_all, raw_quant_field_i, hls_order, hls_dc8x8, + hls_dc16x16, hls_dc32x32); +#endif +} + +#endif diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host/host_lossy_enc_compute.hpp b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host/host_lossy_enc_compute.hpp new file mode 100644 index 0000000000..cb79414c90 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/host/host_lossy_enc_compute.hpp @@ -0,0 +1,60 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HOST_LOSSY_ENC_COMPUTE_SC_HPP +#define HOST_LOSSY_ENC_COMPUTE_SC_HPP + +#include +#include + +#ifndef HLS_TEST +#include "xf_utils_sw/logger.hpp" +#include "hls_lossy_enc_compute.hpp" + +// const int PIXEL_W = 2048; +// const int PIXEL_H = 2048; +// const int FRAME_DIM = 3; +// const int ALL_PIXEL = PIXEL_W * PIXEL_H * FRAME_DIM; +// const int BLOCK8_W = PIXEL_W / 8; +// const int BLOCK8_H = PIXEL_H / 8; +// const int BLOCK8_NUM = BLOCK8_W * BLOCK8_H * FRAME_DIM; +// const int TILE_W = PIXEL_W / 64; +// const int TILE_H = PIXEL_H / 64; +// const int MAX_ORDER = 320 * 3 + 1; +// const int MAX_NUM_CONFIG = 32; + +#else +#include "hls_lossy_enc_compute.hpp" +#endif + +void hls_lossy_enc_compute_wrapper(std::string xclbinPath, + int config[32], + float config_fl[32], + float* hls_opsin_1, + float* hls_opsin_2, + float* hls_opsin_3, + float* quant_field_row, + float* masking_field_row, + float* aq_map_f, + int8_t* cmap_axi, + int* ac_coef_axiout, + unsigned char* strategy_all, + int* raw_quant_field_i, + uint32_t* hls_order, + float* hls_dc8x8, + float* hls_dc16x16, + float* hls_dc32x32); +#endif diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel/hls_lossy_enc_compute.cpp b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel/hls_lossy_enc_compute.cpp new file mode 100644 index 0000000000..fdf8acb5a4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel/hls_lossy_enc_compute.cpp @@ -0,0 +1,9401 @@ +/* + * Copyright 2022 Xilinx, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HLS_LOSSY_ENC_COMPUTE_CPP +#define HLS_LOSSY_ENC_COMPUTE_CPP + +#include "hls_lossy_enc_compute.hpp" + +#define FIX + +// uint8_t covered_blocks_x_set[6] = {1, 1, 1, 1, 2, 4}; +// uint8_t covered_blocks_y_set[6] = {1, 1, 1, 1, 2, 4}; +uint8_t strategy_block[6] = {1, 1, 1, 1, 2, 4}; + +const float inv_matrix_8[3][64] = {{0, + 560, + 558.510437012, + 489.194152832, + 428.480621338, + 375.302246094, + 328.723815918, + 287.926147461, + 560, + 560, + 541.309387207, + 478.786773682, + 421.547454834, + 370.409942627, + 325.138336182, + 285.227325439, + 558.510437012, + 541.309387207, + 500.443756104, + 451.472991943, + 402.49432373, + 356.627593994, + 314.88571167, + 277.434692383, + 489.194152832, + 478.786773682, + 451.472991943, + 414.922729492, + 375.302246094, + 336.170715332, + 299.277435303, + 265.364807129, + 428.480621338, + 421.547454834, + 402.49432373, + 375.302246094, + 344.016448975, + 311.624298096, + 279.983337402, + 250.119842529, + 375.302246094, + 370.409942627, + 356.627593994, + 336.170715332, + 311.624298096, + 285.227325439, + 258.613525391, + 232.845169067, + 328.723815918, + 325.138336182, + 314.88571167, + 299.277435303, + 279.983337402, + 258.613525391, + 236.484725952, + 214.558776855, + 287.926147461, + 285.227325439, + 277.434692383, + 265.364807129, + 250.119842529, + 232.845169067, + 214.558776855, + 196.071777344}, + {0, + 3150, + 3139.25854492, + 2648.63037109, + 
2234.68115234, + 1885.42749023, + 1590.75805664, + 1342.14172363, + 3150, + 3150, + 3015.80957031, + 2576.58398438, + 2188.41503906, + 1853.96557617, + 1568.54064941, + 1326.02929688, + 3139.25854492, + 3015.80957031, + 2726.99536133, + 2389.61645508, + 2062.38256836, + 1765.96655273, + 1505.39343262, + 1279.74853516, + 2648.63037109, + 2576.58398438, + 2389.61645508, + 2144.4074707, + 1885.42749023, + 1637.12109375, + 1410.37487793, + 1208.78967285, + 2234.68115234, + 2188.41503906, + 2062.38256836, + 1885.42749023, + 1686.28210449, + 1485.42663574, + 1294.84509277, + 1060.59338379, + 1885.42749023, + 1853.96557617, + 1765.96655273, + 1637.12109375, + 1485.42663574, + 1326.02929688, + 1169.49206543, + 785.963012695, + 1590.75805664, + 1568.54064941, + 1505.39343262, + 1410.37487793, + 1294.84509277, + 1169.49206543, + 838.701721191, + 558.03729248, + 1342.14172363, + 1326.02929688, + 1279.74853516, + 1208.78967285, + 1060.59338379, + 785.963012695, + 558.03729248, + 382.654693604}, + {0, + 293.959503174, + 169.469955444, + 119.412483215, + 85.3333358765, + 85.3333358765, + 83.5508270264, + 58.8718566895, + 293.959503174, + 233.598114014, + 156.027160645, + 112.817504883, + 85.3333358765, + 85.3333358765, + 81.1647109985, + 57.4251747131, + 169.469955444, + 156.027160645, + 126.80493927, + 96.6006240845, + 85.3333358765, + 85.3333358765, + 74.5768890381, + 53.3726730347, + 119.412483215, + 112.817504883, + 96.6006240845, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 65.2038497925, + 47.4551811218, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 72.5535202026, + 54.6778106689, + 39.419506073, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 85.3333358765, + 72.5535202026, + 57.4251747131, + 44.3317565918, + 29.2122058868, + 83.5508270264, + 81.1647109985, + 74.5768890381, + 65.2038497925, + 54.6778106689, + 44.3317565918, + 31.1723690033, + 20.7407989502, + 58.8718566895, + 57.4251747131, + 53.3726730347, + 
47.4551811218, + 39.419506073, + 29.2122058868, + 20.7407989502, + 14.2222824097}}; +const float inv_matrix_16[3][256] = {{0, + 0, + 2384.4128418, + 2060.98974609, + 1763.60900879, + 1491.73779297, + 1261.77709961, + 1067.26635742, + 956.67767334, + 861.364074707, + 775.546569824, + 703.312927246, + 644.910888672, + 591.358520508, + 542.252990723, + 501.345214844, + 0, + 0, + 2303.75878906, + 2012.80981445, + 1727.63220215, + 1467.21154785, + 1244.41430664, + 1054.64306641, + 950.44720459, + 856.371826172, + 771.497619629, + 700.552734375, + 642.589599609, + 589.392944336, + 540.578857422, + 500.060272217, + 2384.4128418, + 2303.75878906, + 2113.18408203, + 1884.00744629, + 1629.57141113, + 1398.57958984, + 1195.04504395, + 1031.75708008, + 932.273986816, + 841.744262695, + 759.593811035, + 692.403076172, + 635.722961426, + 583.569458008, + 535.612548828, + 496.2421875, + 2060.98974609, + 2012.80981445, + 1884.00744629, + 1693.40161133, + 1491.73779297, + 1297.99816895, + 1120.69970703, + 996.043395996, + 903.588256836, + 818.460021973, + 740.524108887, + 679.239624023, + 624.590454102, + 574.100036621, + 528.409057617, + 489.997619629, + 1763.60900879, + 1727.63220215, + 1629.57141113, + 1491.73779297, + 1336.38830566, + 1179.42834473, + 1039.25634766, + 950.44720459, + 866.416687012, + 787.946533203, + 717.456176758, + 661.633422852, + 609.623046875, + 561.31427002, + 518.629089355, + 481.495361328, + 1491.73779297, + 1467.21154785, + 1398.57958984, + 1297.99816895, + 1179.42834473, + 1054.64294434, + 975.919921875, + 898.074401855, + 823.012390137, + 751.853820801, + 692.403076172, + 640.284667969, + 591.358520508, + 545.629760742, + 506.54699707, + 470.954223633, + 1261.77709961, + 1244.41430664, + 1195.04504395, + 1120.69970703, + 1039.25634766, + 975.919921875, + 909.174133301, + 841.744262695, + 775.546569824, + 714.580871582, + 664.092590332, + 615.952392578, + 570.392150879, + 528.409057617, + 492.477874756, + 458.628570557, + 1067.26635742, + 
1054.64306641, + 1031.75708008, + 996.043395996, + 950.44720459, + 898.074401855, + 841.744262695, + 783.770263672, + 726.22833252, + 679.239624023, + 633.465698242, + 589.392944336, + 547.332580566, + 510.515045166, + 476.757659912, + 444.792907715, + 956.67767334, + 950.44720459, + 932.273986816, + 903.588256836, + 866.416687012, + 823.012390137, + 775.546569824, + 726.22833252, + 684.443725586, + 642.589599609, + 601.375, + 561.31439209, + 524.175048828, + 491.234863281, + 459.72479248, + 429.72869873, + 861.364074707, + 856.371826172, + 841.744262695, + 818.460021973, + 787.946533203, + 751.853820801, + 714.580871582, + 679.239624023, + 642.589599609, + 605.472290039, + 568.554870605, + 532.708679199, + 501.345214844, + 470.954223633, + 441.705718994, + 413.71182251, + 775.546569824, + 771.497619629, + 759.593811035, + 740.524108887, + 717.456176758, + 692.403076172, + 664.092590332, + 633.465698242, + 601.375, + 568.554870605, + 535.612426758, + 506.546936035, + 477.933990479, + 450.024688721, + 423.003997803, + 395.167694092, + 703.312927246, + 700.552734375, + 692.403076172, + 679.239624023, + 661.633422852, + 640.284667969, + 615.952392578, + 589.392944336, + 561.31439209, + 532.708679199, + 506.546936035, + 480.302856445, + 454.290039062, + 428.756591797, + 403.216186523, + 375.228302002, + 644.910888672, + 642.589599609, + 635.722961426, + 624.590454102, + 609.623046875, + 591.358520508, + 570.392150879, + 547.332580566, + 524.175048828, + 501.345214844, + 477.933990479, + 454.290039062, + 430.704803467, + 407.340545654, + 380.75769043, + 355.171173096, + 591.358520508, + 589.392944336, + 583.569458008, + 574.100036621, + 561.31427002, + 545.629760742, + 528.409057617, + 510.515045166, + 491.234863281, + 470.954223633, + 450.024688721, + 428.756591797, + 407.340545654, + 382.62991333, + 358.535705566, + 335.223266602, + 542.252990723, + 540.578857422, + 535.612548828, + 528.409057617, + 518.629089355, + 506.54699707, + 492.477874756, + 476.757659912, + 
459.72479248, + 441.705718994, + 423.003997803, + 403.216186523, + 380.75769043, + 358.535705566, + 336.753845215, + 315.57409668, + 501.345214844, + 500.060272217, + 496.2421875, + 489.997619629, + 481.495361328, + 470.954223633, + 458.628570557, + 444.792907715, + 429.72869873, + 413.71182251, + 395.167694092, + 375.228302002, + 355.171173096, + 335.223266602, + 315.57409668, + 296.378265381}, + {0, + 0, + 5616.41552734, + 4437.54785156, + 3710.52368164, + 3312.08374023, + 2956.42822266, + 2638.96386719, + 2378.97973633, + 2146.23095703, + 1936.2532959, + 1722.18615723, + 1498.60571289, + 1304.05163574, + 1134.75488281, + 951.882019043, + 0, + 0, + 5312.58251953, + 4271.09716797, + 3658.99584961, + 3275.03710938, + 2928.76391602, + 2617.74536133, + 2363.77954102, + 2134.02709961, + 1926.33569336, + 1711.35717773, + 1489.96264648, + 1297.10559082, + 1129.14038086, + 946.136962891, + 5616.41552734, + 5312.58251953, + 4620.59277344, + 3880.56469727, + 3516.76147461, + 3170.29418945, + 2849.4152832, + 2562.00634766, + 2319.43164062, + 2098.26171875, + 1897.17285156, + 1679.53442383, + 1464.50524902, + 1276.60888672, + 1112.54638672, + 929.184143066, + 4437.54785156, + 4271.09716797, + 3880.56469727, + 3609.64770508, + 3312.08374023, + 3013.74951172, + 2727.90283203, + 2474.97729492, + 2249.39648438, + 2041.30578613, + 1850.4362793, + 1628.60998535, + 1423.58496094, + 1243.54284668, + 1077.57275391, + 901.836975098, + 3710.52368164, + 3658.99584961, + 3516.76147461, + 3312.08374023, + 3073.94458008, + 2824.09741211, + 2580.27368164, + 2363.77954102, + 2158.58081055, + 1966.61950684, + 1778.07653809, + 1561.42590332, + 1369.25976562, + 1199.41723633, + 1031.11547852, + 865.35723877, + 3312.08374023, + 3275.03710938, + 3170.29418945, + 3013.74951172, + 2824.09741211, + 2617.74511719, + 2425.91333008, + 2235.92993164, + 2052.44384766, + 1878.20617676, + 1679.53442383, + 1481.39880371, + 1304.05163574, + 1146.11157227, + 975.344787598, + 821.329833984, + 2956.42822266, + 
2928.76391602, + 2849.4152832, + 2727.90283203, + 2580.27368164, + 2425.91333008, + 2263.03759766, + 2098.26171875, + 1936.2532959, + 1766.65966797, + 1570.74584961, + 1392.13525391, + 1230.68457031, + 1077.57275391, + 912.64251709, + 771.521240234, + 2638.96386719, + 2617.74536133, + 2562.00634766, + 2474.97729492, + 2363.77954102, + 2235.92993164, + 2098.26171875, + 1956.39318848, + 1813.07836914, + 1628.60998535, + 1456.17285156, + 1297.10559082, + 1151.85449219, + 993.464355469, + 845.405334473, + 717.737731934, + 2378.97973633, + 2363.77954102, + 2319.43164062, + 2249.39648438, + 2158.58081055, + 2052.44384766, + 1936.2532959, + 1813.07836914, + 1648.67211914, + 1489.96264648, + 1339.6640625, + 1199.41748047, + 1057.31555176, + 907.217956543, + 775.878479004, + 661.709289551, + 2146.23095703, + 2134.02709961, + 2098.26171875, + 2041.30578613, + 1966.61950684, + 1878.20617676, + 1766.65966797, + 1628.60998535, + 1489.96264648, + 1354.33557129, + 1224.33178711, + 1098.37109375, + 951.882019043, + 821.329833984, + 706.041503906, + 604.99597168, + 1936.2532959, + 1926.33569336, + 1897.17285156, + 1850.4362793, + 1778.07653809, + 1679.53442383, + 1570.74584961, + 1456.17285156, + 1339.6640625, + 1224.33178711, + 1112.54614258, + 975.344482422, + 850.33416748, + 737.812194824, + 637.541503906, + 531.866638184, + 1722.18615723, + 1711.35717773, + 1679.53442383, + 1628.60998535, + 1561.42590332, + 1481.39880371, + 1392.13525391, + 1297.10559082, + 1199.41748047, + 1098.37109375, + 975.344482422, + 860.309997559, + 754.414855957, + 658.18359375, + 565.168762207, + 455.065155029, + 1498.60571289, + 1489.96264648, + 1464.50524902, + 1423.58496094, + 1369.25976562, + 1304.05163574, + 1230.68457031, + 1151.85449219, + 1057.31555176, + 951.882019043, + 850.33416748, + 754.414855957, + 665.260375977, + 582.761047363, + 475.564758301, + 385.666412354, + 1304.05163574, + 1297.10559082, + 1276.60888672, + 1243.54284668, + 1199.41723633, + 1146.11157227, + 1077.57275391, + 
993.464355469, + 907.217956543, + 821.329833984, + 737.812194824, + 658.18359375, + 582.761047363, + 482.643035889, + 396.775939941, + 324.039428711, + 1134.75488281, + 1129.14038086, + 1112.54638672, + 1077.57275391, + 1031.11547852, + 975.344787598, + 912.64251709, + 845.405334473, + 775.878479004, + 706.041503906, + 637.541503906, + 565.168762207, + 475.564758301, + 396.775939941, + 328.516326904, + 270.136077881, + 951.882019043, + 946.136962891, + 929.184143066, + 901.836975098, + 865.35723877, + 821.329833984, + 771.521240234, + 717.737731934, + 661.709289551, + 604.99597168, + 531.866638184, + 455.065155029, + 385.666412354, + 324.039428711, + 270.136077881, + 223.60848999}, + {0, + 0, + 615.613830566, + 448.953399658, + 337.930267334, + 263.807556152, + 205.943115234, + 160.770889282, + 141.832733154, + 126.301643372, + 112.471244812, + 100.763389587, + 91.1208114624, + 82.4009933472, + 74.5156097412, + 58.8962364197, + 0, + 0, + 571.402038574, + 426.532226562, + 327.784393311, + 257.417816162, + 201.765563965, + 157.966430664, + 140.812332153, + 125.492965698, + 111.822540283, + 100.304679871, + 90.7403564453, + 82.0832748413, + 74.2487335205, + 58.3933258057, + 615.613830566, + 571.402038574, + 473.941894531, + 372.602783203, + 300.644775391, + 239.80960083, + 190.039825439, + 154.182662964, + 137.840042114, + 123.126365662, + 109.91746521, + 98.952003479, + 89.6162185669, + 81.1429672241, + 73.4578170776, + 56.9167442322, + 448.953399658, + 426.532226562, + 372.602783203, + 318.224456787, + 263.807556152, + 214.746795654, + 172.817260742, + 148.295852661, + 133.160797119, + 119.368148804, + 106.872108459, + 96.7725219727, + 87.7978591919, + 79.6171722412, + 70.2083129883, + 54.5584373474, + 337.930267334, + 327.784393311, + 300.644775391, + 263.807556152, + 224.206954956, + 186.378311157, + 155.421569824, + 140.812332153, + 127.120582581, + 114.460098267, + 103.118339539, + 93.8680496216, + 85.3613052368, + 77.5634307861, + 65.9593734741, + 
51.4587516785, + 263.807556152, + 257.417816162, + 239.80960083, + 214.746795654, + 186.378311157, + 157.966400146, + 144.988540649, + 132.263153076, + 120.102050781, + 108.680435181, + 98.952003479, + 90.3628005981, + 82.4009933472, + 75.0543060303, + 60.9631996155, + 47.7897415161, + 205.943115234, + 201.765563965, + 190.039825439, + 172.817260742, + 155.421569824, + 144.988540649, + 134.070770264, + 123.126365662, + 112.471244812, + 102.638969421, + 94.2730102539, + 86.3905029297, + 79.0208206177, + 70.2083129883, + 55.4867515564, + 43.7368011475, + 160.770889282, + 157.966430664, + 154.182662964, + 148.295852661, + 140.812332153, + 132.263153076, + 123.126365662, + 113.789886475, + 104.582710266, + 96.7725219727, + 89.2471008301, + 82.0832748413, + 75.3261566162, + 62.5737113953, + 49.7861824036, + 39.4813766479, + 141.832733154, + 140.812332153, + 137.840042114, + 133.160797119, + 127.120582581, + 120.102050781, + 112.471244812, + 104.582710266, + 97.6333694458, + 90.7403564453, + 84.0226669312, + 77.5634460449, + 68.3460235596, + 55.020149231, + 44.0871162415, + 35.1875991821, + 126.301643372, + 125.492965698, + 123.126365662, + 119.368148804, + 114.460098267, + 108.680435181, + 102.638969421, + 96.7725219727, + 90.7403564453, + 84.6872787476, + 78.7255554199, + 72.1355895996, + 58.8962364197, + 47.7897415161, + 38.5730819702, + 30.993062973, + 112.471244812, + 111.822540283, + 109.91746521, + 106.872108459, + 103.118339539, + 98.952003479, + 94.2730102539, + 89.2471008301, + 84.0226669312, + 78.7255554199, + 73.4578094482, + 60.9631729126, + 50.1978492737, + 41.0546913147, + 33.3810348511, + 24.7806758881, + 100.763389587, + 100.304679871, + 98.952003479, + 96.7725219727, + 93.8680496216, + 90.3628005981, + 86.3905029297, + 82.0832748413, + 77.5634460449, + 72.1355895996, + 60.9631729126, + 51.0341072083, + 42.3694725037, + 34.9223136902, + 27.7260704041, + 18.5722160339, + 91.1208114624, + 90.7403564453, + 89.6162185669, + 87.7978591919, + 85.3613052368, + 
82.4009933472, + 79.0208206177, + 75.3261566162, + 68.3460235596, + 58.8962364197, + 50.1978492737, + 42.3694725037, + 35.4553947449, + 29.343132019, + 20.1489048004, + 13.676407814, + 82.4009933472, + 82.0832748413, + 81.1429672241, + 79.6171722412, + 77.5634307861, + 75.0543060303, + 70.2083129883, + 62.5737113953, + 55.020149231, + 47.7897415161, + 41.0546913147, + 34.9223136902, + 29.343132019, + 20.7069969177, + 14.4138498306, + 9.9115486145, + 74.5156097412, + 74.2487335205, + 73.4578170776, + 70.2083129883, + 65.9593734741, + 60.9631996155, + 55.4867515564, + 49.7861824036, + 44.0871162415, + 38.5730819702, + 33.3810348511, + 27.7260704041, + 20.1489048004, + 14.4138498306, + 10.166267395, + 7.07980155945, + 58.8962364197, + 58.3933258057, + 56.9167442322, + 54.5584373474, + 51.4587516785, + 47.7897415161, + 43.7368011475, + 39.4813766479, + 35.1875991821, + 30.993062973, + 24.7806758881, + 18.5722160339, + 13.676407814, + 9.9115486145, + 7.07980155945, + 4.99121952057}}; +const float inv_matrix_32[3][1024] = {{0, + 0, + 0, + 0, + 5011.67871094, + 4561.02685547, + 4150.89794922, + 3787.85327148, + 3459.89013672, + 3160.32299805, + 2886.69311523, + 2636.75488281, + 2408.45727539, + 2220.78833008, + 2069.29418945, + 1928.13452148, + 1796.60424805, + 1674.04626465, + 1559.84912109, + 1455.32824707, + 1364.40710449, + 1279.16601562, + 1199.25048828, + 1124.32775879, + 1054.08581543, + 988.231933594, + 932.328857422, + 879.889831543, + 830.400390625, + 783.694335938, + 739.61541748, + 698.015563965, + 0, + 0, + 0, + 0, + 4953.88232422, + 4518.67041016, + 4118.65429688, + 3763.55249023, + 3440.43725586, + 3144.51098633, + 2873.68359375, + 2625.9453125, + 2399.40185547, + 2214.77026367, + 2064.08569336, + 1923.60375977, + 1792.64550781, + 1670.57409668, + 1556.79296875, + 1452.8614502, + 1362.2097168, + 1277.20385742, + 1197.49438477, + 1122.75280762, + 1052.67053223, + 986.958068848, + 931.291748047, + 878.947387695, + 829.542602539, + 782.912841797, + 
738.902404785, + 697.364379883, + 0, + 0, + 0, + 0, + 4793.61474609, + 4398.46826172, + 4026.78955078, + 3692.97387695, + 3383.59692383, + 3098.10839844, + 2835.38208008, + 2594.04101562, + 2372.62280273, + 2196.91870117, + 2048.6171875, + 1910.1348877, + 1780.86755371, + 1660.23608398, + 1547.6887207, + 1445.50598145, + 1355.65515137, + 1271.34863281, + 1192.25231934, + 1118.05004883, + 1048.44384766, + 983.335632324, + 928.192504883, + 876.130004883, + 826.978088379, + 780.575439453, + 736.76965332, + 695.41619873, + 0, + 0, + 0, + 0, + 4561.02685547, + 4217.54345703, + 3889.28466797, + 3582.40161133, + 3293.56469727, + 3024.01489258, + 2773.8503418, + 2542.54394531, + 2329.23535156, + 2167.82006836, + 2023.34472656, + 1888.08752441, + 1761.55737305, + 1643.26379395, + 1532.72387695, + 1433.39599609, + 1344.85412598, + 1261.69274902, + 1183.6015625, + 1110.28479004, + 1041.46118164, + 977.688903809, + 923.064758301, + 871.467163086, + 822.732299805, + 776.704589844, + 733.236633301, + 692.188110352, + 5011.67871094, + 4953.88232422, + 4793.61474609, + 4561.02685547, + 4287.29882812, + 3998.23925781, + 3716.125, + 3440.43725586, + 3176.31298828, + 2926.47753906, + 2692.17285156, + 2473.73583984, + 2276.53393555, + 2128.3894043, + 1988.98718262, + 1858.03149414, + 1735.17175293, + 1620.02612305, + 1512.1998291, + 1416.7467041, + 1329.98596191, + 1248.38586426, + 1171.66845703, + 1099.56396484, + 1031.81274414, + 969.874816895, + 915.964599609, + 865.006896973, + 816.846923828, + 771.336730957, + 728.335144043, + 687.707824707, + 4561.02685547, + 4518.67041016, + 4398.46826172, + 4217.54345703, + 3998.23925781, + 3763.55249023, + 3519.85961914, + 3276.21459961, + 3038.5234375, + 2810.43310547, + 2594.04101562, + 2390.41162109, + 2220.78833008, + 2079.79077148, + 1946.46655273, + 1820.70666504, + 1702.30786133, + 1591.01000977, + 1486.51586914, + 1395.84521484, + 1311.29003906, + 1231.62927246, + 1156.62243652, + 1086.03051758, + 1019.62091064, + 959.981201172, + 
906.967834473, + 856.815124512, + 809.379150391, + 764.521179199, + 722.108276367, + 682.245117188, + 4150.89794922, + 4118.65429688, + 4026.78955078, + 3889.28466797, + 3716.125, + 3519.85961914, + 3311.12646484, + 3098.10839844, + 2886.69311523, + 2680.90209961, + 2483.34204102, + 2295.77954102, + 2156.40185547, + 2023.34472656, + 1896.84716797, + 1776.97375488, + 1663.66943359, + 1556.79296875, + 1457.80285645, + 1371.0369873, + 1289.05615234, + 1211.6673584, + 1138.67041016, + 1069.86108398, + 1005.03588867, + 948.117004395, + 896.168579102, + 846.973266602, + 800.399719238, + 756.319885254, + 714.610473633, + 675.848571777, + 3787.85327148, + 3763.55249023, + 3692.97387695, + 3582.40161133, + 3440.43725586, + 3276.21459961, + 3098.10839844, + 2913.08789062, + 2726.56884766, + 2542.54394531, + 2363.82275391, + 2214.77026367, + 2085.0793457, + 1960.44067383, + 1841.2644043, + 1727.76757812, + 1620.02624512, + 1518.01391602, + 1426.21765137, + 1342.71228027, + 1263.61376953, + 1188.7791748, + 1118.05004883, + 1051.25854492, + 988.232055664, + 934.408508301, + 883.676330566, + 835.576843262, + 789.991882324, + 746.805908203, + 705.905090332, + 668.410766602, + 3459.89013672, + 3440.43725586, + 3383.59692383, + 3293.56469727, + 3176.31298828, + 3038.5234375, + 2886.69311523, + 2726.56884766, + 2562.8984375, + 2399.40185547, + 2251.3984375, + 2128.38916016, + 2008.47729492, + 1892.4576416, + 1780.86755371, + 1674.04638672, + 1572.18115234, + 1475.3458252, + 1391.27539062, + 1311.29003906, + 1235.31958008, + 1163.2689209, + 1095.02172852, + 1030.44628906, + 970.984558105, + 918.996520996, + 869.613464355, + 822.732299805, + 778.249511719, + 736.060913086, + 696.064697266, + 659.988525391, + 3160.32299805, + 3144.51098633, + 3098.10839844, + 3024.01489258, + 2926.47753906, + 2810.43310547, + 2680.90209961, + 2542.54394531, + 2399.40185547, + 2263.89282227, + 2150.73901367, + 2038.4329834, + 1928.13452148, + 1820.70666504, + 1716.77062988, + 1616.75537109, + 
1520.93566895, + 1433.39599609, + 1353.48266602, + 1277.20385742, + 1204.54553223, + 1135.45678711, + 1069.86108398, + 1007.66223145, + 952.402770996, + 902.032531738, + 854.112670898, + 808.556152344, + 765.27355957, + 724.174499512, + 685.184509277, + 650.644348145, + 2886.69311523, + 2873.68359375, + 2835.38208008, + 2773.8503418, + 2692.17285156, + 2594.04101562, + 2483.34204102, + 2363.82275391, + 2251.3984375, + 2150.73901367, + 2048.6171875, + 1946.46655273, + 1845.42907715, + 1746.38916016, + 1650.01464844, + 1556.79296875, + 1467.77758789, + 1389.00024414, + 1313.34448242, + 1240.89025879, + 1171.66845703, + 1105.66882324, + 1042.85168457, + 983.335632324, + 932.328857422, + 883.676208496, + 837.313964844, + 793.171142578, + 751.173278809, + 711.242553711, + 674.120605469, + 640.444885254, + 2636.75488281, + 2625.9453125, + 2594.04101562, + 2542.54394531, + 2473.73583984, + 2390.41162109, + 2295.77954102, + 2214.77026367, + 2128.38916016, + 2038.4329834, + 1946.46655273, + 1853.81237793, + 1761.55737305, + 1670.57421875, + 1581.54528809, + 1494.99133301, + 1416.74682617, + 1342.71228027, + 1271.34875488, + 1202.77600098, + 1137.06164551, + 1074.23132324, + 1014.27752686, + 959.981201172, + 910.948364258, + 864.090393066, + 819.361022949, + 776.704589844, + 736.060913086, + 697.364379883, + 662.218017578, + 629.4609375, + 2408.45727539, + 2399.40185547, + 2372.62280273, + 2329.23535156, + 2276.53393555, + 2220.78833008, + 2156.40185547, + 2085.0793457, + 2008.47729492, + 1928.13452148, + 1845.42907715, + 1761.55737305, + 1677.53186035, + 1594.18786621, + 1512.1998291, + 1435.8034668, + 1364.40710449, + 1295.05419922, + 1227.95825195, + 1163.2689209, + 1101.08483887, + 1041.46118164, + 984.471923828, + 935.451171875, + 888.446838379, + 843.440002441, + 800.399719238, + 759.286071777, + 720.051269531, + 682.831237793, + 649.558654785, + 617.765075684, + 2220.78833008, + 2214.77026367, + 2196.91870117, + 2167.82006836, + 2128.3894043, + 2079.79077148, + 
2023.34472656, + 1960.44067383, + 1892.4576416, + 1820.70666504, + 1746.38916016, + 1670.57421875, + 1594.18786621, + 1518.01391602, + 1445.50622559, + 1377.7244873, + 1311.29003906, + 1246.50488281, + 1183.6015625, + 1122.75280762, + 1064.07983398, + 1007.66223145, + 956.720947266, + 909.950622559, + 865.006896973, + 821.887329102, + 780.575439453, + 741.04473877, + 703.25994873, + 668.410766602, + 636.225524902, + 605.431762695, + 2069.29418945, + 2064.08569336, + 2048.6171875, + 2023.34472656, + 1988.98718262, + 1946.46655273, + 1896.84716797, + 1841.2644043, + 1780.86755371, + 1716.77062988, + 1650.01464844, + 1581.54528809, + 1512.1998291, + 1445.50622559, + 1382.21533203, + 1319.54187012, + 1257.8659668, + 1197.49438477, + 1138.67041016, + 1081.58068848, + 1026.36474609, + 974.326965332, + 928.192504883, + 883.676208496, + 840.805603027, + 799.591430664, + 760.030456543, + 722.108276367, + 685.801452637, + 653.370849609, + 622.301086426, + 592.535888672, + 1928.13452148, + 1923.60375977, + 1910.1348877, + 1888.08752441, + 1858.03149414, + 1820.70666504, + 1776.97375488, + 1727.76757812, + 1674.04638672, + 1616.75537109, + 1556.79296875, + 1494.99133301, + 1435.8034668, + 1377.7244873, + 1319.54187012, + 1261.69274902, + 1204.54553223, + 1148.40356445, + 1093.51477051, + 1040.0736084, + 988.232055664, + 942.805053711, + 899.092651367, + 856.815124512, + 816.011779785, + 776.704589844, + 738.902404785, + 702.601013184, + 668.97833252, + 637.802612305, + 607.867736816, + 579.151306152, + 1796.60424805, + 1792.64550781, + 1780.86755371, + 1761.55737305, + 1735.17175293, + 1702.30786133, + 1663.66943359, + 1620.02624512, + 1572.18115234, + 1520.93566895, + 1467.77758789, + 1416.74682617, + 1364.40710449, + 1311.29003906, + 1257.8659668, + 1204.54553223, + 1151.67944336, + 1099.56396484, + 1048.44384766, + 998.518310547, + 953.479187012, + 910.948364258, + 869.613342285, + 829.542602539, + 790.784973145, + 753.372253418, + 717.322814941, + 682.831237793, + 
651.732788086, + 621.794555664, + 593.00592041, + 565.35144043, + 1674.04626465, + 1670.57409668, + 1660.23608398, + 1643.26379395, + 1620.02612305, + 1591.01000977, + 1556.79296875, + 1518.01391602, + 1475.3458252, + 1433.39599609, + 1389.00024414, + 1342.71228027, + 1295.05419922, + 1246.50488281, + 1197.49438477, + 1148.40356445, + 1099.56396484, + 1051.25854492, + 1003.72686768, + 959.981201172, + 918.996520996, + 878.947387695, + 839.930419922, + 802.020446777, + 765.27355957, + 729.730224609, + 695.41619873, + 663.897949219, + 634.132019043, + 605.431762695, + 577.793395996, + 551.20690918, + 1559.84912109, + 1556.79296875, + 1547.6887207, + 1532.72387695, + 1512.1998291, + 1486.51586914, + 1457.80285645, + 1426.21765137, + 1391.27539062, + 1353.48266602, + 1313.34448242, + 1271.34875488, + 1227.95825195, + 1183.6015625, + 1138.67041016, + 1093.51477051, + 1048.44384766, + 1003.72686768, + 962.165222168, + 923.064758301, + 884.62689209, + 846.973266602, + 810.203491211, + 774.396850586, + 739.61541748, + 705.905090332, + 674.120605469, + 644.708618164, + 616.264648438, + 588.796264648, + 562.305175781, + 536.786437988, + 1455.32824707, + 1452.8614502, + 1445.50598145, + 1433.39599609, + 1416.7467041, + 1395.84521484, + 1371.0369873, + 1342.71228027, + 1311.29003906, + 1277.20385742, + 1240.89025879, + 1202.77600098, + 1163.2689209, + 1122.75280762, + 1081.58068848, + 1040.0736084, + 998.518310547, + 959.981201172, + 923.064758301, + 886.533447266, + 850.530761719, + 815.17779541, + 780.575439453, + 746.805908203, + 713.934814453, + 682.245117188, + 653.370849609, + 625.353942871, + 598.214355469, + 571.965026855, + 546.611877441, + 522.155395508, + 1364.40710449, + 1362.2097168, + 1355.65515137, + 1344.85412598, + 1329.98596191, + 1311.29003906, + 1289.05615234, + 1263.61376953, + 1235.31958008, + 1204.54553223, + 1171.66845703, + 1137.06164551, + 1101.08483887, + 1064.07983398, + 1026.36474609, + 988.232055664, + 953.479187012, + 918.996520996, + 
884.62689209, + 850.530761719, + 816.846923828, + 783.694335938, + 751.173278809, + 719.367553711, + 688.345153809, + 659.988647461, + 632.568969727, + 605.91784668, + 580.059387207, + 555.010437012, + 530.781066895, + 507.375701904, + 1279.16601562, + 1277.20385742, + 1271.34863281, + 1261.69274902, + 1248.38586426, + 1231.62927246, + 1211.6673584, + 1188.7791748, + 1163.2689209, + 1135.45678711, + 1105.66882324, + 1074.23132324, + 1041.46118164, + 1007.66223145, + 974.326965332, + 942.805053711, + 910.948364258, + 878.947387695, + 846.973266602, + 815.17779541, + 783.694335938, + 752.638000488, + 722.108276367, + 692.188110352, + 664.459472656, + 637.802490234, + 611.796875, + 586.477539062, + 561.871887207, + 537.999816895, + 514.875244141, + 492.505737305, + 1199.25048828, + 1197.49438477, + 1192.25231934, + 1183.6015625, + 1171.66845703, + 1156.62243652, + 1138.67041016, + 1118.05004883, + 1095.02172852, + 1069.86108398, + 1042.85168457, + 1014.27752686, + 984.471923828, + 956.720947266, + 928.192504883, + 899.092651367, + 869.613342285, + 839.930419922, + 810.203491211, + 780.575439453, + 751.173278809, + 722.108276367, + 693.4765625, + 666.712768555, + 640.975524902, + 615.765563965, + 591.129272461, + 567.10345459, + 543.718078613, + 520.995727539, + 498.952667236, + 480.805541992, + 1124.32775879, + 1122.75280762, + 1118.05004883, + 1110.28479004, + 1099.56396484, + 1086.03051758, + 1069.86108398, + 1051.25854492, + 1030.44628906, + 1007.66223145, + 983.335632324, + 959.981201172, + 935.451171875, + 909.950622559, + 883.676208496, + 856.815124512, + 829.542602539, + 802.020446777, + 774.396850586, + 746.805908203, + 719.367553711, + 692.188110352, + 666.712768555, + 642.038635254, + 617.765075684, + 593.947570801, + 570.63269043, + 547.859313965, + 525.65826416, + 504.054443359, + 484.735015869, + 470.0362854, + 1054.08581543, + 1052.67053223, + 1048.44384766, + 1041.46118164, + 1031.81274414, + 1019.62091064, + 1005.03588867, + 988.232055664, + 
970.984558105, + 952.402770996, + 932.328857422, + 910.948364258, + 888.446838379, + 865.006896973, + 840.805603027, + 816.011779785, + 790.784973145, + 765.27355957, + 739.61541748, + 713.934814453, + 688.345153809, + 664.459472656, + 640.975524902, + 617.765075684, + 594.891723633, + 572.41003418, + 550.3671875, + 528.801818848, + 507.746917725, + 487.717651367, + 473.343078613, + 459.212097168, + 988.231933594, + 986.958068848, + 983.335632324, + 977.688903809, + 969.874816895, + 959.981201172, + 948.117004395, + 934.408508301, + 918.996520996, + 902.032531738, + 883.676208496, + 864.090393066, + 843.440002441, + 821.887329102, + 799.591430664, + 776.704589844, + 753.372253418, + 729.730224609, + 705.905090332, + 682.245117188, + 659.988647461, + 637.802490234, + 615.765563965, + 593.947570801, + 572.41003418, + 551.20690918, + 530.384277344, + 509.981781006, + 490.03225708, + 475.728912354, + 461.936004639, + 448.361358643, + 932.328857422, + 931.291748047, + 928.192504883, + 923.064758301, + 915.964599609, + 906.967834473, + 896.168579102, + 883.676330566, + 869.613464355, + 854.112670898, + 837.313964844, + 819.361022949, + 800.399719238, + 780.575439453, + 760.030456543, + 738.902404785, + 717.322814941, + 695.41619873, + 674.120605469, + 653.370849609, + 632.568969727, + 611.796875, + 591.129272461, + 570.63269043, + 550.3671875, + 530.384277344, + 510.730102539, + 491.443481445, + 477.170257568, + 463.766784668, + 450.541046143, + 437.510101318, + 879.889831543, + 878.947387695, + 876.130004883, + 871.467163086, + 865.006896973, + 856.815124512, + 846.973266602, + 835.576843262, + 822.732299805, + 808.556152344, + 793.171142578, + 776.704589844, + 759.286071777, + 741.04473877, + 722.108276367, + 702.601013184, + 682.831237793, + 663.897949219, + 644.708618164, + 625.353942871, + 605.91784668, + 586.477539062, + 567.10345459, + 547.859313965, + 528.801818848, + 509.981781006, + 491.443481445, + 477.65222168, + 464.686828613, + 451.85736084, + 
439.183532715, + 426.682556152, + 830.400390625, + 829.542602539, + 826.978088379, + 822.732299805, + 816.846923828, + 809.379150391, + 800.399719238, + 789.991882324, + 778.249511719, + 765.27355957, + 751.173278809, + 736.060913086, + 720.051269531, + 703.25994873, + 685.801452637, + 668.97833252, + 651.732788086, + 634.132019043, + 616.264648438, + 598.214355469, + 580.059387207, + 561.871887207, + 543.718078613, + 525.65826416, + 507.746917725, + 490.03225708, + 477.170257568, + 464.686828613, + 452.297576904, + 440.024200439, + 427.886230469, + 415.901092529, + 783.694335938, + 782.912841797, + 780.575439453, + 776.704589844, + 771.336730957, + 764.521179199, + 756.319885254, + 746.805908203, + 736.060913086, + 724.174499512, + 711.242553711, + 697.364379883, + 682.831237793, + 668.410766602, + 653.370849609, + 637.802612305, + 621.794555664, + 605.431762695, + 588.796264648, + 571.965026855, + 555.010437012, + 537.999816895, + 520.995727539, + 504.054443359, + 487.717651367, + 475.728912354, + 463.766784668, + 451.85736084, + 440.024200439, + 428.288726807, + 416.670166016, + 405.185882568, + 739.61541748, + 738.902404785, + 736.76965332, + 733.236633301, + 728.335144043, + 722.108276367, + 714.610473633, + 705.905090332, + 696.064697266, + 685.184509277, + 674.120605469, + 662.218017578, + 649.558654785, + 636.225524902, + 622.301086426, + 607.867736816, + 593.00592041, + 577.793395996, + 562.305175781, + 546.611877441, + 530.781066895, + 514.875244141, + 498.952667236, + 484.735015869, + 473.343078613, + 461.936004639, + 450.541046143, + 439.183532715, + 427.886230469, + 416.670166016, + 405.554260254, + 394.555480957, + 698.015563965, + 697.364379883, + 695.41619873, + 692.188110352, + 687.707824707, + 682.245117188, + 675.848571777, + 668.410766602, + 659.988525391, + 650.644348145, + 640.444885254, + 629.4609375, + 617.765075684, + 605.431762695, + 592.535888672, + 579.151306152, + 565.35144043, + 551.20690918, + 536.786437988, + 522.155395508, + 
507.375701904, + 492.505737305, + 480.805541992, + 470.0362854, + 459.212097168, + 448.361358643, + 437.510101318, + 426.682556152, + 415.901092529, + 405.185882568, + 394.555480957, + 384.026672363}, + {0, + 0, + 0, + 0, + 10016.1787109, + 8949.01855469, + 7995.55859375, + 7162.60107422, + 6422.47558594, + 5758.82910156, + 5163.75830078, + 4630.17675781, + 4151.73242188, + 3734.18823242, + 3370.10986328, + 3041.52880859, + 2744.98388672, + 2477.35107422, + 2235.81323242, + 2038.74963379, + 1932.10974121, + 1831.04748535, + 1735.27160645, + 1644.50561523, + 1558.48730469, + 1476.96801758, + 1386.82666016, + 1301.52868652, + 1221.47717285, + 1146.34912109, + 1075.84216309, + 1009.67150879, + 0, + 0, + 0, + 0, + 9878.22460938, + 8849.74414062, + 7921.35595703, + 7107.29541016, + 6379.01171875, + 5724.14550781, + 5135.74365234, + 4607.32568359, + 4132.93945312, + 3719.50512695, + 3357.80053711, + 3031.15722656, + 2736.20654297, + 2469.89428711, + 2229.45581055, + 2035.87133789, + 1929.51806641, + 1828.70812988, + 1733.15539551, + 1642.58703613, + 1556.74450684, + 1475.38232422, + 1385.13513184, + 1300, + 1220.09375, + 1145.09570312, + 1074.70495605, + 1008.63867188, + 0, + 0, + 0, + 0, + 9497.34082031, + 8569.00976562, + 7710.1953125, + 6947.08251953, + 6252.30078125, + 5622.56835938, + 5053.41699219, + 4539.99316406, + 4077.45068359, + 3676.05541992, + 3321.32641602, + 3000.390625, + 2710.14355469, + 2447.73339844, + 2210.55053711, + 2027.28417969, + 1921.78308105, + 1821.72375488, + 1726.8347168, + 1636.85534668, + 1551.53735352, + 1470.2409668, + 1380.08117676, + 1295.43139648, + 1215.95825195, + 1141.34753418, + 1071.30383301, + 1005.54919434, + 0, + 0, + 0, + 0, + 8949.01855469, + 8149.28955078, + 7394.22412109, + 6697.34423828, + 6052.48828125, + 5461.01953125, + 4921.63427734, + 4431.66796875, + 3987.81860352, + 3605.57324219, + 3262.00317383, + 2950.23999023, + 2667.58154297, + 2411.48632812, + 2179.58496094, + 2013.13024902, + 1909.02331543, + 1810.1940918, + 
1716.39379883, + 1627.38232422, + 1542.92712402, + 1460.9855957, + 1371.72302246, + 1287.87316895, + 1209.11425781, + 1135.14245605, + 1065.67199707, + 1000.43200684, + 10016.1787109, + 9878.22460938, + 9497.34082031, + 8949.01855469, + 8310.703125, + 7644.40527344, + 6999.56738281, + 6379.01171875, + 5793.93896484, + 5249.58837891, + 4747.62841797, + 4287.62841797, + 3871.05200195, + 3510.74609375, + 3181.88964844, + 2882.29760742, + 2609.7644043, + 2362.1328125, + 2137.33789062, + 1993.63818359, + 1891.43066406, + 1794.28063965, + 1701.97070312, + 1614.28540039, + 1531.01391602, + 1448.18615723, + 1360.15734863, + 1277.40795898, + 1199.63305664, + 1126.54284668, + 1057.86291504, + 993.333496094, + 8949.01855469, + 8849.74414062, + 8569.00976562, + 8149.28955078, + 7644.40527344, + 7107.29541016, + 6556.77978516, + 6014.109375, + 5492.59033203, + 4999.91259766, + 4539.99316406, + 4114.296875, + 3734.18823242, + 3394.95922852, + 3083.60522461, + 2798.61328125, + 2538.30639648, + 2300.9543457, + 2084.83349609, + 1969.11364746, + 1869.26220703, + 1774.20117188, + 1683.74963379, + 1597.7220459, + 1515.93359375, + 1431.99487305, + 1345.51452637, + 1264.14855957, + 1187.61206055, + 1115.63220215, + 1047.94946289, + 984.940734863, + 7995.55859375, + 7921.35595703, + 7710.1953125, + 7394.22412109, + 6999.56738281, + 6556.77978516, + 6091.37744141, + 5622.56835938, + 5163.75830078, + 4723.70117188, + 4307.68896484, + 3918.65771484, + 3578.03271484, + 3262.00317383, + 2970.13378906, + 2701.54418945, + 2455.08642578, + 2229.45581055, + 2041.63623047, + 1939.92504883, + 1842.82971191, + 1750.22119141, + 1661.9576416, + 1577.88708496, + 1497.85339355, + 1412.59997559, + 1327.95666504, + 1248.23461914, + 1173.171875, + 1102.515625, + 1036.02380371, + 975.335021973, + 7162.60107422, + 7107.29541016, + 6947.08251953, + 6697.34423828, + 6379.01171875, + 6014.109375, + 5622.56835938, + 5220.67480469, + 4820.77636719, + 4431.66796875, + 4059.2434082, + 3719.50512695, + 
3407.50024414, + 3115.79980469, + 2844.60986328, + 2593.61132812, + 2362.13305664, + 2149.27954102, + 2004.73095703, + 1906.4909668, + 1812.48913574, + 1722.64440918, + 1636.85534668, + 1555.00537109, + 1476.96813965, + 1390.21911621, + 1307.67175293, + 1229.82922363, + 1156.4543457, + 1087.31677246, + 1022.19244385, + 964.170288086, + 6422.47558594, + 6379.01171875, + 6252.30078125, + 6052.48828125, + 5793.93896484, + 5492.59033203, + 5163.75830078, + 4820.77636719, + 4474.43066406, + 4132.93945312, + 3809.15112305, + 3510.74560547, + 3227.25976562, + 2960.15966797, + 2710.14355469, + 2477.3515625, + 2261.5234375, + 2062.12646484, + 1963.74353027, + 1869.26220703, + 1778.62731934, + 1691.80358887, + 1608.73010254, + 1529.32519531, + 1450.00341797, + 1365.09509277, + 1284.86962891, + 1209.11425781, + 1137.61865234, + 1070.17382812, + 1006.57757568, + 951.533935547, + 5758.82910156, + 5724.14550781, + 5622.56835938, + 5461.01953125, + 5249.58837891, + 4999.91259766, + 4723.70117188, + 4431.66796875, + 4132.93945312, + 3839.88183594, + 3564.3984375, + 3297.38085938, + 3041.52880859, + 2798.61328125, + 2569.67749023, + 2355.21166992, + 2155.28833008, + 2013.13024902, + 1919.21801758, + 1828.70812988, + 1741.64990234, + 1658.05053711, + 1577.88708496, + 1501.11230469, + 1419.60339355, + 1337.48803711, + 1259.77709961, + 1186.2878418, + 1116.83618164, + 1051.23815918, + 989.356018066, + 937.521850586, + 5163.75830078, + 5135.74365234, + 5053.41699219, + 4921.63427734, + 4747.62841797, + 4539.99316406, + 4307.68896484, + 4059.2434082, + 3809.15112305, + 3564.3984375, + 3321.32641602, + 3083.60522461, + 2853.95654297, + 2634.29614258, + 2425.88378906, + 2229.45581055, + 2053.26318359, + 1961.06884766, + 1871.70092773, + 1785.30419922, + 1701.97070312, + 1621.74597168, + 1544.64245605, + 1470.2409668, + 1386.82666016, + 1307.67150879, + 1232.63317871, + 1161.55883789, + 1094.29150391, + 1030.67077637, + 972.740783691, + 922.236572266, + 4630.17675781, + 4607.32568359, + 
4539.99316406, + 4431.66796875, + 4287.62841797, + 4114.296875, + 3918.65771484, + 3719.50512695, + 3510.74560547, + 3297.38085938, + 3083.60522461, + 2872.79980469, + 2667.58154297, + 2469.89453125, + 2281.10717773, + 2102.11450195, + 1993.63842773, + 1906.4909668, + 1821.72387695, + 1739.5189209, + 1660.00183105, + 1583.25292969, + 1509.31469727, + 1431.99487305, + 1351.99133301, + 1275.92382812, + 1203.68237305, + 1135.14245605, + 1070.17382812, + 1008.63867188, + 954.878356934, + 905.786682129, + 4151.73242188, + 4132.93945312, + 4077.45068359, + 3987.81860352, + 3871.05200195, + 3734.18823242, + 3578.03271484, + 3407.50024414, + 3227.25976562, + 3041.52880859, + 2853.95654297, + 2667.58154297, + 2484.84399414, + 2307.63012695, + 2137.33789062, + 2015.94567871, + 1932.10974121, + 1849.96801758, + 1769.79614258, + 1691.80358887, + 1616.14465332, + 1542.92712402, + 1472.10400391, + 1391.92028809, + 1315.41491699, + 1242.52575684, + 1173.171875, + 1107.25793457, + 1044.67651367, + 985.821166992, + 935.89440918, + 888.28326416, + 3734.18823242, + 3719.50512695, + 3676.05541992, + 3605.57324219, + 3510.74609375, + 3394.95922852, + 3262.00317383, + 3115.79980469, + 2960.15966797, + 2798.61328125, + 2634.29614258, + 2469.89453125, + 2307.63012695, + 2149.27954102, + 2027.2845459, + 1947.80200195, + 1869.26220703, + 1792.02880859, + 1716.39379883, + 1642.58703613, + 1570.78308105, + 1501.11230469, + 1426.6628418, + 1350.36767578, + 1277.40795898, + 1207.75256348, + 1141.34753418, + 1078.12231445, + 1017.99298096, + 964.170288086, + 915.916137695, + 869.840393066, + 3370.10986328, + 3357.80053711, + 3321.32641602, + 3262.00317383, + 3181.88964844, + 3083.60522461, + 2970.13378906, + 2844.60986328, + 2710.14355469, + 2569.67749023, + 2425.88378906, + 2281.10717773, + 2137.33789062, + 2027.2845459, + 1953.08789062, + 1879.05322266, + 1805.62060547, + 1733.15539551, + 1661.9576416, + 1592.26843262, + 1524.27880859, + 1455.47753906, + 1380.08117676, + 1307.67150879, + 
1238.27075195, + 1171.8729248, + 1108.44836426, + 1047.94946289, + 990.31439209, + 941.609558105, + 895.070007324, + 850.572570801, + 3041.52880859, + 3031.15722656, + 3000.390625, + 2950.23999023, + 2882.29760742, + 2798.61328125, + 2701.54418945, + 2593.61132812, + 2477.3515625, + 2355.21166992, + 2229.45581055, + 2102.11450195, + 2015.94567871, + 1947.80200195, + 1879.05322266, + 1810.1940918, + 1741.64990234, + 1673.77990723, + 1606.8861084, + 1541.21508789, + 1476.96813965, + 1403.92370605, + 1332.70874023, + 1264.14855957, + 1198.28808594, + 1135.14245605, + 1074.70495605, + 1016.94714355, + 965.022094727, + 918.278381348, + 873.481933594, + 830.592407227, + 2744.98388672, + 2736.20654297, + 2710.14355469, + 2667.58154297, + 2609.7644043, + 2538.30639648, + 2455.08642578, + 2362.13305664, + 2261.5234375, + 2155.28833008, + 2053.26318359, + 1993.63842773, + 1932.10974121, + 1869.26220703, + 1805.62060547, + 1741.64990234, + 1677.75500488, + 1614.28540039, + 1551.53735352, + 1489.75976562, + 1421.36291504, + 1351.99133301, + 1284.86938477, + 1220.09375, + 1157.72741699, + 1097.8046875, + 1040.33654785, + 985.821166992, + 939.153625488, + 894.312011719, + 851.274475098, + 810.01184082, + 2477.35107422, + 2469.89428711, + 2447.73339844, + 2411.48632812, + 2362.1328125, + 2300.9543457, + 2229.45581055, + 2149.27954102, + 2062.12646484, + 2013.13024902, + 1961.06884766, + 1906.4909668, + 1849.96801758, + 1792.02880859, + 1733.15539551, + 1673.77990723, + 1614.28540039, + 1555.00537109, + 1496.22851562, + 1431.99487305, + 1365.09509277, + 1300, + 1236.85754395, + 1175.77709961, + 1116.83618164, + 1060.08496094, + 1005.54919434, + 957.398681641, + 912.780822754, + 869.840393066, + 828.566345215, + 788.938354492, + 2235.81323242, + 2229.45581055, + 2210.55053711, + 2179.58496094, + 2137.33789062, + 2084.83349609, + 2041.63623047, + 2004.73095703, + 1963.74353027, + 1919.21801758, + 1871.70092773, + 1821.72387695, + 1769.79614258, + 1716.39379883, + 1661.9576416, + 
1606.8861084, + 1551.53735352, + 1496.22851562, + 1435.56774902, + 1371.72302246, + 1309.21435547, + 1248.23461914, + 1188.93847656, + 1131.44470215, + 1075.84216309, + 1022.19244385, + 972.740783691, + 928.625244141, + 886.038757324, + 844.988342285, + 805.471496582, + 767.476257324, + 2038.74963379, + 2035.87133789, + 2027.28417969, + 2013.13024902, + 1993.63818359, + 1969.11364746, + 1939.92504883, + 1906.4909668, + 1869.26220703, + 1828.70812988, + 1785.30419922, + 1739.5189209, + 1691.80358887, + 1642.58703613, + 1592.26843262, + 1541.21508789, + 1489.75976562, + 1431.99487305, + 1371.72302246, + 1312.30871582, + 1253.98498535, + 1196.94519043, + 1141.34753418, + 1087.31677246, + 1034.94958496, + 984.940734863, + 941.609558105, + 899.638793945, + 859.054748535, + 819.872619629, + 782.097106934, + 745.724487305, + 1932.10974121, + 1929.51806641, + 1921.78308105, + 1909.02331543, + 1891.43066406, + 1869.26220703, + 1842.82971191, + 1812.48913574, + 1778.62731934, + 1741.64990234, + 1701.97070312, + 1660.00183105, + 1616.14465332, + 1570.78308105, + 1524.27880859, + 1476.96813965, + 1421.36291504, + 1365.09509277, + 1309.21435547, + 1253.98498535, + 1199.63305664, + 1146.34912109, + 1094.29150391, + 1043.58874512, + 994.342956543, + 951.534057617, + 910.440185547, + 870.567016602, + 831.947387695, + 794.602966309, + 758.545227051, + 723.776733398, + 1831.04748535, + 1828.70812988, + 1821.72375488, + 1810.1940918, + 1794.28063965, + 1774.20117188, + 1750.22119141, + 1722.64440918, + 1691.80358887, + 1658.05053711, + 1621.74597168, + 1583.25292969, + 1542.92712402, + 1501.11230469, + 1455.47753906, + 1403.92370605, + 1351.99133301, + 1300, + 1248.23461914, + 1196.94519043, + 1146.34912109, + 1096.63146973, + 1047.94946289, + 1000.43200684, + 958.241088867, + 918.278259277, + 879.356750488, + 841.526489258, + 804.825805664, + 769.28125, + 734.910339355, + 701.721008301, + 1735.27160645, + 1733.15539551, + 1726.8347168, + 1716.39379883, + 1701.97070312, + 
1683.74963379, + 1661.9576416, + 1636.85534668, + 1608.73010254, + 1577.88708496, + 1544.64245605, + 1509.31469727, + 1472.10400391, + 1426.6628418, + 1380.08117676, + 1332.70874023, + 1284.86938477, + 1236.85754395, + 1188.93847656, + 1141.34753418, + 1094.29150391, + 1047.94946289, + 1002.47418213, + 961.622131348, + 923.031555176, + 885.292236328, + 848.471923828, + 812.623657227, + 777.789794922, + 744.001464844, + 711.280090332, + 684.97052002, + 1644.50561523, + 1642.58703613, + 1636.85534668, + 1627.38232422, + 1614.28540039, + 1597.7220459, + 1577.88708496, + 1555.00537109, + 1529.32519531, + 1501.11230469, + 1470.2409668, + 1431.99487305, + 1391.92028809, + 1350.36767578, + 1307.67150879, + 1264.14855957, + 1220.09375, + 1175.77709961, + 1131.44470215, + 1087.31677246, + 1043.58874512, + 1000.43200684, + 961.622131348, + 924.624450684, + 888.28326416, + 852.680969238, + 817.885742188, + 783.954040527, + 750.929992676, + 718.848205566, + 690.50970459, + 669.78717041, + 1558.48730469, + 1556.74450684, + 1551.53735352, + 1542.92712402, + 1531.01391602, + 1515.93359375, + 1497.85339355, + 1476.96813965, + 1450.00341797, + 1419.60339355, + 1386.82666016, + 1351.99133301, + 1315.41491699, + 1277.40795898, + 1238.27075195, + 1198.28808594, + 1157.72741699, + 1116.83618164, + 1075.84216309, + 1034.94958496, + 994.342956543, + 958.241088867, + 923.031555176, + 888.28326416, + 854.091186523, + 820.536315918, + 787.687927246, + 755.602600098, + 724.327697754, + 694.713867188, + 674.449768066, + 654.522705078, + 1476.96801758, + 1475.38232422, + 1470.2409668, + 1460.9855957, + 1448.18615723, + 1431.99487305, + 1412.59997559, + 1390.21911621, + 1365.09509277, + 1337.48803711, + 1307.67150879, + 1275.92382812, + 1242.52575684, + 1207.75256348, + 1171.8729248, + 1135.14245605, + 1097.8046875, + 1060.08496094, + 1022.19244385, + 984.940734863, + 951.534057617, + 918.278259277, + 885.292236328, + 852.680969238, + 820.536315918, + 788.938354492, + 757.955322266, + 
727.644897461, + 698.05480957, + 677.813598633, + 658.364379883, + 639.217041016, + 1386.82666016, + 1385.13513184, + 1380.08117676, + 1371.72302246, + 1360.15734863, + 1345.51452637, + 1327.95666504, + 1307.67175293, + 1284.86962891, + 1259.77709961, + 1232.63317871, + 1203.68237305, + 1173.171875, + 1141.34753418, + 1108.44836426, + 1074.70495605, + 1040.33654785, + 1005.54919434, + 972.740783691, + 941.609558105, + 910.440185547, + 879.356750488, + 848.471923828, + 817.885742188, + 787.687927246, + 757.955322266, + 728.75579834, + 700.14642334, + 679.84564209, + 660.946289062, + 642.291992188, + 623.906738281, + 1301.52868652, + 1300, + 1295.43139648, + 1287.87316895, + 1277.40795898, + 1264.14855957, + 1248.23461914, + 1229.82922363, + 1209.11425781, + 1186.2878418, + 1161.55883789, + 1135.14245605, + 1107.25793457, + 1078.12231445, + 1047.94946289, + 1016.94714355, + 985.821166992, + 957.398681641, + 928.625244141, + 899.638793945, + 870.567016602, + 841.526489258, + 812.623657227, + 783.954040527, + 755.602600098, + 727.644897461, + 700.14642334, + 680.525085449, + 662.243774414, + 644.148803711, + 626.268066406, + 608.625976562, + 1221.47717285, + 1220.09375, + 1215.95825195, + 1209.11425781, + 1199.63305664, + 1187.61206055, + 1173.171875, + 1156.4543457, + 1137.61865234, + 1116.83618164, + 1094.29150391, + 1070.17382812, + 1044.67651367, + 1017.99298096, + 990.31439209, + 965.022094727, + 939.153625488, + 912.780822754, + 886.038757324, + 859.054748535, + 831.947387695, + 804.825805664, + 777.789794922, + 750.929992676, + 724.327697754, + 698.05480957, + 679.84564209, + 662.243774414, + 644.769775391, + 627.454284668, + 610.324890137, + 593.40612793, + 1146.34912109, + 1145.09570312, + 1141.34753418, + 1135.14245605, + 1126.54284668, + 1115.63220215, + 1102.515625, + 1087.31677246, + 1070.17382812, + 1051.23815918, + 1030.67077637, + 1008.63867188, + 985.821166992, + 964.170288086, + 941.609558105, + 918.278381348, + 894.312011719, + 869.840393066, + 
844.988342285, + 819.872619629, + 794.602966309, + 769.28125, + 744.001464844, + 718.848205566, + 694.713867188, + 677.813598633, + 660.946289062, + 644.148803711, + 627.454284668, + 610.893005371, + 594.491943359, + 578.275817871, + 1075.84216309, + 1074.70495605, + 1071.30383301, + 1065.67199707, + 1057.86291504, + 1047.94946289, + 1036.02380371, + 1022.19244385, + 1006.57757568, + 989.356018066, + 972.740783691, + 954.878356934, + 935.89440918, + 915.916137695, + 895.070007324, + 873.481933594, + 851.274475098, + 828.566345215, + 805.471496582, + 782.097106934, + 758.545227051, + 734.910339355, + 711.280090332, + 690.50970459, + 674.449768066, + 658.364379883, + 642.291992188, + 626.268066406, + 610.324890137, + 594.491943359, + 578.796020508, + 563.261047363, + 1009.67150879, + 1008.63867188, + 1005.54919434, + 1000.43200684, + 993.333496094, + 984.940734863, + 975.335021973, + 964.170288086, + 951.533935547, + 937.521850586, + 922.236572266, + 905.786682129, + 888.28326416, + 869.840393066, + 850.572570801, + 830.592407227, + 810.01184082, + 788.938354492, + 767.476257324, + 745.724487305, + 723.776733398, + 701.721008301, + 684.97052002, + 669.78717041, + 654.522705078, + 639.217041016, + 623.906738281, + 608.625976562, + 593.40612793, + 578.275817871, + 563.261047363, + 548.385559082}, + {0, + 0, + 0, + 0, + 1554.1237793, + 1242.53955078, + 993.424560547, + 821.738647461, + 688.023742676, + 576.067199707, + 482.328430176, + 403.842987061, + 338.128967285, + 283.233520508, + 237.367095947, + 198.928222656, + 166.714080811, + 139.71661377, + 117.091148376, + 100.366226196, + 93.5875701904, + 87.2667160034, + 81.3727798462, + 75.8769226074, + 70.7522583008, + 65.9736862183, + 62.4703788757, + 59.2027587891, + 56.1060714722, + 53.1713485718, + 50.3901405334, + 47.7544021606, + 0, + 0, + 0, + 0, + 1511.89892578, + 1215.3125, + 975.19708252, + 811.432128906, + 680.458190918, + 570.428588867, + 478.074890137, + 400.60269165, + 335.640289307, + 281.318328857, + 
235.876022339, + 197.761489868, + 165.797119141, + 138.993164062, + 116.518371582, + 100.18183136, + 93.4242019653, + 87.1216278076, + 81.2436294556, + 75.76171875, + 70.6492919922, + 65.8815155029, + 62.4058837891, + 59.1439094543, + 56.0522842407, + 53.1221389771, + 50.3450584412, + 47.7130508423, + 0, + 0, + 0, + 0, + 1398.31689453, + 1139.93933105, + 926.469055176, + 781.859680176, + 658.586914062, + 554.038146973, + 465.659057617, + 391.113586426, + 328.333618164, + 275.682922363, + 231.480926514, + 194.317352295, + 163.08682251, + 136.852478027, + 114.822052002, + 99.6321716309, + 92.9370193481, + 86.6887664795, + 80.8581924438, + 75.4178009033, + 70.3418579102, + 65.6360702515, + 62.2131195068, + 58.9679450989, + 55.8914489746, + 52.974937439, + 50.2101821899, + 47.5893363953, + 0, + 0, + 0, + 0, + 1242.53955078, + 1031.72070312, + 865.446105957, + 736.612304688, + 624.660888672, + 528.352294922, + 446.048339844, + 376.032348633, + 316.662597656, + 266.643341064, + 224.406723022, + 188.757797241, + 158.70111084, + 133.381164551, + 112.06615448, + 98.727722168, + 92.1346588135, + 85.9753189087, + 80.2224273682, + 74.8501815796, + 69.8341827393, + 65.2862091064, + 61.8940887451, + 58.6766357422, + 55.625087738, + 52.7310905457, + 49.9866943359, + 47.3842887878, + 1554.1237793, + 1511.89892578, + 1398.31689453, + 1242.53955078, + 1072.70471191, + 913.631103516, + 791.500732422, + 680.458190918, + 581.79699707, + 495.450836182, + 420.656097412, + 356.335296631, + 301.3465271, + 254.681488037, + 214.999893188, + 181.334152222, + 152.824005127, + 128.714950562, + 108.351615906, + 97.4852905273, + 91.031036377, + 84.9928512573, + 79.3460922241, + 74.0670623779, + 69.1331710815, + 64.8018112183, + 61.4521331787, + 58.2728424072, + 55.2557067871, + 52.3927955627, + 49.6765098572, + 47.0995864868, + 1242.53955078, + 1215.3125, + 1139.93933105, + 1031.72070312, + 913.631103516, + 811.432128906, + 711.604919434, + 618.224060059, + 533.334899902, + 457.657806396, + 
391.113586426, + 333.178527832, + 283.233520508, + 240.389190674, + 203.691101074, + 172.362747192, + 145.689254761, + 123.027793884, + 103.808616638, + 95.9272842407, + 89.6447677612, + 83.7569274902, + 78.2422027588, + 73.0794143677, + 68.2481460571, + 64.1881027222, + 60.8917617798, + 57.7605171204, + 54.7867393494, + 51.9630203247, + 49.2822341919, + 46.7375259399, + 993.424560547, + 975.19708252, + 926.469055176, + 865.446105957, + 791.500732422, + 711.604919434, + 631.209533691, + 554.038146973, + 482.328430176, + 417.209503174, + 359.053955078, + 307.756866455, + 263.145568848, + 224.406723022, + 190.955062866, + 162.196685791, + 137.561203003, + 116.518371582, + 100.551231384, + 94.0806045532, + 87.9983291626, + 82.2864456177, + 76.9266967773, + 71.9007492065, + 67.1905670166, + 63.4515571594, + 60.2185935974, + 57.1445236206, + 54.2224197388, + 51.4454956055, + 48.8071632385, + 46.300994873, + 821.738647461, + 811.432128906, + 781.859680176, + 736.612304688, + 680.458190918, + 618.224060059, + 554.038146973, + 491.015136719, + 431.260406494, + 376.032348633, + 325.949676514, + 281.318328857, + 241.920471191, + 207.367233276, + 177.270202637, + 151.198699951, + 128.714981079, + 109.396255493, + 98.1919021606, + 91.9756011963, + 86.1172180176, + 80.602897644, + 75.4178009033, + 70.5465774536, + 65.9736938477, + 62.5996856689, + 59.4391670227, + 56.4305839539, + 53.5677680969, + 50.8446311951, + 48.2551269531, + 45.7933807373, + 688.023742676, + 680.458190918, + 658.586914062, + 624.660888672, + 581.79699707, + 533.334899902, + 482.328430176, + 431.260406494, + 381.958526611, + 335.640289307, + 293.096069336, + 254.681427002, + 220.30645752, + 189.852081299, + 163.08682251, + 139.716659546, + 119.419540405, + 101.869186401, + 95.5869064331, + 89.6447677612, + 84.0289993286, + 78.7296905518, + 73.7354660034, + 69.0339279175, + 64.8706207275, + 61.6408843994, + 58.5607948303, + 55.625087738, + 52.8284225464, + 50.1653556824, + 47.6305236816, + 45.2185592651, + 
576.067199707, + 570.428588867, + 554.038146973, + 528.352294922, + 495.450836182, + 457.657806396, + 417.209503174, + 376.032348633, + 335.640289307, + 297.179992676, + 261.421112061, + 228.614364624, + 198.928222656, + 172.362747192, + 148.803924561, + 128.066162109, + 109.923492432, + 98.727722168, + 92.7755966187, + 87.1216278076, + 81.7623062134, + 76.6913833618, + 71.9007492065, + 67.3809127808, + 63.7176971436, + 60.5841941833, + 57.5914230347, + 54.7350311279, + 52.0104789734, + 49.4130935669, + 46.938117981, + 44.5807800293, + 482.328430176, + 478.074890137, + 465.659057617, + 446.048339844, + 420.656097412, + 391.113586426, + 359.053955078, + 325.949676514, + 293.096069336, + 261.421112061, + 231.480926514, + 203.691101074, + 178.274459839, + 155.306304932, + 134.755508423, + 116.518371582, + 101.297218323, + 95.4174804688, + 89.7970199585, + 84.4398193359, + 79.3460922241, + 74.5129318237, + 69.935256958, + 65.6360702515, + 62.4703788757, + 59.4391593933, + 56.5394515991, + 53.7678108215, + 51.1205253601, + 48.5936508179, + 46.1830673218, + 43.8845863342, + 403.842987061, + 400.60269165, + 391.113586426, + 376.032348633, + 356.335296631, + 333.178527832, + 307.756866455, + 281.318328857, + 254.681427002, + 228.614364624, + 203.691101074, + 180.30632019, + 158.70111084, + 138.99319458, + 121.205970764, + 105.29486084, + 97.4852981567, + 91.9756011963, + 86.6887817383, + 81.6321105957, + 76.8088912964, + 72.2191696167, + 67.8605422974, + 64.1881027222, + 61.1397399902, + 58.2155418396, + 55.4135246277, + 52.7310905457, + 50.1653556824, + 47.7130508423, + 45.3707275391, + 43.1347961426, + 338.128967285, + 335.640289307, + 328.333618164, + 316.662597656, + 301.3465271, + 283.233520508, + 263.145568848, + 241.920471191, + 220.30645752, + 198.928222656, + 178.274459839, + 158.70111084, + 140.445129395, + 123.643127441, + 108.351615906, + 98.9074707031, + 93.5875701904, + 88.4422607422, + 83.4863510132, + 78.7296905518, + 74.1781234741, + 69.8341827393, + 
65.7064590454, + 62.6645126343, + 59.736907959, + 56.9232521057, + 54.2224197388, + 51.6327171326, + 49.1519355774, + 46.7775268555, + 44.5066757202, + 42.336353302, + 283.233520508, + 281.318328857, + 275.682922363, + 266.643341064, + 254.681488037, + 240.389190674, + 224.406723022, + 207.367233276, + 189.852081299, + 172.362747192, + 155.306304932, + 138.99319458, + 123.643127441, + 109.396255493, + 99.632194519, + 94.5781402588, + 89.6447677612, + 84.8540420532, + 80.2224273682, + 75.76171875, + 71.4796905518, + 67.3809127808, + 63.9857673645, + 61.0775909424, + 58.2728424072, + 55.5720672607, + 52.974937439, + 50.4805107117, + 48.0872917175, + 45.7933807373, + 43.5965652466, + 41.4943313599, + 237.367095947, + 235.876022339, + 231.480926514, + 224.406723022, + 214.999893188, + 203.691101074, + 190.955062866, + 177.270202637, + 163.08682251, + 148.803924561, + 134.755508423, + 121.205970764, + 108.351615906, + 99.632194519, + 94.9123535156, + 90.2564239502, + 85.6926956177, + 81.2436294556, + 76.9266967773, + 72.7549057007, + 68.7375793457, + 65.077835083, + 62.2131195068, + 59.4391593933, + 56.7582321167, + 54.1716041565, + 51.6796913147, + 49.2822341919, + 46.9784011841, + 44.7668800354, + 42.6460189819, + 40.6138343811, + 198.928222656, + 197.761489868, + 194.317352295, + 188.757797241, + 181.334152222, + 172.362747192, + 162.196685791, + 151.198699951, + 139.716659546, + 128.066162109, + 116.518371582, + 105.29486084, + 98.9074707031, + 94.5781402588, + 90.2564239502, + 85.9753189087, + 81.7623062134, + 77.6397247314, + 73.6254806519, + 69.7333374023, + 65.9736938477, + 63.1215667725, + 60.4009246826, + 57.7605171204, + 55.2032775879, + 52.7310905457, + 50.3450584412, + 48.0454750061, + 45.832118988, + 43.7042236328, + 41.6606483459, + 39.699886322, + 166.714080811, + 165.797119141, + 163.08682251, + 158.70111084, + 152.824005127, + 145.689254761, + 137.561203003, + 128.714981079, + 119.419540405, + 109.923492432, + 101.297218323, + 97.4852981567, + 
93.5875701904, + 89.6447677612, + 85.6926956177, + 81.7623062134, + 77.8798141479, + 74.0670623779, + 70.3418579102, + 66.7183837891, + 63.7845306396, + 61.1397399902, + 58.5607872009, + 56.0522842407, + 53.6176719666, + 51.2593917847, + 48.9790611267, + 46.7775268555, + 44.6550750732, + 42.6114387512, + 40.6459236145, + 38.7575035095, + 139.71661377, + 138.993164062, + 136.852478027, + 133.381164551, + 128.714950562, + 123.027793884, + 116.518371582, + 109.396255493, + 101.869186401, + 98.727722168, + 95.4174804688, + 91.9756011963, + 88.4422607422, + 84.8540420532, + 81.2436294556, + 77.6397247314, + 74.0670623779, + 70.5465774536, + 67.0957107544, + 64.1881027222, + 61.6408843994, + 59.1439094543, + 56.7033996582, + 54.3243103027, + 52.0104789734, + 49.7648124695, + 47.5893363953, + 45.4853858948, + 43.4536628723, + 41.4943313599, + 39.6071586609, + 37.7914886475, + 117.091148376, + 116.518371582, + 114.822052002, + 112.06615448, + 108.351615906, + 103.808616638, + 100.551231384, + 98.1919021606, + 95.5869064331, + 92.7755966187, + 89.7970199585, + 86.6887817383, + 83.4863510132, + 80.2224273682, + 76.9266967773, + 73.6254806519, + 70.3418579102, + 67.0957107544, + 64.3236160278, + 61.8940887451, + 59.4985046387, + 57.1445236206, + 54.83852005, + 52.5856742859, + 50.3901405334, + 48.2551269531, + 46.1830673218, + 44.1756248474, + 42.2339172363, + 40.3584899902, + 38.5494613647, + 36.8065338135, + 100.366226196, + 100.18183136, + 99.6321716309, + 98.727722168, + 97.4852905273, + 95.9272842407, + 94.0806045532, + 91.9756011963, + 89.6447677612, + 87.1216278076, + 84.4398193359, + 81.6321105957, + 78.7296905518, + 75.76171875, + 72.7549057007, + 69.7333374023, + 66.7183837891, + 64.1881027222, + 61.8940887451, + 59.6175003052, + 57.3672447205, + 55.1509132385, + 52.974937439, + 50.8446311951, + 48.7643356323, + 46.7375259399, + 44.7668800354, + 42.8544273376, + 41.0015525818, + 39.2091522217, + 37.4776496887, + 35.8070869446, + 93.5875701904, + 93.4242019653, + 
92.9370193481, + 92.1346588135, + 91.031036377, + 89.6447677612, + 87.9983291626, + 86.1172180176, + 84.0289993286, + 81.7623062134, + 79.3460922241, + 76.8088912964, + 74.1781234741, + 71.4796905518, + 68.7375793457, + 65.9736938477, + 63.7845306396, + 61.6408843994, + 59.4985046387, + 57.3672447205, + 55.2557067871, + 53.1713485718, + 51.1205253601, + 49.1086196899, + 47.1400909424, + 45.2185668945, + 43.3469619751, + 41.5275192261, + 39.7618980408, + 38.0512619019, + 36.396320343, + 34.7973823547, + 87.2667160034, + 87.1216278076, + 86.6887664795, + 85.9753189087, + 84.9928512573, + 83.7569274902, + 82.2864456177, + 80.602897644, + 78.7296905518, + 76.6913833618, + 74.5129318237, + 72.2191696167, + 69.8341827393, + 67.3809127808, + 65.077835083, + 63.1215667725, + 61.1397399902, + 59.1439094543, + 57.1445236206, + 55.1509132385, + 53.1713485718, + 51.2130279541, + 49.2822341919, + 47.3842887878, + 45.5237045288, + 43.7042160034, + 41.9289016724, + 40.2001571655, + 38.5198707581, + 36.8894119263, + 35.3097419739, + 33.7813949585, + 81.3727798462, + 81.2436294556, + 80.8581924438, + 80.2224273682, + 79.3460922241, + 78.2422027588, + 76.9266967773, + 75.4178009033, + 73.7354660034, + 71.9007492065, + 69.935256958, + 67.8605422974, + 65.7064590454, + 63.9857673645, + 62.2131195068, + 60.4009246826, + 58.5607872009, + 56.7033996582, + 54.83852005, + 52.974937439, + 51.1205253601, + 49.2822341919, + 47.4661407471, + 45.6774940491, + 43.9208030701, + 42.199848175, + 40.5177879333, + 38.877155304, + 37.2799949646, + 35.7278671265, + 34.2218933105, + 32.9320831299, + 75.8769226074, + 75.76171875, + 75.4178009033, + 74.8501815796, + 74.0670623779, + 73.0794143677, + 71.9007492065, + 70.5465774536, + 69.0339279175, + 67.3809127808, + 65.6360702515, + 64.1881027222, + 62.6645126343, + 61.0775909424, + 59.4391593933, + 57.7605171204, + 56.0522842407, + 54.3243103027, + 52.5856742859, + 50.8446311951, + 49.1086196899, + 47.3842887878, + 45.6774940491, + 43.9933776855, + 
42.336353302, + 40.7102241516, + 39.1181678772, + 37.5628471375, + 36.0463790894, + 34.5704689026, + 33.2245254517, + 32.1316642761, + 70.7522583008, + 70.6492919922, + 70.3418579102, + 69.8341827393, + 69.1331710815, + 68.2481460571, + 67.1905670166, + 65.9736938477, + 64.8706207275, + 63.7176971436, + 62.4703788757, + 61.1397399902, + 59.736907959, + 58.2728424072, + 56.7582321167, + 55.2032775879, + 53.6176719666, + 52.0104789734, + 50.3901405334, + 48.7643356323, + 47.1400909424, + 45.5237045288, + 43.9208030701, + 42.336353302, + 40.7746887207, + 39.2395439148, + 37.7341346741, + 36.2611160278, + 34.8227424622, + 33.4466362, + 32.3772735596, + 31.328754425, + 65.9736862183, + 65.8815155029, + 65.6360702515, + 65.2862091064, + 64.8018112183, + 64.1881027222, + 63.4515571594, + 62.5996856689, + 61.6408843994, + 60.5841941833, + 59.4391593933, + 58.2155418396, + 56.9232521057, + 55.5720672607, + 54.1716041565, + 52.7310905457, + 51.2593917847, + 49.7648124695, + 48.2551269531, + 46.7375259399, + 45.2185668945, + 43.7042160034, + 42.199848175, + 40.7102241516, + 39.2395439148, + 37.7914886475, + 36.3692169189, + 34.9754295349, + 33.6123847961, + 32.5545730591, + 31.5306549072, + 30.5255126953, + 62.4703788757, + 62.4058837891, + 62.2131195068, + 61.8940887451, + 61.4521331787, + 60.8917617798, + 60.2185935974, + 59.4391670227, + 58.5607948303, + 57.5914230347, + 56.5394515991, + 55.4135246277, + 54.2224197388, + 52.974937439, + 51.6796913147, + 50.3450584412, + 48.9790611267, + 47.5893363953, + 46.1830673218, + 44.7668800354, + 43.3469619751, + 41.9289016724, + 40.5177879333, + 39.1181678772, + 37.7341346741, + 36.3692169189, + 35.0265541077, + 33.7088127136, + 32.6617202759, + 31.6664142609, + 30.6867351532, + 29.723903656, + 59.2027587891, + 59.1439094543, + 58.9679450989, + 58.6766357422, + 58.2728424072, + 57.7605171204, + 57.1445236206, + 56.4305839539, + 55.625087738, + 54.7350311279, + 53.7678108215, + 52.7310905457, + 51.6327171326, + 50.4805107117, + 
49.2822341919, + 48.0454750061, + 46.7775268555, + 45.4853858948, + 44.1756248474, + 42.8544273376, + 41.5275192261, + 40.2001571655, + 38.877155304, + 37.5628471375, + 36.2611160278, + 34.9754295349, + 33.7088127136, + 32.6975517273, + 31.7346553802, + 30.7841281891, + 29.8474140167, + 28.9257545471, + 56.1060714722, + 56.0522842407, + 55.8914489746, + 55.625087738, + 55.2557067871, + 54.7867393494, + 54.2224197388, + 53.5677680969, + 52.8284225464, + 52.0104789734, + 51.1205253601, + 50.1653556824, + 49.1519355774, + 48.0872917175, + 46.9784011841, + 45.832118988, + 44.6550750732, + 43.4536628723, + 42.2339172363, + 41.0015525818, + 39.7618980408, + 38.5198707581, + 37.2799949646, + 36.0463790894, + 34.8227424622, + 33.6123847961, + 32.6617202759, + 31.7346553802, + 30.8167037964, + 29.9094753265, + 29.0143985748, + 28.1327323914, + 53.1713485718, + 53.1221389771, + 52.974937439, + 52.7310905457, + 52.3927955627, + 51.9630203247, + 51.4454956055, + 50.8446311951, + 50.1653556824, + 49.4130935669, + 48.5936508179, + 47.7130508423, + 46.7775268555, + 45.7933807373, + 44.7668800354, + 43.7042236328, + 42.6114387512, + 41.4943313599, + 40.3584899902, + 39.2091522217, + 38.0512619019, + 36.8894119263, + 35.7278671265, + 34.5704689026, + 33.4466362, + 32.5545730591, + 31.6664142609, + 30.7841281891, + 29.9094753265, + 29.0440425873, + 28.1892433167, + 27.3463401794, + 50.3901405334, + 50.3450584412, + 50.2101821899, + 49.9866943359, + 49.6765098572, + 49.2822341919, + 48.8071632385, + 48.2551269531, + 47.6305236816, + 46.938117981, + 46.1830673218, + 45.3707275391, + 44.5066757202, + 43.5965652466, + 42.6460189819, + 41.6606483459, + 40.6459236145, + 39.6071586609, + 38.5494613647, + 37.4776496887, + 36.396320343, + 35.3097419739, + 34.2218933105, + 33.2245254517, + 32.3772735596, + 31.5306549072, + 30.6867351532, + 29.8474140167, + 29.0143985748, + 28.1892433167, + 27.3733463287, + 26.5679397583, + 47.7544021606, + 47.7130508423, + 47.5893363953, + 47.3842887878, + 
47.0995864868, + 46.7375259399, + 46.300994873, + 45.7933807373, + 45.2185592651, + 44.5807800293, + 43.8845863342, + 43.1347961426, + 42.336353302, + 41.4943313599, + 40.6138343811, + 39.699886322, + 38.7575035095, + 37.7914886475, + 36.8065338135, + 35.8070869446, + 34.7973823547, + 33.7813949585, + 32.9320831299, + 32.1316642761, + 31.328754425, + 30.5255126953, + 29.723903656, + 28.9257545471, + 28.1327323914, + 27.3463401794, + 26.5679397583, + 25.7987575531}}; + +ap_uint<24> inv_matrix_8_fix[3][64] = { + {0, 573440, 571914, 500934, 438764, 384309, 336613, 294836, 573440, 573440, 554300, 490277, 431664, + 379299, 332941, 292072, 571914, 554300, 512454, 462308, 412154, 365186, 322442, 284093, 500934, 490277, + 462308, 424880, 384309, 344238, 306460, 271733, 438764, 431664, 412154, 384309, 352272, 319103, 286702, + 256122, 384309, 379299, 365186, 344238, 319103, 292072, 264820, 238433, 336613, 332941, 322442, 306460, + 286702, 264820, 242160, 219708, 294836, 292072, 284093, 271733, 256122, 238433, 219708, 200777}, + {0, 3225600, 3214600, 2712197, 2288313, 1930677, 1628936, 1374353, 3225600, 3225600, 3088189, + 2638422, 2240937, 1898460, 1606185, 1357854, 3214600, 3088189, 2792443, 2446967, 2111879, 1808349, + 1541522, 1310462, 2712197, 2638422, 2446967, 2195873, 1930677, 1676412, 1444223, 1237800, 2288313, + 2240937, 2111879, 1930677, 1726752, 1521076, 1325921, 1086047, 1930677, 1898460, 1808349, 1676412, + 1521076, 1357854, 1197559, 804826, 1628936, 1606185, 1541522, 1444223, 1325921, 1197559, 858830, + 571430, 1374353, 1357854, 1310462, 1237800, 1086047, 804826, 571430, 391838}, + {0, 301014, 173537, 122278, 87381, 87381, 85556, 60284, 301014, 239204, 159771, 115525, 87381, + 87381, 83112, 58803, 173537, 159771, 129848, 98919, 87381, 87381, 76366, 54653, 122278, 115525, + 98919, 87381, 87381, 87381, 66768, 48594, 87381, 87381, 87381, 87381, 87381, 74294, 55990, + 40365, 87381, 87381, 87381, 87381, 74294, 58803, 45395, 29913, 85556, 83112, 76366, 66768, + 
55990, 45395, 31920, 21238, 60284, 58803, 54653, 48594, 40365, 29913, 21238, 14563}}; + +ap_uint<24> inv_matrix_16_fix[3][256] = { + {0, 0, 2441638, 2110453, 1805935, 1527539, 1292059, 1092880, 979637, 882036, 794159, 720192, + 660388, 605551, 555267, 513377, 0, 0, 2359049, 2061117, 1769095, 1502424, 1274280, 1079954, + 973257, 876924, 790013, 717366, 658011, 603538, 553552, 512061, 2441638, 2359049, 2163900, 1929223, + 1668681, 1432145, 1223726, 1056519, 954648, 861946, 777824, 709020, 650980, 597575, 548467, 508152, + 2110453, 2061117, 1929223, 1734043, 1527539, 1329150, 1147596, 1019948, 925274, 838103, 758296, 695541, + 639580, 587878, 541090, 501757, 1805935, 1769095, 1668681, 1527539, 1368461, 1207734, 1064198, 973257, + 887210, 806857, 734675, 677512, 624254, 574785, 531076, 493051, 1527539, 1502424, 1432145, 1329150, + 1207734, 1079954, 999342, 919628, 842764, 769898, 709020, 655651, 605551, 558724, 518704, 482257, + 1292059, 1274280, 1223726, 1147596, 1064198, 999342, 930994, 861946, 794159, 731730, 680030, 630735, + 584081, 541090, 504297, 469635, 1092880, 1079954, 1056519, 1019948, 973257, 919628, 861946, 802580, + 743657, 695541, 648668, 603538, 560468, 522767, 488199, 455467, 979637, 973257, 954648, 925274, + 887210, 842764, 794159, 743657, 700870, 658011, 615808, 574785, 536755, 503024, 470758, 440042, + 882036, 876924, 861946, 838103, 806857, 769898, 731730, 695541, 658011, 620003, 582200, 545493, + 513377, 482257, 452306, 423640, 794159, 790013, 777824, 758296, 734675, 709020, 680030, 648668, + 615808, 582200, 548467, 518704, 489404, 460825, 433156, 404651, 720192, 717366, 709020, 695541, + 677512, 655651, 630735, 603538, 574785, 545493, 518704, 491830, 465193, 439046, 412893, 384233, + 660388, 658011, 650980, 639580, 624254, 605551, 584081, 560468, 536755, 513377, 489404, 465193, + 441041, 417116, 389895, 363695, 605551, 603538, 597575, 587878, 574785, 558724, 541090, 522767, + 503024, 482257, 460825, 439046, 417116, 391813, 367140, 343268, 555267, 
553552, 548467, 541090, + 531076, 518704, 504297, 488199, 470758, 452306, 433156, 412893, 389895, 367140, 344835, 323147, + 513377, 512061, 508152, 501757, 493051, 482257, 469635, 455467, 440042, 423640, 404651, 384233, + 363695, 343268, 323147, 303491}, + {0, 0, 5751209, 4544049, 3799576, 3391573, 3027382, 2702299, 2436075, 2197740, 1982723, 1763518, + 1534572, 1335348, 1161989, 974727, 0, 0, 5440084, 4373603, 3746811, 3353638, 2999054, 2680571, + 2420510, 2185243, 1972567, 1752429, 1525721, 1328236, 1156239, 968844, 5751209, 5440084, 4731487, 3973698, + 3601163, 3246381, 2917801, 2623494, 2375098, 2148620, 1942705, 1719843, 1499653, 1307247, 1139247, 951484, + 4544049, 4373603, 3973698, 3696279, 3391573, 3086079, 2793372, 2534376, 2303382, 2090297, 1894846, 1667696, + 1457751, 1273387, 1103434, 923481, 3799576, 3746811, 3601163, 3391573, 3147719, 2891875, 2642200, 2420510, + 2210386, 2013818, 1820750, 1598900, 1402122, 1228203, 1055862, 886125, 3391573, 3353638, 3246381, 3086079, + 2891875, 2680571, 2484135, 2289592, 2101702, 1923283, 1719843, 1516952, 1335348, 1173618, 998753, 841041, + 3027382, 2999054, 2917801, 2793372, 2642200, 2484135, 2317350, 2148620, 1982723, 1809059, 1608443, 1425546, + 1260221, 1103434, 934545, 790037, 2702299, 2680571, 2623494, 2534376, 2420510, 2289592, 2148620, 2003346, + 1856592, 1667696, 1491121, 1328236, 1179499, 1017307, 865695, 734963, 2436075, 2420510, 2375098, 2303382, + 2210386, 2101702, 1982723, 1856592, 1688240, 1525721, 1371816, 1228203, 1082691, 928991, 794499, 677590, + 2197740, 2185243, 2148620, 2090297, 2013818, 1923283, 1809059, 1667696, 1525721, 1386839, 1253715, 1124732, + 974727, 841041, 722986, 619515, 1982723, 1972567, 1942705, 1894846, 1820750, 1719843, 1608443, 1491121, + 1371816, 1253715, 1139247, 998752, 870742, 755519, 652842, 544631, 1763518, 1752429, 1719843, 1667696, + 1598900, 1516952, 1425546, 1328236, 1228203, 1124732, 998752, 880957, 772520, 673980, 578732, 465986, + 1534572, 1525721, 1499653, 
1457751, 1402122, 1335348, 1260221, 1179499, 1082691, 974727, 870742, 772520, + 681226, 596747, 486978, 394922, 1335348, 1328236, 1307247, 1273387, 1228203, 1173618, 1103434, 1017307, + 928991, 841041, 755519, 673980, 596747, 494226, 406298, 331816, 1161989, 1156239, 1139247, 1103434, + 1055862, 998753, 934545, 865695, 794499, 722986, 652842, 578732, 486978, 406298, 336400, 276619, + 974727, 968844, 951484, 923481, 886125, 841041, 790037, 734963, 677590, 619515, 544631, 465986, + 394922, 331816, 276619, 228975}, + {0, 0, 630388, 459728, 346040, 270138, 210885, 164629, 145236, 129332, 115170, 103181, 93307, 84378, + 76303, 60309, 0, 0, 585115, 436769, 335651, 263595, 206607, 161757, 144191, 128504, 114506, 102711, + 92918, 84053, 76030, 59794, 630388, 585115, 485316, 381545, 307860, 245565, 194600, 157883, 141148, 126081, + 112555, 101326, 91767, 83090, 75220, 58282, 459728, 436769, 381545, 325861, 270138, 219900, 176964, 151854, + 136356, 122232, 109437, 99095, 89905, 81527, 71893, 55867, 346040, 335651, 307860, 270138, 229587, 190851, + 159151, 144191, 130171, 117207, 105593, 96120, 87409, 79424, 67542, 52693, 270138, 263595, 245565, 219900, + 190851, 161757, 148468, 135437, 122984, 111288, 101326, 92531, 84378, 76855, 62426, 48936, 210885, 206607, + 194600, 176964, 159151, 148468, 137288, 126081, 115170, 105102, 96535, 88463, 80917, 71893, 56818, 44786, + 164629, 161757, 157883, 151854, 144191, 135437, 126081, 116520, 107092, 99095, 91389, 84053, 77133, 64075, + 50981, 40428, 145236, 144191, 141148, 136356, 130171, 122984, 115170, 107092, 99976, 92918, 86039, 79424, + 69986, 56340, 45145, 36032, 129332, 128504, 126081, 122232, 117207, 111288, 105102, 99095, 92918, 86719, + 80614, 73866, 60309, 48936, 39498, 31736, 115170, 114506, 112555, 109437, 105593, 101326, 96535, 91389, + 86039, 80614, 75220, 62426, 51402, 42040, 34182, 25375, 103181, 102711, 101326, 99095, 96120, 92531, + 88463, 84053, 79424, 73866, 62426, 52258, 43386, 35760, 28391, 19017, 93307, 92918, 
91767, 89905, + 87409, 84378, 80917, 77133, 69986, 60309, 51402, 43386, 36306, 30047, 20632, 14004, 84378, 84053, + 83090, 81527, 79424, 76855, 71893, 64075, 56340, 48936, 42040, 35760, 30047, 21203, 14759, 10149, + 76303, 76030, 75220, 71893, 67542, 62426, 56818, 50981, 45145, 39498, 34182, 28391, 20632, 14759, + 10410, 7249, 60309, 59794, 58282, 55867, 52693, 48936, 44786, 40428, 36032, 31736, 25375, 19017, + 14004, 10149, 7249, 5111}}; + +ap_uint<24> inv_matrix_32_fix[3][1024] = { + {0, 0, 0, 0, 5131959, 4670491, 4250519, 3878761, 3542927, 3236170, 2955973, 2700037, + 2466260, 2274087, 2118957, 1974409, 1839722, 1714223, 1597285, 1490256, 1397152, 1309866, 1228032, 1151311, + 1079383, 1011949, 954704, 901007, 850330, 802503, 757366, 714767, 0, 0, 0, 0, + 5072775, 4627118, 4217502, 3853877, 3523007, 3219979, 2942652, 2688968, 2456987, 2267924, 2113623, 1969770, + 1835669, 1710667, 1594156, 1487730, 1394902, 1307856, 1226234, 1149698, 1077934, 1010645, 953642, 900042, + 849451, 801702, 756636, 714101, 0, 0, 0, 0, 4908661, 4504031, 4123432, 3781605, + 3464803, 3172463, 2903431, 2656298, 2429565, 2249644, 2097784, 1955978, 1823608, 1700081, 1584833, 1480198, + 1388190, 1301861, 1220866, 1144883, 1073606, 1006935, 950469, 897157, 846825, 799309, 754452, 712106, + 0, 0, 0, 0, 4670491, 4318764, 3982627, 3668379, 3372610, 3096591, 2840422, 2603565, + 2385137, 2219847, 2071905, 1933401, 1803834, 1682702, 1569509, 1467797, 1377130, 1291973, 1212008, 1136931, + 1066456, 1001153, 945218, 892382, 842477, 795345, 750834, 708800, 5131959, 5072775, 4908661, 4670491, + 4390194, 4094197, 3805312, 3523007, 3252544, 2996713, 2756785, 2533105, 2331170, 2179470, 2036722, 1902624, + 1776815, 1658906, 1548492, 1450748, 1361905, 1278347, 1199788, 1125953, 1056576, 993151, 937947, 885767, + 836451, 789848, 745815, 704212, 4670491, 4627118, 4504031, 4318764, 4094197, 3853877, 3604336, 3354843, + 3111448, 2877883, 2656298, 2447781, 2274087, 2129705, 1993181, 1864403, 1743163, 1629194, 
1522192, 1429345, + 1342761, 1261188, 1184381, 1112095, 1044091, 983020, 928735, 877378, 828804, 782869, 739438, 698619, + 4250519, 4217502, 4123432, 3982627, 3805312, 3604336, 3390593, 3172463, 2955973, 2745243, 2542942, 2350878, + 2208155, 2071905, 1942371, 1819621, 1703597, 1594156, 1492790, 1403941, 1319993, 1240747, 1165998, 1095537, + 1029156, 970871, 917676, 867300, 819609, 774471, 731761, 692068, 3878761, 3853877, 3781605, 3668379, + 3523007, 3354843, 3172463, 2983002, 2792006, 2603565, 2420554, 2267924, 2135121, 2007491, 1885454, 1769234, + 1658906, 1554446, 1460446, 1374937, 1293940, 1217309, 1144883, 1076488, 1011949, 956834, 904884, 855630, + 808951, 764729, 722846, 684452, 3542927, 3523007, 3464803, 3372610, 3252544, 3111448, 2955973, 2792006, + 2624408, 2456987, 2305432, 2179470, 2056680, 1937876, 1823608, 1714223, 1609913, 1510754, 1424666, 1342761, + 1264967, 1191187, 1121302, 1055177, 994288, 941052, 890484, 842477, 796927, 753726, 712770, 675828, + 3236170, 3219979, 3172463, 3096591, 2996713, 2877883, 2745243, 2603565, 2456987, 2318226, 2202356, 2087355, + 1974409, 1864403, 1757973, 1655557, 1557438, 1467797, 1385966, 1307856, 1233454, 1162707, 1095537, 1031846, + 975260, 923681, 874611, 827961, 783640, 741554, 701628, 666259, 2955973, 2942652, 2903431, 2840422, + 2756785, 2656298, 2542942, 2420554, 2305432, 2202356, 2097784, 1993181, 1889719, 1788302, 1689615, 1594156, + 1503004, 1422336, 1344864, 1270671, 1199788, 1132204, 1067880, 1006935, 954704, 904884, 857409, 812207, + 769201, 728312, 690299, 655815, 2700037, 2688968, 2656298, 2603565, 2533105, 2447781, 2350878, 2267924, + 2179470, 2087355, 1993181, 1898303, 1803834, 1710668, 1619502, 1530871, 1450748, 1374937, 1301861, 1231642, + 1164351, 1100012, 1038620, 983020, 932811, 884828, 839025, 795345, 753726, 714101, 678111, 644568, + 2466260, 2456987, 2429565, 2385137, 2331170, 2274087, 2208155, 2135121, 2056680, 1974409, 1889719, 1803834, + 1717792, 1632448, 1548492, 1470262, 1397152, 1326135, 
1257429, 1191187, 1127510, 1066456, 1008099, 957902, + 909769, 863682, 819609, 777508, 737332, 699219, 665148, 632591, 2274087, 2267924, 2249644, 2219847, + 2179470, 2129705, 2071905, 2007491, 1937876, 1864403, 1788302, 1710668, 1632448, 1554446, 1480198, 1410789, + 1342761, 1276421, 1212008, 1149698, 1089617, 1031846, 979682, 931789, 885767, 841612, 799309, 758829, + 720138, 684452, 651494, 619962, 2118957, 2113623, 2097784, 2071905, 2036722, 1993181, 1942371, 1885454, + 1823608, 1757973, 1689615, 1619502, 1548492, 1480198, 1415388, 1351210, 1288054, 1226234, 1165998, 1107538, + 1050997, 997710, 950469, 904884, 860984, 818781, 778271, 739438, 702260, 669051, 637236, 606756, + 1974409, 1969770, 1955978, 1933401, 1902624, 1864403, 1819621, 1769234, 1714223, 1655557, 1594156, 1530871, + 1470262, 1410789, 1351210, 1291973, 1233454, 1175965, 1119759, 1065035, 1011949, 965432, 920670, 877378, + 835596, 795345, 756636, 719463, 685033, 653109, 622456, 593050, 1839722, 1835669, 1823608, 1803834, + 1776815, 1743163, 1703597, 1658906, 1609913, 1557438, 1503004, 1450748, 1397152, 1342761, 1288054, 1233454, + 1179319, 1125953, 1073606, 1022482, 976362, 932811, 890484, 849451, 809763, 771453, 734538, 699219, + 667374, 636717, 607238, 578919, 1714223, 1710667, 1700081, 1682702, 1658906, 1629194, 1594156, 1554446, + 1510754, 1467797, 1422336, 1374937, 1326135, 1276421, 1226234, 1175965, 1125953, 1076488, 1027816, 983020, + 941052, 900042, 860088, 821268, 783640, 747243, 712106, 679831, 649351, 619962, 591660, 564435, + 1597285, 1594156, 1584833, 1569509, 1548492, 1522192, 1492790, 1460446, 1424666, 1385966, 1344864, 1301861, + 1257429, 1212008, 1165998, 1119759, 1073606, 1027816, 985257, 945218, 905857, 867300, 829648, 792982, + 757366, 722846, 690299, 660181, 631055, 602927, 575800, 549669, 1490256, 1487730, 1480198, 1467797, + 1450748, 1429345, 1403941, 1374937, 1342761, 1307856, 1270671, 1231642, 1191187, 1149698, 1107538, 1065035, + 1022482, 983020, 945218, 907810, 870943, 
834742, 799309, 764729, 731069, 698619, 669051, 640362, + 612571, 585692, 559730, 534687, 1397152, 1394902, 1388190, 1377130, 1361905, 1342761, 1319993, 1293940, + 1264967, 1233454, 1199788, 1164351, 1127510, 1089617, 1050997, 1011949, 976362, 941052, 905857, 870943, + 836451, 802503, 769201, 736632, 704865, 675828, 647750, 620459, 593980, 568330, 543519, 519552, + 1309866, 1307856, 1301861, 1291973, 1278347, 1261188, 1240747, 1217309, 1191187, 1162707, 1132204, 1100012, + 1066456, 1031846, 997710, 965432, 932811, 900042, 867300, 834742, 802503, 770701, 739438, 708800, + 680406, 653109, 626480, 600553, 575356, 550911, 527232, 504325, 1228032, 1226234, 1220866, 1212008, + 1199788, 1184381, 1165998, 1144883, 1121302, 1095537, 1067880, 1038620, 1008099, 979682, 950469, 920670, + 890484, 860088, 829648, 799309, 769201, 739438, 710120, 682713, 656358, 630543, 605316, 580713, + 556767, 533499, 510927, 492344, 1151311, 1149698, 1144883, 1136931, 1125953, 1112095, 1095537, 1076488, + 1055177, 1031846, 1006935, 983020, 957902, 931789, 904884, 877378, 849451, 821268, 792982, 764729, + 736632, 708800, 682713, 657447, 632591, 608202, 584327, 561007, 538274, 516151, 496368, 481317, + 1079383, 1077934, 1073606, 1066456, 1056576, 1044091, 1029156, 1011949, 994288, 975260, 954704, 932811, + 909769, 885767, 860984, 835596, 809763, 783640, 757366, 731069, 704865, 680406, 656358, 632591, + 609169, 586147, 563576, 541493, 519932, 499422, 484703, 470233, 1011949, 1010645, 1006935, 1001153, + 993151, 983020, 970871, 956834, 941052, 923681, 904884, 884828, 863682, 841612, 818781, 795345, + 771453, 747243, 722846, 698619, 675828, 653109, 630543, 608202, 586147, 564435, 543113, 522221, + 501793, 487146, 473022, 459122, 954704, 953642, 950469, 945218, 937947, 928735, 917676, 904884, + 890484, 874611, 857409, 839025, 819609, 799309, 778271, 756636, 734538, 712106, 690299, 669051, + 647750, 626480, 605316, 584327, 563576, 543113, 522987, 503238, 488622, 474897, 461354, 448010, + 901007, 
900042, 897157, 892382, 885767, 877378, 867300, 855630, 842477, 827961, 812207, 795345, + 777508, 758829, 739438, 719463, 699219, 679831, 660181, 640362, 620459, 600553, 580713, 561007, + 541493, 522221, 503238, 489115, 475839, 462701, 449723, 436922, 850330, 849451, 846825, 842477, + 836451, 828804, 819609, 808951, 796927, 783640, 769201, 753726, 737332, 720138, 702260, 685033, + 667374, 649351, 631055, 612571, 593980, 575356, 556767, 538274, 519932, 501793, 488622, 475839, + 463152, 450584, 438155, 425882, 802503, 801702, 799309, 795345, 789848, 782869, 774471, 764729, + 753726, 741554, 728312, 714101, 699219, 684452, 669051, 653109, 636717, 619962, 602927, 585692, + 568330, 550911, 533499, 516151, 499422, 487146, 474897, 462701, 450584, 438567, 426670, 414910, + 757366, 756636, 754452, 750834, 745815, 739438, 731761, 722846, 712770, 701628, 690299, 678111, + 665148, 651494, 637236, 622456, 607238, 591660, 575800, 559730, 543519, 527232, 510927, 496368, + 484703, 473022, 461354, 449723, 438155, 426670, 415287, 404024, 714767, 714101, 712106, 708800, + 704212, 698619, 692068, 684452, 675828, 666259, 655815, 644568, 632591, 619962, 606756, 593050, + 578919, 564435, 549669, 534687, 519552, 504325, 492344, 481317, 470233, 459122, 448010, 436922, + 425882, 414910, 404024, 393243}, + {0, 0, 0, 0, 10256567, 9163795, 8187452, 7334503, 6576615, 5897041, 5287688, 4741301, + 4251374, 3823808, 3450992, 3114525, 2810863, 2536807, 2289472, 2087679, 1978480, 1874992, 1776918, 1683973, + 1595891, 1512415, 1420110, 1332765, 1250792, 1173861, 1101662, 1033903, 0, 0, 0, 0, + 10115302, 9062138, 8111468, 7277870, 6532108, 5861525, 5259001, 4717901, 4232130, 3808773, 3438387, 3103905, + 2801875, 2529171, 2282962, 2084732, 1975826, 1872597, 1774751, 1682009, 1594106, 1510791, 1418378, 1331200, + 1249376, 1172578, 1100497, 1032846, 0, 0, 0, 0, 9725277, 8774666, 7895240, 7113812, + 6402356, 5757510, 5174699, 4648953, 4175309, 3764280, 3401038, 3072400, 2775187, 2506479, 2263603, 2075939, 
+ 1967905, 1865445, 1768278, 1676139, 1588774, 1505526, 1413203, 1326521, 1245141, 1168739, 1097015, 1029682, + 0, 0, 0, 0, 9163795, 8344872, 7571685, 6858080, 6197748, 5592084, 5039753, 4538028, + 4083526, 3692107, 3340291, 3021045, 2731603, 2469362, 2231895, 2061445, 1954839, 1853638, 1757587, 1666439, + 1579957, 1496049, 1404644, 1318782, 1238133, 1162385, 1091248, 1024442, 10256567, 10115302, 9725277, 9163795, + 8510160, 7827871, 7167557, 6532108, 5932993, 5375578, 4861571, 4390531, 3963957, 3595004, 3258255, 2951472, + 2672398, 2418824, 2188634, 2041485, 1936825, 1837343, 1742818, 1653028, 1567758, 1482942, 1392801, 1308065, + 1228424, 1153579, 1083251, 1017173, 9163795, 9062138, 8774666, 8344872, 7827871, 7277870, 6714142, 6158448, + 5624412, 5119910, 4648953, 4213040, 3823808, 3476438, 3157611, 2865780, 2599225, 2356177, 2134869, 2016372, + 1914124, 1816782, 1724159, 1636067, 1552316, 1466362, 1377806, 1294488, 1216114, 1142407, 1073100, 1008579, + 8187452, 8111468, 7895240, 7571685, 7167557, 6714142, 6237570, 5757510, 5287688, 4837070, 4411073, 4012705, + 3663905, 3340291, 3041417, 2766381, 2514008, 2282962, 2090635, 1986483, 1887057, 1792226, 1701844, 1615756, + 1533801, 1446502, 1359827, 1278192, 1201328, 1128976, 1060888, 998743, 7334503, 7277870, 7113812, 6858080, + 6532108, 6158448, 5757510, 5345971, 4936475, 4538028, 4156665, 3808773, 3489280, 3190579, 2912880, 2655858, + 2418824, 2200862, 2052844, 1952246, 1855988, 1763987, 1676139, 1592325, 1512415, 1423584, 1339055, 1259345, + 1184209, 1113412, 1046725, 987310, 6576615, 6532108, 6402356, 6197748, 5932993, 5624412, 5287688, 4936475, + 4581817, 4232130, 3900570, 3595003, 3304714, 3031203, 2775187, 2536808, 2315800, 2111617, 2010873, 1914124, + 1821314, 1732406, 1647339, 1566029, 1484803, 1397857, 1315706, 1238133, 1164921, 1095858, 1030735, 974370, + 5897041, 5861525, 5757510, 5592084, 5375578, 5119910, 4837070, 4538028, 4232130, 3932039, 3649944, 3376518, + 3114525, 2865780, 2631349, 2411736, 
2207015, 2061445, 1965279, 1872597, 1783449, 1697843, 1615756, 1537139, + 1453673, 1369587, 1290011, 1214758, 1143640, 1076467, 1013100, 960022, 5287688, 5259001, 5174699, 5039753, + 4861571, 4648953, 4411073, 4156665, 3900570, 3649944, 3401038, 3157611, 2922451, 2697519, 2484105, 2282962, + 2102541, 2008134, 1916621, 1828151, 1742818, 1660667, 1581713, 1505526, 1420110, 1339055, 1262216, 1189436, + 1120554, 1055406, 996086, 944370, 4741301, 4717901, 4648953, 4538028, 4390531, 4213040, 4012705, 3808773, + 3595003, 3376518, 3157611, 2941747, 2731603, 2529172, 2335853, 2152565, 2041485, 1952246, 1865445, 1781267, + 1699841, 1621251, 1545538, 1466362, 1384439, 1306546, 1232570, 1162385, 1095858, 1032846, 977795, 927525, + 4251374, 4232130, 4175309, 4083526, 3963957, 3823808, 3663905, 3489280, 3304714, 3114525, 2922451, 2731603, + 2544480, 2363013, 2188634, 2064328, 1978480, 1894367, 1812271, 1732406, 1654932, 1579957, 1507434, 1425326, + 1346984, 1272346, 1201328, 1133832, 1069748, 1009480, 958355, 909602, 3823808, 3808773, 3764280, 3692107, + 3595004, 3476438, 3340291, 3190579, 3031203, 2865780, 2697519, 2529172, 2363013, 2200862, 2075939, 1994549, + 1914124, 1835037, 1757587, 1682009, 1608481, 1537139, 1460902, 1382776, 1308065, 1236738, 1168739, 1103997, + 1042424, 987310, 937898, 890716, 3450992, 3438387, 3401038, 3340291, 3258255, 3157611, 3041417, 2912880, + 2775187, 2631349, 2484105, 2335853, 2188634, 2075939, 1999962, 1924150, 1848955, 1774751, 1701844, 1630482, + 1560861, 1490409, 1413203, 1339055, 1267989, 1199997, 1135051, 1073100, 1014081, 964208, 916551, 870986, + 3114525, 3103905, 3072400, 3021045, 2951472, 2865780, 2766381, 2655858, 2536808, 2411736, 2282962, 2152565, + 2064328, 1994549, 1924150, 1853638, 1783449, 1713950, 1645451, 1578204, 1512415, 1437617, 1364693, 1294488, + 1227047, 1162385, 1100497, 1041353, 988182, 940317, 894445, 850526, 2810863, 2801875, 2775187, 2731603, + 2672398, 2599225, 2514008, 2418824, 2315800, 2207015, 2102541, 2041485, 
1978480, 1914124, 1848955, 1783449, + 1718021, 1653028, 1588774, 1525514, 1455475, 1384439, 1315706, 1249376, 1185512, 1124152, 1065304, 1009480, + 961693, 915775, 871705, 829452, 2536807, 2529171, 2506479, 2469362, 2418824, 2356177, 2282962, 2200862, + 2111617, 2061445, 2008134, 1952246, 1894367, 1835037, 1774751, 1713950, 1653028, 1592325, 1532138, 1466362, + 1397857, 1331200, 1266542, 1203995, 1143640, 1085527, 1029682, 980376, 934687, 890716, 848451, 807872, + 2289472, 2282962, 2263603, 2231895, 2188634, 2134869, 2090635, 2052844, 2010873, 1965279, 1916621, 1865445, + 1812271, 1757587, 1701844, 1645451, 1588774, 1532138, 1470021, 1404644, 1340635, 1278192, 1217473, 1158599, + 1101662, 1046725, 996086, 950912, 907303, 865268, 824802, 785895, 2087679, 2084732, 2075939, 2061445, + 2041485, 2016372, 1986483, 1952246, 1914124, 1872597, 1828151, 1781267, 1732406, 1682009, 1630482, 1578204, + 1525514, 1466362, 1404644, 1343804, 1284080, 1225671, 1168739, 1113412, 1059788, 1008579, 964208, 921230, + 879672, 839549, 800867, 763621, 1978480, 1975826, 1967905, 1954839, 1936825, 1914124, 1887057, 1855988, + 1821314, 1783449, 1742818, 1699841, 1654932, 1608481, 1560861, 1512415, 1455475, 1397857, 1340635, 1284080, + 1228424, 1173861, 1120554, 1068634, 1018207, 974370, 932290, 891460, 851914, 813673, 776750, 741147, + 1874992, 1872597, 1865445, 1853638, 1837343, 1816782, 1792226, 1763987, 1732406, 1697843, 1660667, 1621251, + 1579957, 1537139, 1490409, 1437617, 1384439, 1331200, 1278192, 1225671, 1173861, 1122950, 1073100, 1024442, + 981238, 940316, 900461, 861723, 824141, 787744, 752548, 718562, 1776918, 1774751, 1768278, 1757587, + 1742818, 1724159, 1701844, 1676139, 1647339, 1615756, 1581713, 1545538, 1507434, 1460902, 1413203, 1364693, + 1315706, 1266542, 1217473, 1168739, 1120554, 1073100, 1026533, 984701, 945184, 906539, 868835, 832126, + 796456, 761857, 728350, 701409, 1683973, 1682009, 1676139, 1666439, 1653028, 1636067, 1615756, 1592325, + 1566029, 1537139, 1505526, 
1466362, 1425326, 1382776, 1339055, 1294488, 1249376, 1203995, 1158599, 1113412, + 1068634, 1024442, 984701, 946815, 909602, 873145, 837515, 802768, 768952, 736100, 707081, 685862, + 1595891, 1594106, 1588774, 1579957, 1567758, 1552316, 1533801, 1512415, 1484803, 1453673, 1420110, 1384439, + 1346984, 1308065, 1267989, 1227047, 1185512, 1143640, 1101662, 1059788, 1018207, 981238, 945184, 909602, + 874589, 840229, 806592, 773737, 741711, 711387, 690636, 670231, 1512415, 1510791, 1505526, 1496049, + 1482942, 1466362, 1446502, 1423584, 1397857, 1369587, 1339055, 1306546, 1272346, 1236738, 1199997, 1162385, + 1124152, 1085527, 1046725, 1008579, 974370, 940316, 906539, 873145, 840229, 807872, 776146, 745108, + 714808, 694081, 674165, 654558, 1420110, 1418378, 1413203, 1404644, 1392801, 1377806, 1359827, 1339055, + 1315706, 1290011, 1262216, 1232570, 1201328, 1168739, 1135051, 1100497, 1065304, 1029682, 996086, 964208, + 932290, 900461, 868835, 837515, 806592, 776146, 746245, 716949, 696161, 676809, 657707, 638880, + 1332765, 1331200, 1326521, 1318782, 1308065, 1294488, 1278192, 1259345, 1238133, 1214758, 1189436, 1162385, + 1133832, 1103997, 1073100, 1041353, 1009480, 980376, 950912, 921230, 891460, 861723, 832126, 802768, + 773737, 745108, 716949, 696857, 678137, 659608, 641298, 623233, 1250792, 1249376, 1245141, 1238133, + 1228424, 1216114, 1201328, 1184209, 1164921, 1143640, 1120554, 1095858, 1069748, 1042424, 1014081, 988182, + 961693, 934687, 907303, 879672, 851914, 824141, 796456, 768952, 741711, 714808, 696161, 678137, + 660244, 642513, 624972, 607647, 1173861, 1172578, 1168739, 1162385, 1153579, 1142407, 1128976, 1113412, + 1095858, 1076467, 1055406, 1032846, 1009480, 987310, 964208, 940317, 915775, 890716, 865268, 839549, + 813673, 787744, 761857, 736100, 711387, 694081, 676809, 659608, 642513, 625554, 608759, 592154, + 1101662, 1100497, 1097015, 1091248, 1083251, 1073100, 1060888, 1046725, 1030735, 1013100, 996086, 977795, + 958355, 937898, 916551, 894445, 
871705, 848451, 824802, 800867, 776750, 752548, 728350, 707081, + 690636, 674165, 657707, 641298, 624972, 608759, 592687, 576779, 1033903, 1032846, 1029682, 1024442, + 1017173, 1008579, 998743, 987310, 974370, 960022, 944370, 927525, 909602, 890716, 870986, 850526, + 829452, 807872, 785895, 763621, 741147, 718562, 701409, 685862, 670231, 654558, 638880, 623233, + 607647, 592154, 576779, 561546}, + {0, 0, 0, 0, 1591422, 1272360, 1017266, 841460, 704536, 589892, 493904, 413535, 346244, + 290031, 243063, 203702, 170715, 143069, 119901, 102775, 95833, 89361, 83325, 77697, 72450, 67557, + 63969, 60623, 57452, 54447, 51599, 48900, 0, 0, 0, 0, 1548184, 1244480, 998601, + 830906, 696789, 584118, 489548, 410217, 343695, 288069, 241537, 202507, 169776, 142329, 119314, 102586, + 95666, 89212, 83193, 77580, 72344, 67462, 63903, 60563, 57397, 54397, 51553, 48858, 0, + 0, 0, 0, 1431876, 1167297, 948704, 800624, 674393, 567335, 476834, 400500, 336213, 282299, + 237036, 198980, 167000, 140136, 117577, 102023, 95167, 88769, 82798, 77227, 72030, 67211, 63706, + 60383, 57232, 54246, 51415, 48731, 0, 0, 0, 0, 1272360, 1056482, 886216, 754291, + 639652, 541032, 456753, 385057, 324262, 273042, 229792, 193287, 162509, 136582, 114755, 101097, 94345, + 88038, 82147, 76646, 71510, 66853, 63379, 60084, 56960, 53996, 51186, 48521, 1591422, 1548184, + 1431876, 1272360, 1098449, 935558, 810496, 696789, 595760, 507341, 430751, 364887, 308578, 260793, 220159, + 185686, 156491, 131804, 110952, 99824, 93215, 87032, 81250, 75844, 70792, 66357, 62926, 59671, + 56581, 53650, 50868, 48229, 1272360, 1244480, 1167297, 1056482, 935558, 830906, 728683, 633061, 546134, + 468641, 400500, 341174, 290031, 246158, 208579, 176499, 149185, 125980, 106300, 98229, 91796, 85767, + 80120, 74833, 69886, 65728, 62353, 59146, 56101, 53210, 50465, 47859, 1017266, 998601, 948704, + 886216, 810496, 728683, 646358, 567335, 493904, 427222, 367671, 315143, 269461, 229792, 195537, 166089, + 140862, 119314, 102964, 96338, 
90110, 84261, 78772, 73626, 68803, 64974, 61663, 58515, 55523, + 52680, 49978, 47412, 841460, 830906, 800624, 754291, 696789, 633061, 567335, 502799, 441610, 385057, + 333772, 288069, 247726, 212344, 181524, 154827, 131804, 112021, 100548, 94183, 88184, 82537, 77227, + 72239, 67557, 64102, 60865, 57784, 54853, 52064, 49413, 46892, 704536, 696789, 674393, 639652, + 595760, 546134, 493904, 441610, 391125, 343695, 300130, 260793, 225593, 194408, 167000, 143069, 122285, + 104314, 97880, 91796, 86045, 80619, 75505, 70690, 66427, 63120, 59966, 56960, 54096, 51369, + 48773, 46303, 589892, 584118, 567335, 541032, 507341, 468641, 427222, 385057, 343695, 304312, 267695, + 234101, 203702, 176499, 152375, 131139, 112561, 101097, 95002, 89212, 83724, 78531, 73626, 68998, + 65246, 62038, 58973, 56048, 53258, 50599, 48064, 45650, 493904, 489548, 476834, 456753, 430751, + 400500, 367671, 333772, 300130, 267695, 237036, 208579, 182553, 159033, 137989, 119314, 103728, 97707, + 91952, 86466, 81250, 76301, 71613, 67211, 63969, 60865, 57896, 55058, 52347, 49759, 47291, + 44937, 413535, 410217, 400500, 385057, 364887, 341174, 315143, 288069, 260793, 234101, 208579, 184633, + 162509, 142329, 124114, 107821, 99824, 94183, 88769, 83591, 78652, 73952, 69489, 65728, 62607, + 59612, 56743, 53996, 51369, 48858, 46459, 44170, 346244, 343695, 336213, 324262, 308578, 290031, + 269461, 247726, 225593, 203702, 182553, 162509, 143815, 126610, 110952, 101281, 95833, 90564, 85490, + 80619, 75958, 71510, 67283, 64168, 61170, 58289, 55523, 52871, 50331, 47900, 45574, 43352, + 290031, 288069, 282299, 273042, 260793, 246158, 229792, 212344, 194408, 176499, 159033, 142329, 126610, + 112021, 102023, 96848, 91796, 86890, 82147, 77580, 73195, 68998, 65521, 62543, 59671, 56905, + 54246, 51692, 49241, 46892, 44642, 42490, 243063, 241537, 237036, 229792, 220159, 208579, 195537, + 181524, 167000, 152375, 137989, 124114, 110952, 102023, 97190, 92422, 87749, 83193, 78772, 74501, + 70387, 66639, 63706, 60865, 58120, 
55471, 52920, 50465, 48105, 45841, 43669, 41588, 203702, + 202507, 198980, 193287, 185686, 176499, 166089, 154827, 143069, 131139, 119314, 107821, 101281, 96848, + 92422, 88038, 83724, 79503, 75392, 71406, 67557, 64636, 61850, 59146, 56528, 53996, 51553, + 49198, 46932, 44753, 42660, 40652, 170715, 169776, 167000, 162509, 156491, 149185, 140862, 131804, + 122285, 112561, 103728, 99824, 95833, 91796, 87749, 83724, 79748, 75844, 72030, 68319, 65315, + 62607, 59966, 57397, 54904, 52489, 50154, 47900, 45726, 43634, 41621, 39687, 143069, 142329, + 140136, 136582, 131804, 125980, 119314, 112021, 104314, 101097, 97707, 94183, 90564, 86890, 83193, + 79503, 75844, 72239, 68706, 65728, 63120, 60563, 58064, 55628, 53258, 50959, 48731, 46577, + 44496, 42490, 40557, 38698, 119901, 119314, 117577, 114755, 110952, 106300, 102964, 100548, 97880, + 95002, 91952, 88769, 85490, 82147, 78772, 75392, 72030, 68706, 65867, 63379, 60926, 58515, + 56154, 53847, 51599, 49413, 47291, 45235, 43247, 41327, 39474, 37689, 102775, 102586, 102023, + 101097, 99824, 98229, 96338, 94183, 91796, 89212, 86466, 83591, 80619, 77580, 74501, 71406, + 68319, 65728, 63379, 61048, 58744, 56474, 54246, 52064, 49934, 47859, 45841, 43882, 41985, + 40150, 38377, 36666, 95833, 95666, 95167, 94345, 93215, 91796, 90110, 88184, 86045, 83724, + 81250, 78652, 75958, 73195, 70387, 67557, 65315, 63120, 60926, 58744, 56581, 54447, 52347, + 50287, 48271, 46303, 44387, 42524, 40716, 38964, 37269, 35632, 89361, 89212, 88769, 88038, + 87032, 85767, 84261, 82537, 80619, 78531, 76301, 73952, 71510, 68998, 66639, 64636, 62607, + 60563, 58515, 56474, 54447, 52442, 50465, 48521, 46616, 44753, 42935, 41164, 39444, 37774, + 36157, 34592, 83325, 83193, 82798, 82147, 81250, 80120, 78772, 77227, 75505, 73626, 71613, + 69489, 67283, 65521, 63706, 61850, 59966, 58064, 56154, 54246, 52347, 50465, 48605, 46773, + 44974, 43212, 41490, 39810, 38174, 36585, 35043, 33722, 77697, 77580, 77227, 76646, 75844, + 74833, 73626, 72239, 70690, 68998, 
67211, 65728, 64168, 62543, 60865, 59146, 57397, 55628, + 53847, 52064, 50287, 48521, 46773, 45049, 43352, 41687, 40057, 38464, 36911, 35400, 34021, + 32902, 72450, 72344, 72030, 71510, 70792, 69886, 68803, 67557, 66427, 65246, 63969, 62607, + 61170, 59671, 58120, 56528, 54904, 53258, 51599, 49934, 48271, 46616, 44974, 43352, 41753, + 40181, 38639, 37131, 35658, 34249, 33154, 32080, 67557, 67462, 67211, 66853, 66357, 65728, + 64974, 64102, 63120, 62038, 60865, 59612, 58289, 56905, 55471, 53996, 52489, 50959, 49413, + 47859, 46303, 44753, 43212, 41687, 40181, 38698, 37242, 35814, 34419, 33335, 32287, 31258, + 63969, 63903, 63706, 63379, 62926, 62353, 61663, 60865, 59966, 58973, 57896, 56743, 55523, + 54246, 52920, 51553, 50154, 48731, 47291, 45841, 44387, 42935, 41490, 40057, 38639, 37242, + 35867, 34517, 33445, 32426, 31423, 30437, 60623, 60563, 60383, 60084, 59671, 59146, 58515, + 57784, 56960, 56048, 55058, 53996, 52871, 51692, 50465, 49198, 47900, 46577, 45235, 43882, + 42524, 41164, 39810, 38464, 37131, 35814, 34517, 33482, 32496, 31522, 30563, 29619, 57452, + 57397, 57232, 56960, 56581, 56101, 55523, 54853, 54096, 53258, 52347, 51369, 50331, 49241, + 48105, 46932, 45726, 44496, 43247, 41985, 40716, 39444, 38174, 36911, 35658, 34419, 33445, + 32496, 31556, 30627, 29710, 28807, 54447, 54397, 54246, 53996, 53650, 53210, 52680, 52064, + 51369, 50599, 49759, 48858, 47900, 46892, 45841, 44753, 43634, 42490, 41327, 40150, 38964, + 37774, 36585, 35400, 34249, 33335, 32426, 31522, 30627, 29741, 28865, 28002, 51599, 51553, + 51415, 51186, 50868, 50465, 49978, 49413, 48773, 48064, 47291, 46459, 45574, 44642, 43669, + 42660, 41621, 40557, 39474, 38377, 37269, 36157, 35043, 34021, 33154, 32287, 31423, 30563, + 29710, 28865, 28030, 27205, 48900, 48858, 48731, 48521, 48229, 47859, 47412, 46892, 46303, + 45650, 44937, 44170, 43352, 42490, 41588, 40652, 39687, 38698, 37689, 36666, 35632, 34592, + 33722, 32902, 32080, 31258, 30437, 29619, 28807, 28002, 27205, 26417}}; + +const 
uint8_t LUTCeilLog2Nonzero[1024] = { + 127, 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10}; + +//==========================================================// +// load data +//==========================================================// +void load_dct8_pixel(unsigned ysize, unsigned xsize, float* axi_opsin, hls::stream& opsin8x8_stream) { +#pragma HLS INLINE off + + int tile_xsize = (xsize + 
63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + +loop_load_dct8_pixel: + for (int i = 0; i < ysize64 * xsize64; i++) { + uint32_t addr = i * 4096 * 3; + for (int j = 0; j < 4096 * 3; j++) { +#pragma HLS PIPELINE II = 1 + float reg = axi_opsin[addr + j]; + opsin8x8_stream.write(reg); + } + } +} + +void load_dct16_pixel(unsigned ysize, unsigned xsize, float* axi_opsin, hls::stream& opsin16x16_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + +loop_load_dct16_pixel: + for (int i = 0; i < ysize64 * xsize64; i++) { + uint32_t addr = i * 4096 * 3; + for (int j = 0; j < 4096 * 3; j++) { +#pragma HLS PIPELINE II = 1 + float reg = axi_opsin[addr + j]; + opsin16x16_stream.write(reg); + } + } +} + +void load_dct32_pixel(unsigned ysize, unsigned xsize, float* axi_opsin, hls::stream& opsin32x32_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + +loop_load_dct32_pixel: + for (int i = 0; i < ysize64 * xsize64; i++) { + uint32_t addr = i * 4096 * 3; + for (int j = 0; j < 4096 * 3; j++) { +#pragma HLS PIPELINE II = 1 + float reg = axi_opsin[addr + j]; + opsin32x32_stream.write(reg); + } + } +} + +void loadPixel(unsigned ysize, + unsigned xsize, + float* axi_opsin_1, + float* axi_opsin_2, + float* axi_opsin_3, + hls::stream& opsin8x8_stream, + hls::stream& opsin16x16_stream, + hls::stream& opsin32x32_stream) { +#pragma HLS INLINE + load_dct8_pixel(ysize, xsize, axi_opsin_1, opsin8x8_stream); + load_dct16_pixel(ysize, xsize, axi_opsin_2, opsin16x16_stream); + load_dct32_pixel(ysize, xsize, axi_opsin_3, opsin32x32_stream); +} + +void load_rqf_mask(int xsize, + int ysize, + float* aq_map_f, + float* 
masking_field_row, + float* quant_field_row, + int stride, + hls::stream& stream_q, + hls::stream& stream_mask, + hls::stream& stream_rqf) { +#pragma HLS INLINE off + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = (xsize_blocks + 7) / 8; + uint32_t xnum_tile = (xsize_blocks + 7) / 8; + uint32_t ynum_tile = (ysize_blocks + 7) / 8; +LOOP_0: + for (int tid = 0; tid < xnum_tile * ynum_tile; tid++) { + int tx1 = tid % n_enc_tiles; + int ty1 = tid / n_enc_tiles; + int by = ty1 * 8; + int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + int bx = tx1 * 8; + int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + int rect_ysize = by1 - by; + int rect_xsize = bx1 - bx; + LOOP_1: + for (int iy = 0; iy < rect_ysize; iy++) { + LOOP_2: + for (int ix = 0; ix < rect_xsize; ix++) { +#pragma HLS PIPELINE II = 1 + int x = 8 * (bx + ix); + int y = 8 * (by + iy); + int index0 = (y / 8 * stride) + x / 8; + float quant_norm8 = 0; + float masking = 0; + quant_norm8 = quant_field_row[index0]; + stream_q.write(quant_norm8); + masking = masking_field_row[index0]; + stream_mask.write(masking); + int index = (by + iy) * xsize_blocks + (bx + ix); + float rqf_tmp = aq_map_f[index]; + stream_rqf.write(rqf_tmp); + } + } + } +} + +//==========================================================================// +// data write out +//==========================================================================// +void ac_coeff_writeout(int xsize, int ysize, hls::stream& ac_coef_strm, int* ac_coef_axiout) { + unsigned xsizeblock = (xsize + 7) / 8; + unsigned ysizeblock = (ysize + 7) / 8; + for (int i = 0; i < xsizeblock * ysizeblock * 3 * 64; i++) { + ac_coef_axiout[i] = ac_coef_strm.read(); + } +} + +void dc_8x8_writeout(unsigned ysize, + unsigned xsize, + float* hls_dc8x8, + hls::stream& stream_rectx_dc0, + hls::stream& stream_recty_dc0, + hls::stream& dc_coef8x8_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) 
/ 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + uint32_t ysize32 = tile_ysize / 32; + uint32_t xsize32 = tile_xsize / 32; + uint32_t ysize16 = tile_ysize / 16; + uint32_t xsize16 = tile_xsize / 16; + uint32_t ysize8 = tile_ysize / 8; + uint32_t xsize8 = tile_xsize / 8; + + // dc writeout + int N = 1; + int block_n = N * N; + int block_half_n = N * 8; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = (xsize_blocks + 7) / 8; +loop_dc8_writeout: + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + int rect_ysize = stream_recty_dc0.read(); + int rect_xsize = stream_rectx_dc0.read(); + for (uint32_t y8 = 0; y8 < 8; y8++) { + for (uint32_t x8 = 0; x8 < 8; x8++) { + for (int c = 0; c < 3; c++) { +#pragma HLS PIPELINE II = 1 + // edge judgement + // int tx1 = x64; + // int ty1 = y64; + // int by = ty1 * 8; + // int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + // int bx = tx1 * 8; + // int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + // int rect_ysize = by1 - by; + // int rect_xsize = bx1 - bx; + // int tile_xsize = (xsize + 63) / 64 * 64; + // int tile_ysize = (ysize + 63) / 64 * 64; + + int c_tmp = 0; + if (c == 0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + + size_t addr = y64 * xsize8 * 8 + x64 * 8 + y8 * xsize8 + x8; + + if (x8 < rect_xsize && y8 < rect_ysize) { + float reg = dc_coef8x8_stream.read(); + hls_dc8x8[c_tmp * tile_ysize * tile_xsize + addr] = reg; + } + } + } + } + } + } +} + +void dc_16x16_writeout(unsigned ysize, + unsigned xsize, + float* hls_dc16x16, + hls::stream& stream_rectx_dc1, + hls::stream& stream_recty_dc1, + hls::stream& dc_coef16x16_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + size_t ysize64 = tile_ysize / 64; + size_t xsize64 = tile_xsize / 64; + size_t ysize32 = tile_ysize / 32; + size_t xsize32 = 
tile_xsize / 32; + size_t ysize16 = tile_ysize / 16; + size_t xsize16 = tile_xsize / 16; + size_t ysize8 = tile_ysize / 8; + size_t xsize8 = tile_xsize / 8; + int N = 2; + int block_n = N * N; + int block_half_n = N * 8; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = (xsize_blocks + 7) / 8; + for (size_t y64 = 0; y64 < ysize64; y64++) { + for (size_t x64 = 0; x64 < xsize64; x64++) { + int rect_ysize = stream_recty_dc1.read(); + int rect_xsize = stream_rectx_dc1.read(); + for (size_t y16 = 0; y16 < 4; y16++) { + for (size_t x16 = 0; x16 < 4; x16++) { + for (int c = 0; c < 3; c++) { + for (size_t m = 0; m < 2; m++) { + for (size_t n = 0; n < 2; n++) { +#pragma HLS PIPELINE II = 1 + // edge judgement + // int tx1 = x64; // tid % n_enc_tiles; + // int ty1 = y64; // tid / n_enc_tiles; + // int by = ty1 * 8; + // int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + // int bx = tx1 * 8; + // int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + // int rect_ysize = by1 - by; + // int rect_xsize = bx1 - bx; + // int tile_xsize = (xsize + 63) / 64 * 64; + // int tile_ysize = (ysize + 63) / 64 * 64; + + int c_tmp = 0; + if (c == 0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + + size_t addr = + y64 * xsize16 * 4 * 4 + x64 * 4 * 4 + y16 * xsize16 * 4 + x16 * 4 + m * 2 + n; + + if ((2 * x16 + 1) < rect_xsize && (2 * y16 + 1) < rect_ysize) { + float reg = dc_coef16x16_stream.read(); + hls_dc16x16[c_tmp * tile_ysize * tile_xsize + addr] = reg; + } + } + } + } + } + } + } + } +} + +void dc_32x32_writeout(unsigned ysize, + unsigned xsize, + float* hls_dc32x32, + hls::stream& stream_rectx_dc2, + hls::stream& stream_recty_dc2, + hls::stream& dc_coef32x32_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + size_t ysize64 = tile_ysize / 64; + size_t xsize64 = tile_xsize / 64; + size_t ysize32 = tile_ysize / 32; + size_t xsize32 = tile_xsize / 
32; + size_t ysize16 = tile_ysize / 16; + size_t xsize16 = tile_xsize / 16; + size_t ysize8 = tile_ysize / 8; + size_t xsize8 = tile_xsize / 8; + int N = 4; + int block_n = N * N; + int block_half_n = N * 8; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = (xsize_blocks + 7) / 8; + for (size_t y64 = 0; y64 < ysize64; y64++) { + for (size_t x64 = 0; x64 < xsize64; x64++) { + int rect_ysize = stream_recty_dc2.read(); + int rect_xsize = stream_rectx_dc2.read(); + for (size_t y32 = 0; y32 < 2; y32++) { + for (size_t x32 = 0; x32 < 2; x32++) { + for (int c = 0; c < 3; c++) { + for (size_t m = 0; m < 4; m++) { + for (size_t n = 0; n < 4; n++) { +#pragma HLS PIPELINE II = 1 + // edge judgement + // int tx1 = x64; // tid % n_enc_tiles; + // int ty1 = y64; // tid / n_enc_tiles; + // int by = ty1 * 8; + // int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + // int bx = tx1 * 8; + // int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + // int rect_ysize = by1 - by; + // int rect_xsize = bx1 - bx; + // int tile_xsize = (xsize + 63) / 64 * 64; + // int tile_ysize = (ysize + 63) / 64 * 64; + + int c_tmp = 0; + if (c == 0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + + size_t addr = y64 * xsize32 * 2 * 16 + x64 * 2 * 16 + y32 * xsize32 * 1 * 16 + + x32 * 1 * 16 + m * 4 + n; + + if ((4 * x32 + 3) < rect_xsize && (4 * y32 + 3) < rect_ysize) { + float reg = dc_coef32x32_stream.read(); + hls_dc32x32[c_tmp * tile_ysize * tile_xsize + addr] = reg; + } + } + } + } + } + } + } + } +} + +void GetDCSize(short xsize, + short ysize, + hls::stream& stream_rectx_dc, + hls::stream& stream_recty_dc, + hls::stream& stream_rectx0, + hls::stream& stream_recty0, + hls::stream& stream_rectx1, + hls::stream& stream_recty1, + hls::stream& stream_rectx2, + hls::stream& stream_recty2) { + uint16_t xsize_blocks = xsize / 8; + uint16_t ysize_blocks = ysize / 8; +LOOP_0: + for (uint16_t y = 0; y < (ysize_blocks + 8 - 1) / 8; y++) { + 
LOOP_1: + for (uint16_t x = 0; x < (xsize_blocks + 8 - 1) / 8; x++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty_dc.read(); + uint8_t rect_xsize = stream_rectx_dc.read(); + stream_rectx0.write(rect_xsize); + stream_recty0.write(rect_ysize); + stream_rectx1.write(rect_xsize); + stream_recty1.write(rect_ysize); + stream_rectx2.write(rect_xsize); + stream_recty2.write(rect_ysize); + } + } +} +void dc_writeout(unsigned ysize, + unsigned xsize, + float* hls_dc8x8, + float* hls_dc16x16, + float* hls_dc32x32, + + hls::stream& stream_rectx_dc, + hls::stream& stream_recty_dc, + hls::stream& dc_coef8x8_stream, + hls::stream& dc_coef16x16_stream, + hls::stream& dc_coef32x32_stream) { +#pragma HLS INLINE + hls::stream stream_rectx_dc0; + hls::stream stream_recty_dc0; + hls::stream stream_rectx_dc1; + hls::stream stream_recty_dc1; + hls::stream stream_rectx_dc2; + hls::stream stream_recty_dc2; + GetDCSize(xsize, ysize, stream_rectx_dc, stream_recty_dc, stream_rectx_dc0, stream_recty_dc0, stream_rectx_dc1, + stream_recty_dc1, stream_rectx_dc2, stream_recty_dc2); + dc_8x8_writeout(ysize, xsize, hls_dc8x8, stream_rectx_dc0, stream_recty_dc0, dc_coef8x8_stream); + dc_16x16_writeout(ysize, xsize, hls_dc16x16, stream_rectx_dc1, stream_recty_dc1, dc_coef16x16_stream); + dc_32x32_writeout(ysize, xsize, hls_dc32x32, stream_rectx_dc2, stream_recty_dc2, dc_coef32x32_stream); +} + +void cfl_writeout(unsigned xsize, + unsigned ysize, + hls::stream& cmapx_strm, + hls::stream& cmapb_strm, + int8_t* cmap_axi) { +#pragma HLS INLINE off + + int xnum_tile = (xsize + 63) / 64; + int ynum_tile = (ysize + 63) / 64; + int num_tile = xnum_tile * ynum_tile; + + for (int tid = 0; tid < num_tile; tid++) { +#pragma HLS PIPELINE II = 2 + cmap_axi[tid] = cmapx_strm.read(); + cmap_axi[num_tile + tid] = cmapb_strm.read(); + } +} + +void acs_rqf_writeout(int xsize, + int ysize, + unsigned char* strategy_all, + int* raw_quant_field_i, + hls::stream& stream_strategy, + 
hls::stream& stream_rqf) { +#pragma HLS INLINE off + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + uint32_t xnum_tile = (xsize_blocks + 7) / 8; + uint32_t ynum_tile = (ysize_blocks + 7) / 8; + ap_uint<64> visited; +LOOP_1: + for (uint8_t ty1 = 0; ty1 < ynum_tile; ty1++) { + LOOP_2: + for (uint8_t tx1 = 0; tx1 < xnum_tile; tx1++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + int by0 = ty1 * 8; + int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + int bx0 = tx1 * 8; + int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + int rect_ysize = by1 - by0; + int rect_xsize = bx1 - bx0; + visited = 0; + LOOP_3: + for (uint8_t y = 0; y < rect_ysize; ++y) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_4: + for (uint8_t x = 0; x < rect_xsize; ++x) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + uint8_t idx = y * 8 + x; + if (visited.range(idx, idx) == 0) { + char strategy = stream_strategy.read(); + int rqf = stream_rqf.read(); + int b = strategy_block[strategy]; + LOOP_5: + for (uint8_t iy = 0; iy < b; iy++) { + LOOP_6: + for (uint8_t ix = 0; ix < b; ix++) { +#pragma HLS pipeline + uint16_t idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + uint16_t idxout = (y + by0 + iy) * xsize_blocks + (x + bx0 + ix); + strategy_all[(y + by0 + iy) * xsize_blocks + (x + bx0 + ix)] = strategy; + raw_quant_field_i[(y + by0 + iy) * xsize_blocks + (x + bx0 + ix)] = rqf; + } + } + } + } + } + } + } +} + +//=========================================================// +// module +//=========================================================// +// cfl ----------------------------------------------------- +void hls_CFLComputeTile(unsigned xsize, + unsigned ysize, + hls::stream& ac_coef_strm, + hls::stream& rqf_in_stream, + hls::stream& acs_strm, + hls::stream& cmapx_strm, + hls::stream& cmapb_strm, + hls::stream& cmapx_axi_strm, + hls::stream& cmapb_axi_strm, + hls::stream& ac_coef_cflout_strm, + hls::stream& rqf_out_stream, + hls::stream& acs_cflout_strm) { 
+#pragma HLS INLINE off + const uint8_t kDefaultColorFactor = 84U; + const float kInvColorFactor = 1.0f / kDefaultColorFactor; + const float kYToBRatio = 1.0f; + const float kDistanceMultiplierAC = 1e-3f; + + unsigned xsize_alg = (xsize + 7) / 8 * 8; + unsigned ysize_alg = (ysize + 7) / 8 * 8; + int xnum_tile = (xsize + 63) / 64; + int ynum_tile = (ysize + 63) / 64; + int num_tile = xnum_tile * ynum_tile; + unsigned tx0 = 0; + unsigned ty0 = 0; + + for (int tid = 0; tid < num_tile; tid++) { + ca_x_t ca_x = 0; + cb_x_t cb_x = 0; + ca_b_t ca_b = 0; + cb_b_t cb_b = 0; + unsigned xsize; + unsigned ysize; + + if (ty0 + 64 > ysize_alg) { + ysize = ysize_alg - ty0; + } else { + ysize = 64; + } + + if (tx0 + 64 > xsize_alg) { + xsize = xsize_alg - tx0; + } else { + xsize = 64; + } + + unsigned total_pix = xsize * ysize; + unsigned cur_pix = 0; + + while (cur_pix < total_pix) { + uint8_t acsRaw = acs_strm.read(); + acs_cflout_strm.write(acsRaw); + rqf_out_stream.write(rqf_in_stream.read()); + + float q = 27.996826171875; + float q_dc_x = 0.000218007407966069877147674560546875; + float q_dc_b = 0.00348811852745711803436279296875; + + unsigned csize; + if (acsRaw == 0) { + csize = 64; + } else if (acsRaw == 4) { + csize = 256; + } else if (acsRaw == 5) { + csize = 1024; + } + + int error_flag = 0; + + for (unsigned i = 0; i < csize; i++) { +#pragma HLS PIPELINE II = 3 + float b_y = ac_coef_strm.read(); + float b_x = ac_coef_strm.read(); + float b_b = ac_coef_strm.read(); + + ac_coef_cflout_strm.write(b_y); + ac_coef_cflout_strm.write(b_x); + ac_coef_cflout_strm.write(b_b); + + float qm_x; + float qm_b; + + if (acsRaw == 0) { + qm_x = qmx8x8[i]; + qm_b = qmb8x8[i]; + } else if (acsRaw == 4) { + qm_x = qmx16x16[i]; + qm_b = qmb16x16[i]; + } else if (acsRaw == 5) { + qm_x = qmx32x32[i]; + qm_b = qmb32x32[i]; + } + + float qqm_x = q * qm_x; + float qqm_b = q * qm_b; + + float coeffs_yx = b_y * qqm_x; + float coeffs_x = b_x * qqm_x; + float a = kInvColorFactor * coeffs_yx; + float 
b = 0.0f * coeffs_yx - coeffs_x; + ca_x = (ca_x_t)(a * a) + ca_x; + cb_x = (cb_x_t)(a * b) + cb_x; + + float coeffs_yb = b_y * qqm_b; + float coeffs_b = b_b * qqm_b; + + a = kInvColorFactor * coeffs_yb; + b = kYToBRatio * coeffs_yb - coeffs_b; + ca_b = (ca_b_t)(a * a) + ca_b; + cb_b = (cb_b_t)(a * b) + cb_b; + + cur_pix++; + } + } + + float x; + x = -(float)cb_x / ((float)ca_x + total_pix * kDistanceMultiplierAC * 0.5f); + int8_t cmap_x_reg = std::max(-128.0f, std::min(127.0f, std::roundf(x))); + cmapx_strm.write(cmap_x_reg); + cmapx_axi_strm.write(cmap_x_reg); + + x = -(float)cb_b / ((float)ca_b + total_pix * kDistanceMultiplierAC * 0.5f); + int8_t cmap_b_reg = std::max(-128.0f, std::min(127.0f, std::roundf(x))); + cmapb_strm.write(cmap_b_reg); + cmapb_axi_strm.write(cmap_b_reg); + + if (tx0 + 64 >= xsize_alg) { + tx0 = 0; + ty0 = ty0 + 64; + } else { + tx0 = tx0 + 64; + } + } +} +//--------------------------hls_compute_coefficients--------------------------// +float adjustQuantBias(size_t c, int32_t quant_i, const float* biases) { + int32_t min = INT32_MIN; + cast mi, ani, anno; + mi.i = min; + int32_t and_result = quant_i & mi.i; + ani.i = and_result; + float sign = ani.f; + // int32_t and_no_result = (~mi.i) & quant_i; + // anno.i = and_no_result; + float abs_quant = std::abs(quant_i); + // printf("%f %f\n", sign, abs_quant); + bool is_01 = abs_quant < 1.125f; + bool not_0 = abs_quant > 0; + cast bi, si; + bi.f = biases[c]; + int32_t iTmp = bi.i ^ ani.i; + si.i = iTmp; + float one_bias = not_0 ? (si.f) : 0; + float tmp = quant_i ? (1.0 / quant_i) : 0.0f; + float bias = quant_i - biases[3] * tmp; + return is_01 ? 
one_bias : bias; +} + +void hls_ComputeCoefficients(uint32_t xsize, + uint32_t ysize, + hls::stream& acsStrm, + hls::stream& dctStrm, + hls::stream& quantFieldStrm, + hls::stream& ytoxMapStrm, + hls::stream& ytobMapStrm, + hls::stream& acs_coeff_stream1, + hls::stream& coeffOutStrm, + hls::stream& coeff_axi_stream, + hls::stream& acs_axi_strm, + hls::stream& qf_axi_strm) { +#pragma HLS INLINE off + uint8_t acs; + uint8_t xblocks, yblocks; + int8_t ytox_map, ytob_map; + float x_factor, b_factor; + float qm_multiplier = 1.0f; + bool stop(false); + float coef_dct[3]; +#pragma HLS ARRAY_PARTITION variable = coef_dct complete dim = 1 + ap_uint<32> offset; + int block_out; + float thr_x, thr_y, thr_b, out_x, out_b; + + float thresy[4] = {0.5f, 0.6f, 0.6f, 0.65f}; + float thresxb[4] = {0.5f, 0.75f, 0.75f, 0.75f}; +#pragma HLS ARRAY_PARTITION variable = thresy complete dim = 1 +#pragma HLS ARRAY_PARTITION variable = thresxb complete dim = 1 + + uint32_t xsize_blocks = (xsize + 7) / 8; + uint32_t ysize_blocks = (ysize + 7) / 8; + uint16_t xsize_tails = DivCeil(xsize_blocks, kEncTileDimInBlocks); + uint16_t ysize_tails = DivCeil(ysize_blocks, kEncTileDimInBlocks); + uint16_t xsize_left = 8 - (xsize_tails * 8 - xsize_blocks); // not aligned for blocks + uint16_t ysize_left = 8 - (ysize_tails * 8 - ysize_blocks); // not aligned for blocks + uint16_t num_blocks; + + for (uint16_t ty = 0; ty < ysize_tails; ++ty) { + for (uint16_t tx = 0; tx < xsize_tails; ++tx) { + ytoxMapStrm.read(ytox_map); + ytobMapStrm.read(ytob_map); + x_factor = base_correlation_x + ytox_map * color_scale; + b_factor = base_correlation_b + ytob_map * color_scale; + if (tx == (xsize_tails - 1) && ty != (ysize_tails - 1)) { + num_blocks = xsize_left * 8; + } else if (tx != (xsize_tails - 1) && ty == (ysize_tails - 1)) { + num_blocks = ysize_left * 8; + } else if (tx == (xsize_tails - 1) && ty == (ysize_tails - 1)) { + num_blocks = xsize_left * ysize_left; + } else { + num_blocks = 64; + } + + uint32_t total 
= num_blocks * 64; + uint32_t cur = 0; + ap_uint<16> size = 0, count = 0; + ap_uint<16> y, x; + int quant; + float qac, fquant, inv_qac; + + while (cur < total) { +#pragma HLS PIPELINE II = 3 + if (count == 0) { + acsStrm.read(acs); + acs_axi_strm.write(acs); + acs_coeff_stream1.write(acs); + if (acs == Type::DCT) { + xblocks = 1; + yblocks = 1; + } else if (acs == Type::DCT16X16) { + xblocks = 2; + yblocks = 2; + } else { + xblocks = 4; + yblocks = 4; + } + + size = kDCTBlockSize * xblocks * yblocks; + quant = quantFieldStrm.read(); + qf_axi_strm.write(quant); + qac = global_scale_float * quant; + fquant = qac * qm_multiplier; // fquant_table[quant - 1]; + inv_qac = inv_global_scale / quant; // inv_qac_table[quant - 1]; + } + y = count / (yblocks * kBlockDim); + x = count % (xblocks * kBlockDim); + ap_uint<32> off; + ap_uint<32> yfix; + if (x == 0) { + off = y * kBlockDim * xblocks; + ap_uint<32> yhalf = yblocks * 4; // ysize * kBlockDim / 2 + if (y >= yhalf) + yfix = 2; + else + yfix = 0; + } + + thr_x = 0; + thr_y = 0; + thr_b = 0; + if (xblocks == 1) { + if (x >= 4) { + thr_x = thresxb[yfix + 1]; //(c == 1) ? thresy[yfix + 1] : thresxb[yfix + 1]; + thr_y = thresy[yfix + 1]; + thr_b = thresxb[yfix + 1]; + } else { + thr_x = thresxb[yfix]; //(c == 1) ? thresy[yfix + 1] : + // thresxb[yfix + 1]; + thr_y = thresy[yfix]; + thr_b = thresxb[yfix]; + } + } else { + ap_uint<32> xhalf = xblocks * 4; // xsize * kBlockDim / 2 + ap_uint<32> xfix; + if (x < xhalf) + xfix = 0; + else + xfix = 1; + thr_x = thresxb[yfix + xfix]; + thr_y = thresy[yfix + xfix]; + thr_b = thresxb[yfix + xfix]; // thr = (c == 1) ? 
thresy[yfix + + // xfix] : thresxb[yfix + xfix]; + } + + float q_x; + float q_y; + float q_b; + if (acs == Type::DCT) { + q_x = inv_dequant_stable[0 + off + x] * fquant; + q_y = inv_dequant_stable[64 + off + x] * fquant; + q_b = inv_dequant_stable[128 + off + x] * fquant; + } else if (acs == Type::DCT16X16) { + q_x = inv_dequant_stable[768 + off + x] * fquant; + q_y = inv_dequant_stable[1024 + off + x] * fquant; + q_b = inv_dequant_stable[1280 + off + x] * fquant; + } else if (acs == Type::DCT32X32) { + q_x = inv_dequant_stable[1536 + off + x] * fquant; + q_y = inv_dequant_stable[2560 + off + x] * fquant; + q_b = inv_dequant_stable[3584 + off + x] * fquant; + } + + coef_dct[1] = dctStrm.read(); + coef_dct[0] = dctStrm.read(); + coef_dct[2] = dctStrm.read(); + + float val_y; + val_y = q_y * coef_dct[1]; + + bool nzero_mask_y = std::abs(val_y) >= thr_y; + + int32_t v_y; + if (nzero_mask_y) { + v_y = std::roundf(val_y); + } else { + v_y = 0; + } + + float adj_quant = adjustQuantBias(1, v_y, kDefaultQuantBias); + float dequantm; + if (acs == Type::DCT) { + dequantm = dequant_table[64 + off + x]; + } else if (acs == Type::DCT16X16) { + dequantm = dequant_table[1024 + off + x]; + } else if (acs == Type::DCT32X32) { + dequantm = dequant_table[2560 + off + x]; + } + coef_dct[1] = adj_quant * dequantm * inv_qac; + + out_x = coef_dct[0] - x_factor * coef_dct[1]; + coef_dct[0] = out_x; + + out_b = coef_dct[2] - b_factor * coef_dct[1]; + coef_dct[2] = out_b; + + float val_x; //= q * coef_dct[c]; // block_in[off + x] + float val_b; + val_x = q_x * coef_dct[0]; + val_b = q_b * coef_dct[2]; + + bool nzero_mask_x = std::abs(val_x) >= thr_x; + + bool nzero_mask_b = std::abs(val_b) >= thr_b; + + int32_t v_x; + + int32_t v_b; + if (nzero_mask_x) { + v_x = std::roundf(val_x); + } else { + v_x = 0; + } + + if (nzero_mask_b) { + v_b = std::roundf(val_b); + } else { + v_b = 0; + } + + coeffOutStrm.write(v_y); + coeffOutStrm.write(v_x); + coeffOutStrm.write(v_b); + + 
coeff_axi_stream.write(v_y); + coeff_axi_stream.write(v_x); + coeff_axi_stream.write(v_b); + cur++; + count++; + if (count == size) count = 0; + } // while + } // tx + } // ty +} + +//--------------------- Compute ALL orders---------------------// +template // opt1:256(slow), opt2:8(fast) +void hls_sort(int size, + hls::stream& count_instrm, + hls::stream& pos_instrm, + hls::stream& pos_outstrm) { + unsigned count_shift[RANGE]; + unsigned pos_shift[RANGE]; + ap_uint cmp = 0; + + for (int i = 0; i < RANGE; i++) { +#pragma HLS UNROLL + count_shift[i] = 0; + } + + for (int i = 0; i < size + RANGE; i++) { +#pragma HLS PIPELINE II = 1 + unsigned count_reg; + unsigned pos_reg; + if (i < size) { + pos_reg = pos_instrm.read(); + count_reg = count_instrm.read(); + } else { + count_reg = -1; + pos_reg = -1; + } + + for (int i = 0; i < RANGE; i++) { +#pragma HLS UNROLL + cmp[i] = count_reg >= count_shift[i]; + } + + if (i >= RANGE) { + pos_outstrm.write(pos_shift[0]); + } + + for (int i = 1; i < RANGE; i++) { +#pragma HLS UNROLL + if (cmp[i] == 1) { + count_shift[i - 1] = count_shift[i]; + pos_shift[i - 1] = pos_shift[i]; + } + } + + unsigned insert_pos; + ap_uint cmp_br = ~cmp; + cmp_br.reverse(); + if (cmp_br == 0) { + insert_pos = RANGE - 1; + } else if (cmp == 0) { + insert_pos = 0; + } else { + insert_pos = cmp_br.countLeadingZeros() - 1; + } + + count_shift[insert_pos] = count_reg; + pos_shift[insert_pos] = pos_reg; + } +} + +void hls_sort_top(hls::stream& count_instrm, + hls::stream& pos_instrm, + hls::stream& pos_outstrm) { + unsigned sz; + for (uint8_t o = 0; o < 2; ++o) { + if (o == 0) { + sz = 64; + } else { + sz = 256; + } + + for (uint8_t c = 0; c < 3; c++) { +#ifndef __SYNTHESIS__ + hls_sort<8>(sz, count_instrm, pos_instrm, pos_outstrm); +#else + hls_sort<8>(sz, count_instrm, pos_instrm, pos_outstrm); +#endif + } + } +} + +void init_numzeros(int32_t num_zeros[3][320]) { + for (int i = 0; i < 320; i++) { + for (int c = 0; c < 3; c++) { +#pragma HLS PIPELINE II = 
1 + num_zeros[c][i] = 0; + } + } +} + +void count_numzeros(unsigned xsize, + unsigned ysize, + hls::stream& ac_strategy_strm, + hls::stream& ac_coef_quant_strm, + hls::stream >& used_orders_strm, + int32_t num_zeros[3][320]) { +#pragma HLS INLINE off + unsigned xsize_alg = (xsize + 7) / 8 * 8; + unsigned ysize_alg = (ysize + 7) / 8 * 8; + unsigned total_pix = xsize_alg * ysize_alg; + unsigned cur_pix = 0; + + const int32_t offset8x8 = 0; + // const int32_t offsetIDT = 64; + const int32_t offset16x16 = 64; + + ap_uint<3> used_orders_ap = 0; + + while (cur_pix < total_pix) { + uint8_t acsRaw = ac_strategy_strm.read(); + unsigned size; + if (acsRaw == 0) { + size = 64; + used_orders_ap[0] = 1; + } else if (acsRaw != 0 && acsRaw < 4) { + used_orders_ap[1] = 1; + size = 64; + } else if (acsRaw == 4) { + used_orders_ap[2] = 1; + size = 256; + } else if (acsRaw == 5) { + size = 1024; + } + cur_pix = cur_pix + size; + + for (unsigned k = 0; k < size; k++) { + for (int c = 0; c < 3; c++) { +#pragma HLS PIPELINE II = 1 +#pragma HLS DEPENDENCE variable = num_zeros type = inter false + bool is_zerox = ac_coef_quant_strm.read() == 0; + if (is_zerox) { + if (acsRaw == 0) { + num_zeros[c][offset8x8 + k]++; + } else if (acsRaw == 4) { + num_zeros[c][offset16x16 + k]++; + } + } + } + } + } + + used_orders_strm.write(used_orders_ap); + num_zeros[0][offset8x8] = -1; + num_zeros[0][offset16x16 + 0] = -1; + num_zeros[0][offset16x16 + 1] = -1; + num_zeros[0][offset16x16 + 16] = -1; + num_zeros[0][offset16x16 + 17] = -1; + num_zeros[1][offset8x8] = -1; + num_zeros[1][offset16x16 + 0] = -1; + num_zeros[1][offset16x16 + 1] = -1; + num_zeros[1][offset16x16 + 16] = -1; + num_zeros[1][offset16x16 + 17] = -1; + num_zeros[2][offset8x8] = -1; + num_zeros[2][offset16x16 + 0] = -1; + num_zeros[2][offset16x16 + 1] = -1; + num_zeros[2][offset16x16 + 16] = -1; + num_zeros[2][offset16x16 + 17] = -1; +} + +void load_nz2strm(int32_t num_zeros[3][320], hls::stream& count_strm, hls::stream& pos_strm) { + 
const int32_t offset8x8 = 0; + // const int32_t offsetIDT = 64; + const int32_t offset16x16 = 64; + + const uint32_t coef8x8_zigzag[64] = {0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63}; + const uint32_t coef16x16_zigzag[256] = { + 0, 1, 16, 17, 32, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5, 20, 35, 50, 65, 80, 96, + 81, 66, 51, 36, 21, 6, 7, 22, 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, + 8, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40, 25, 10, + 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, + 42, 27, 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, + 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, + 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, 31, 47, 62, 77, + 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, + 78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, 170, 155, 140, + 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, + 143, 158, 173, 188, 203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, 250, 251, + 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255}; + + unsigned sz; + + for (uint8_t o = 0; o < 2; ++o) { + float inv_sqrt_sz; + + if (o == 0) { + sz = 64; + inv_sqrt_sz = 1.0f / 8.0f; + } else { + sz = 256; + inv_sqrt_sz = 1.0f / 16.0f; + } + + for (uint8_t c = 0; c < 3; c++) { + for (unsigned i = 0; i < sz; ++i) { +#pragma HLS PIPELINE II = 1 + unsigned pos; + if (o == 0) { + pos = coef8x8_zigzag[i]; + } else { + 
pos = coef16x16_zigzag[i]; + } + + // We don't care for the exact number -> quantize number of zeros, + // to get less permuted order. + if (o == 0) { + pos_strm.write(pos); + count_strm.write(num_zeros[c][offset8x8 + pos] * inv_sqrt_sz + 0.1f); + } else { + pos_strm.write(pos); + count_strm.write(num_zeros[c][offset16x16 + pos] * inv_sqrt_sz + 0.1f); + } + } + } + } +} + +void order_writeout(hls::stream >& used_orders_strm, + hls::stream& pos_strm, + uint32_t hls_order[320 * 3 + 1] // AXI port + ) { + const int32_t offset8x8 = 0; + // const int32_t offsetIDT = 64; + const int32_t offset16x16 = 64; + + const uint32_t coef8x8_zigzag[64] = {0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63}; + const uint32_t coef16x16_zigzag[256] = { + 0, 1, 16, 17, 32, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5, 20, 35, 50, 65, 80, 96, + 81, 66, 51, 36, 21, 6, 7, 22, 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, + 8, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40, 25, 10, + 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, + 42, 27, 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, + 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, + 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, 31, 47, 62, 77, + 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, + 78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, 170, 155, 140, + 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, + 143, 158, 173, 188, 203, 218, 233, 248, 249, 
234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, 250, 251, + 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255}; + + unsigned sz; + ap_uint<3> used_orders_ap = used_orders_strm.read(); + for (uint8_t o = 0; o < 2; ++o) { + if (o == 0) { + sz = 64; + } else { + sz = 256; + } + + bool is_nondefault = false; + for (uint8_t c = 0; c < 3; c++) { + for (unsigned i = 0; i < sz; ++i) { +#pragma HLS PIPELINE II = 1 + unsigned pos_reg = pos_strm.read(); + if (o == 0) { + hls_order[c * 320 + offset8x8 + i] = pos_reg; + } else { + hls_order[c * 320 + offset16x16 + i] = pos_reg; + } + if (o == 0) { + is_nondefault |= coef8x8_zigzag[i] != pos_reg; + } else { + is_nondefault |= coef16x16_zigzag[i] != pos_reg; + } + } + } + if (!is_nondefault) { + if (o == 0) + used_orders_ap[0] = 0; + else + used_orders_ap[2] = 0; + } + } + hls_order[320 * 3] = used_orders_ap; +} + +void order_finalize_dataflow(hls::stream >& used_orders_strm, + int32_t num_zeros[3][320], + uint32_t hls_order[320 * 3 + 1]) { +#pragma HLS DATAFLOW + hls::stream count_instrm("count_instrm"); + hls::stream pos_instrm("pos_instrm"); + hls::stream pos_outstrm("pos_outstrm"); + + load_nz2strm(num_zeros, count_instrm, pos_instrm); + + hls_sort_top(count_instrm, pos_instrm, pos_outstrm); + + order_writeout(used_orders_strm, pos_outstrm, hls_order); +} + +//-------------------------- dct --------------------------// +// dct8x8 +void hls_DCT1DImpl_8x8(float in[64], float out[64]) { +#pragma HLS INLINE off + float kMultipliers_N8_c1 = 0.5097955791041592; + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + float sqrt2 = 1.4142135623730951f; + +loop_dct8x8: + for (int i = 0; i < 8; i += 1) { +#pragma HLS DEPENDENCE variable = in inter false +#pragma HLS DEPENDENCE variable = out inter false +#pragma HLS 
LOOP_FLATTEN off +#pragma HLS pipeline II = 2 + float tmp8_0 = in[i * 8 + 0] + in[i * 8 + 7]; + float tmp8_1 = in[i * 8 + 1] + in[i * 8 + 6]; + float tmp8_2 = in[i * 8 + 2] + in[i * 8 + 5]; + float tmp8_3 = in[i * 8 + 3] + in[i * 8 + 4]; + float tmp8_4 = in[i * 8 + 0] - in[i * 8 + 7]; + float tmp8_5 = in[i * 8 + 1] - in[i * 8 + 6]; + float tmp8_6 = in[i * 8 + 2] - in[i * 8 + 5]; + float tmp8_7 = in[i * 8 + 3] - in[i * 8 + 4]; + + float t00 = tmp8_0 + tmp8_3; + float t01 = tmp8_1 + tmp8_2; + float t02 = tmp8_0 - tmp8_3; + float t03 = tmp8_1 - tmp8_2; + + float t16 = tmp8_4 * kMultipliers_N8_c1; + float t17 = tmp8_5 * kMultipliers_N8_c2; + float t18 = tmp8_6 * kMultipliers_N8_c3; + float t19 = tmp8_7 * kMultipliers_N8_c4; + + // tmp 0~3 + float t04 = t00 + t01; + float t05 = t00 - t01; + float t06 = t02 * kMultipliers_N4_c1; + float t07 = t03 * kMultipliers_N4_c2; + + float t09 = t05; + float t10 = t06 + t07; + float t11 = t06 - t07; + + float t13 = t09; + float t14 = t10 * sqrt2 + t11; + float t15 = t11; + // tmp 4~7 + float t00_a = t16 + t19; + float t01_a = t17 + t18; + float t02_a = t16 - t19; + float t03_a = t17 - t18; + + float t04_a = t00_a + t01_a; + float t05_a = t00_a - t01_a; + float t06_a = t02_a * kMultipliers_N4_c1; + float t07_a = t03_a * kMultipliers_N4_c2; + + float t08_a = t04_a; + float t09_a = t05_a; + float t10_a = t06_a + t07_a; + float t11_a = t06_a - t07_a; + + float t12_a = t08_a; + float t13_a = t09_a; + float t14_a = t10_a * sqrt2 + t11_a; + float t15_a = t11_a; + + float tmp8_out1 = t14; + float tmp8_out2 = t05; + float tmp8_out3 = t15; + float tmp8_out4 = t12_a * sqrt2 + t14_a; + float tmp8_out5 = t14_a + t13_a; + float tmp8_out6 = t13_a + t15_a; + float tmp8_out7 = t15_a; + + out[i * 8 + 0] = t04; + out[i * 8 + 1] = tmp8_out4; + out[i * 8 + 2] = tmp8_out1; + out[i * 8 + 3] = tmp8_out5; + out[i * 8 + 4] = t05; + out[i * 8 + 5] = tmp8_out6; + out[i * 8 + 6] = tmp8_out3; + out[i * 8 + 7] = tmp8_out7; + } +} + +void hls_TransposeBlock8(float 
in[64], float out[64]) { +#pragma HLS INLINE off +loop_transposeBlock8: + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS pipeline II = 1 + float mul = 1.0f / 8.0f; + out[n * 8 + m] = mul * in[m * 8 + n]; + } + } +} + +void dct8_block(float in[1024], float out[1024]) { +#pragma HLS INLINE off + float kMultipliers_N8_c1 = 0.5097955791041592; + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + float sqrt2 = 1.4142135623730951f; + +loop_dct_block: + for (ap_uint<8> by = 0; by < 4; by++) { + for (ap_uint<8> bx = 0; bx < 4; bx++) { + for (ap_uint<8> x = 0; x < 8; x++) { +#pragma HLS DEPENDENCE variable = in inter false +#pragma HLS DEPENDENCE variable = out inter false +#pragma HLS LOOP_FLATTEN off +#pragma HLS pipeline + int addr = 8 * x + bx * 64 + by * 256; + + float mem_0 = in[addr + 0]; + float mem_1 = in[addr + 1]; + float mem_2 = in[addr + 2]; + float mem_3 = in[addr + 3]; + float mem_4 = in[addr + 4]; + float mem_5 = in[addr + 5]; + float mem_6 = in[addr + 6]; + float mem_7 = in[addr + 7]; + + float tmp8_0 = mem_0 + mem_7; + float tmp8_1 = mem_1 + mem_6; + float tmp8_2 = mem_2 + mem_5; + float tmp8_3 = mem_3 + mem_4; + float tmp8_4 = mem_0 - mem_7; + float tmp8_5 = mem_1 - mem_6; + float tmp8_6 = mem_2 - mem_5; + float tmp8_7 = mem_3 - mem_4; + + float t00 = tmp8_0 + tmp8_3; + float t01 = tmp8_1 + tmp8_2; + float t02 = tmp8_0 - tmp8_3; + float t03 = tmp8_1 - tmp8_2; + + float t16 = tmp8_4 * kMultipliers_N8_c1; + float t17 = tmp8_5 * kMultipliers_N8_c2; + float t18 = tmp8_6 * kMultipliers_N8_c3; + float t19 = tmp8_7 * kMultipliers_N8_c4; + + // tmp 0~3 + float t04 = t00 + t01; + float t05 = t00 - t01; + float t06 = t02 * kMultipliers_N4_c1; + float t07 = t03 * kMultipliers_N4_c2; + + float t08 = t04; + float t09 = t05; + float t10 = t06 + t07; + 
float t11 = t06 - t07; + + float t12 = t08; + float t13 = t09; + float t14 = t10 * sqrt2 + t11; + float t15 = t11; + // tmp 4~7 + float t00_a = t16 + t19; + float t01_a = t17 + t18; + float t02_a = t16 - t19; + float t03_a = t17 - t18; + + float t04_a = t00_a + t01_a; + float t05_a = t00_a - t01_a; + float t06_a = t02_a * kMultipliers_N4_c1; + float t07_a = t03_a * kMultipliers_N4_c2; + + float t08_a = t04_a; + float t09_a = t05_a; + float t10_a = t06_a + t07_a; + float t11_a = t06_a - t07_a; + + float t12_a = t08_a; + float t13_a = t09_a; + float t14_a = t10_a * sqrt2 + t11_a; + float t15_a = t11_a; + + float tmp8_out0 = t12; + float tmp8_out1 = t14; + float tmp8_out2 = t13; + float tmp8_out3 = t15; + float tmp8_out4 = t12_a * sqrt2 + t14_a; + float tmp8_out5 = t14_a + t13_a; + float tmp8_out6 = t13_a + t15_a; + float tmp8_out7 = t15_a; + + out[addr + 0] = tmp8_out0; + out[addr + 1] = tmp8_out4; + out[addr + 2] = tmp8_out1; + out[addr + 3] = tmp8_out5; + out[addr + 4] = tmp8_out2; + out[addr + 5] = tmp8_out6; + out[addr + 6] = tmp8_out3; + out[addr + 7] = tmp8_out7; + } + } + } +} + +void hls_TransposeBlock_dct8(float in[1024], float out[1024]) { +#pragma HLS INLINE off + for (ap_uint<8> by = 0; by < 4; by++) { + for (ap_uint<8> bx = 0; bx < 4; bx++) { + for (ap_uint<8> y = 0; y < 8; y++) { + for (ap_uint<8> x = 0; x < 8; x++) { +#pragma HLS pipeline II = 1 + ap_uint<10> addr_i, addr_o; + addr_i(9, 8) = by(1, 0); + addr_i(7, 5) = x(2, 0); + addr_i(4, 3) = bx(1, 0); + addr_i(2, 0) = y(2, 0); + addr_o(9, 8) = by(1, 0); + addr_o(7, 5) = y(2, 0); + addr_o(4, 3) = bx(1, 0); + addr_o(2, 0) = x(2, 0); + float mul = 1.0f / 8.0f; + out[addr_o] = mul * in[addr_i]; + } + } + } + } +} + +void split_ac_dc_dct8(float in[64], float to_ac[64], float to_dc[1]) { +#pragma HLS INLINE off + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + to_ac[8 * m + n] = in[8 * m + n]; + if (m == 0 && n == 0) { + to_dc[0] = in[0]; + } + } + } +} + +void 
feed_ac_dct8(uint32_t x8, + uint32_t y8, + hls::stream& stream_recty, + hls::stream& stream_rectx, + float in[64], + hls::stream& ac_coef8x8_stream) { +#pragma HLS INLINE off + uint8_t rect_xsize; + uint8_t rect_ysize; +hls_feed_b64: + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + if (m == 0 && n == 0) { + rect_xsize = stream_rectx.read(); + rect_ysize = stream_recty.read(); + } + if (x8 < rect_xsize && y8 < rect_ysize) { + ac_coef8x8_stream.write(in[m * 8 + n]); + } + } + } +} + +void feed_dc_dct8(uint32_t x8, + uint32_t y8, + hls::stream& stream_recty, + hls::stream& stream_rectx, + float in[1], + hls::stream& dc_coef8x8_stream) { +#pragma HLS INLINE off + uint8_t rect_xsize; + uint8_t rect_ysize; +hls_feed_b64: + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + if (m == 0 && n == 0) { + rect_xsize = stream_rectx.read(); + rect_ysize = stream_recty.read(); + } + if (x8 < rect_xsize && y8 < rect_ysize) { + if (m == 0 && n == 0) { + dc_coef8x8_stream.write(in[0]); + } + } + } + } +} + +void hls_dct8x8_module(unsigned ysize, + unsigned xsize, + hls::stream& stream_recty8, + hls::stream& stream_rectx8, + hls::stream& stream_recty8_1, + hls::stream& stream_rectx8_1, + hls::stream& opsin8x8_stream, + hls::stream& ac_coef8x8_stream, + hls::stream& dc_coef8x8_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + +loop_dct8_all: + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + int tx1 = x64; + int ty1 = y64; + int by = ty1 * 8; + int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + int bx = tx1 * 8; + int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + int rect_ysize = by1 - by; + int rect_xsize = bx1 - bx; + 
loop_dct8_tile: + for (uint32_t y8 = 0; y8 < 8; y8++) { + for (uint32_t x8 = 0; x8 < 8; x8++) { + for (int c = 0; c < 3; c++) { +#pragma HLS DATAFLOW + float temp0[64]; +#pragma HLS bind_storage variable = temp0 type = ram_2p impl = bram + float temp1[64]; +#pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram + float temp2[64]; +#pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram + float temp3[64]; +#pragma HLS bind_storage variable = temp3 type = ram_2p impl = bram + float temp4[64]; +#pragma HLS bind_storage variable = temp4 type = ram_2p impl = bram + float to_ac[64]; +#pragma HLS bind_storage variable = to_ac type = ram_2p impl = bram + float to_dc[1]; +#pragma HLS bind_storage variable = to_dc type = ram_2p impl = bram + + load_b64: + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + int addr = n * 8 + m; + temp0[addr] = opsin8x8_stream.read(); + } + } + + hls_DCT1DImpl_8x8(temp0, temp1); + hls_TransposeBlock8(temp1, temp2); + hls_DCT1DImpl_8x8(temp2, temp3); + hls_TransposeBlock8(temp3, temp4); + split_ac_dc_dct8(temp4, to_ac, to_dc); + feed_ac_dct8(x8, y8, stream_recty8, stream_rectx8, to_ac, ac_coef8x8_stream); + feed_dc_dct8(x8, y8, stream_recty8_1, stream_rectx8_1, to_dc, dc_coef8x8_stream); + } + } + } + } + } +} + +void hls_DCT1DImpl_16(float in[256], float out[256]) { +#pragma HLS INLINE off + float kMultipliers_N8_c1 = 0.5097955791041592; + + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + + float kMultipliers_N16_0 = 0.5024192861881557; + float kMultipliers_N16_1 = 0.5224986149396889; + float kMultipliers_N16_2 = 0.5669440348163577; + float kMultipliers_N16_3 = 0.6468217833599901; + float kMultipliers_N16_4 = 0.7881546234512502; + float kMultipliers_N16_5 = 1.060677685990347; + float 
kMultipliers_N16_6 = 1.7224470982383342; + float kMultipliers_N16_7 = 5.101148618689155; + float sqrt2 = 1.4142135623730951f; + + float tmp16_0 = in[0] + in[15]; + float tmp16_1 = in[1] + in[14]; + float tmp16_2 = in[2] + in[13]; + float tmp16_3 = in[3] + in[12]; + float tmp16_4 = in[4] + in[11]; + float tmp16_5 = in[5] + in[10]; + float tmp16_6 = in[6] + in[9]; + float tmp16_7 = in[7] + in[8]; + float tmp16_8 = in[0] - in[15]; + float tmp16_9 = in[1] - in[14]; + float tmp16_10 = in[2] - in[13]; + float tmp16_11 = in[3] - in[12]; + float tmp16_12 = in[4] - in[11]; + float tmp16_13 = in[5] - in[10]; + float tmp16_14 = in[6] - in[9]; + float tmp16_15 = in[7] - in[8]; + + float tmp8_0 = tmp16_0 + tmp16_7; + float tmp8_1 = tmp16_1 + tmp16_6; + float tmp8_2 = tmp16_2 + tmp16_5; + float tmp8_3 = tmp16_3 + tmp16_4; + float tmp8_4 = tmp16_0 - tmp16_7; + float tmp8_5 = tmp16_1 - tmp16_6; + float tmp8_6 = tmp16_2 - tmp16_5; + float tmp8_7 = tmp16_3 - tmp16_4; + + float t00 = tmp8_0 + tmp8_3; + float t01 = tmp8_1 + tmp8_2; + float t02 = tmp8_0 - tmp8_3; + float t03 = tmp8_1 - tmp8_2; + float t04 = tmp8_4 * kMultipliers_N8_c1; + float t05 = tmp8_5 * kMultipliers_N8_c2; + float t06 = tmp8_6 * kMultipliers_N8_c3; + float t07 = tmp8_7 * kMultipliers_N8_c4; + + float t08 = t02 * kMultipliers_N4_c1; + float t09 = t03 * kMultipliers_N4_c2; + float t10 = t04 + t07; + float t11 = t05 + t06; + float t12 = t04 - t07; + float t13 = t05 - t06; + + float t14 = t08 + t09; + float t15 = t10 + t11; + float t16 = t08 - t09; + float t17 = t10 - t11; + + float t18 = t12 * kMultipliers_N4_c1; + float t19 = t13 * kMultipliers_N4_c2; + float t20 = t14 * sqrt2; + float t21 = t15 * sqrt2; + + float t22 = t18 + t19; + float t23 = t18 - t19; + + float t24 = t22 * sqrt2; + + float t25 = t24 + t23; + + float t26 = kMultipliers_N16_0 * tmp16_8; + float t27 = kMultipliers_N16_1 * tmp16_9; + float t28 = kMultipliers_N16_2 * tmp16_10; + float t29 = kMultipliers_N16_3 * tmp16_11; + float t30 = 
kMultipliers_N16_4 * tmp16_12; + float t31 = kMultipliers_N16_5 * tmp16_13; + float t32 = kMultipliers_N16_6 * tmp16_14; + float t33 = kMultipliers_N16_7 * tmp16_15; + + float dmp8_0 = t26 + t33; + float dmp8_1 = t27 + t32; + float dmp8_2 = t28 + t31; + float dmp8_3 = t29 + t30; + float dmp8_4 = t26 - t33; + float dmp8_5 = t27 - t32; + float dmp8_6 = t28 - t31; + float dmp8_7 = t29 - t30; + + float d00 = dmp8_0 + dmp8_3; + float d01 = dmp8_1 + dmp8_2; + float d02 = dmp8_0 - dmp8_3; + float d03 = dmp8_1 - dmp8_2; + float d04 = dmp8_4 * kMultipliers_N8_c1; + float d05 = dmp8_5 * kMultipliers_N8_c2; + float d06 = dmp8_6 * kMultipliers_N8_c3; + float d07 = dmp8_7 * kMultipliers_N8_c4; + + float d08 = d02 * kMultipliers_N4_c1; + float d09 = d03 * kMultipliers_N4_c2; + float d10 = d04 + d07; + float d11 = d05 + d06; + float d12 = d04 - d07; + float d13 = d05 - d06; + + float d14 = d08 + d09; + float d15 = d10 + d11; + float d16 = d08 - d09; + float d17 = d10 - d11; + + float d18 = d12 * kMultipliers_N4_c1; + float d19 = d13 * kMultipliers_N4_c2; + float d20 = d14 * sqrt2; + float d21 = d15 * sqrt2; + + float d22 = d18 + d19; + float d23 = d18 - d19; + + float d24 = d22 * sqrt2; + + float d25 = d24 + d23; + + float d26 = d00 + d01; + float d27 = d21 + d25; + float d28 = d20 + d16; + float d29 = d25 + d17; + float d30 = d00 - d01; + float d31 = d17 + d23; + float d32 = d26 * sqrt2; + + out[0] = t00 + t01; + out[1] = d32 + d27; + out[2] = t21 + t25; + out[3] = d27 + d28; + out[4] = t20 + t16; + out[5] = d28 + d29; + out[6] = t25 + t17; + out[7] = d29 + d30; + out[8] = t00 - t01; + out[9] = d30 + d31; + out[10] = t17 + t23; + out[11] = d31 + d16; + out[12] = t16; + out[13] = d16 + d23; + out[14] = t23; + out[15] = d23; +} + +void hls_dct16_block(float in[256], float out[256]) { +#pragma HLS INLINE off + float kMultipliers_N8_c1 = 0.5097955791041592; + + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 
= 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + + float kMultipliers_N16_0 = 0.5024192861881557; + float kMultipliers_N16_1 = 0.5224986149396889; + float kMultipliers_N16_2 = 0.5669440348163577; + float kMultipliers_N16_3 = 0.6468217833599901; + float kMultipliers_N16_4 = 0.7881546234512502; + float kMultipliers_N16_5 = 1.060677685990347; + float kMultipliers_N16_6 = 1.7224470982383342; + float kMultipliers_N16_7 = 5.101148618689155; + float sqrt2 = 1.4142135623730951f; + + for (int i = 0; i < 16; i++) { +#pragma HLS DEPENDENCE variable = in inter false +#pragma HLS DEPENDENCE variable = out inter false +#pragma HLS LOOP_FLATTEN off +#pragma HLS pipeline II = 11 + float tmp16_0 = in[16 * i + 0] + in[16 * i + 15]; + float tmp16_1 = in[16 * i + 1] + in[16 * i + 14]; + float tmp16_2 = in[16 * i + 2] + in[16 * i + 13]; + float tmp16_3 = in[16 * i + 3] + in[16 * i + 12]; + float tmp16_4 = in[16 * i + 4] + in[16 * i + 11]; + float tmp16_5 = in[16 * i + 5] + in[16 * i + 10]; + float tmp16_6 = in[16 * i + 6] + in[16 * i + 9]; + float tmp16_7 = in[16 * i + 7] + in[16 * i + 8]; + float tmp16_8 = in[16 * i + 0] - in[16 * i + 15]; + float tmp16_9 = in[16 * i + 1] - in[16 * i + 14]; + float tmp16_10 = in[16 * i + 2] - in[16 * i + 13]; + float tmp16_11 = in[16 * i + 3] - in[16 * i + 12]; + float tmp16_12 = in[16 * i + 4] - in[16 * i + 11]; + float tmp16_13 = in[16 * i + 5] - in[16 * i + 10]; + float tmp16_14 = in[16 * i + 6] - in[16 * i + 9]; + float tmp16_15 = in[16 * i + 7] - in[16 * i + 8]; + + float tmp8_0 = tmp16_0 + tmp16_7; + float tmp8_1 = tmp16_1 + tmp16_6; + float tmp8_2 = tmp16_2 + tmp16_5; + float tmp8_3 = tmp16_3 + tmp16_4; + float tmp8_4 = tmp16_0 - tmp16_7; + float tmp8_5 = tmp16_1 - tmp16_6; + float tmp8_6 = tmp16_2 - tmp16_5; + float tmp8_7 = tmp16_3 - tmp16_4; + + float t00 = tmp8_0 + tmp8_3; + float t01 = tmp8_1 + tmp8_2; + float t02 = tmp8_0 - tmp8_3; + float t03 = tmp8_1 - tmp8_2; + 
float t04 = tmp8_4 * kMultipliers_N8_c1; + float t05 = tmp8_5 * kMultipliers_N8_c2; + float t06 = tmp8_6 * kMultipliers_N8_c3; + float t07 = tmp8_7 * kMultipliers_N8_c4; + + float t08 = t02 * kMultipliers_N4_c1; + float t09 = t03 * kMultipliers_N4_c2; + float t10 = t04 + t07; + float t11 = t05 + t06; + float t12 = t04 - t07; + float t13 = t05 - t06; + + float t14 = t08 + t09; + float t15 = t10 + t11; + float t16 = t08 - t09; + float t17 = t10 - t11; + + float t18 = t12 * kMultipliers_N4_c1; + float t19 = t13 * kMultipliers_N4_c2; + float t20 = t14 * sqrt2; + float t21 = t15 * sqrt2; + + float t22 = t18 + t19; + float t23 = t18 - t19; + + float t24 = t22 * sqrt2; + + float t25 = t24 + t23; + + float t26 = kMultipliers_N16_0 * tmp16_8; + float t27 = kMultipliers_N16_1 * tmp16_9; + float t28 = kMultipliers_N16_2 * tmp16_10; + float t29 = kMultipliers_N16_3 * tmp16_11; + float t30 = kMultipliers_N16_4 * tmp16_12; + float t31 = kMultipliers_N16_5 * tmp16_13; + float t32 = kMultipliers_N16_6 * tmp16_14; + float t33 = kMultipliers_N16_7 * tmp16_15; + + float dmp8_0 = t26 + t33; + float dmp8_1 = t27 + t32; + float dmp8_2 = t28 + t31; + float dmp8_3 = t29 + t30; + float dmp8_4 = t26 - t33; + float dmp8_5 = t27 - t32; + float dmp8_6 = t28 - t31; + float dmp8_7 = t29 - t30; + + float d00 = dmp8_0 + dmp8_3; + float d01 = dmp8_1 + dmp8_2; + float d02 = dmp8_0 - dmp8_3; + float d03 = dmp8_1 - dmp8_2; + float d04 = dmp8_4 * kMultipliers_N8_c1; + float d05 = dmp8_5 * kMultipliers_N8_c2; + float d06 = dmp8_6 * kMultipliers_N8_c3; + float d07 = dmp8_7 * kMultipliers_N8_c4; + + float d08 = d02 * kMultipliers_N4_c1; + float d09 = d03 * kMultipliers_N4_c2; + float d10 = d04 + d07; + float d11 = d05 + d06; + float d12 = d04 - d07; + float d13 = d05 - d06; + + float d14 = d08 + d09; + float d15 = d10 + d11; + float d16 = d08 - d09; + float d17 = d10 - d11; + + float d18 = d12 * kMultipliers_N4_c1; + float d19 = d13 * kMultipliers_N4_c2; + float d20 = d14 * sqrt2; + float d21 = d15 * 
sqrt2; + + float d22 = d18 + d19; + float d23 = d18 - d19; + + float d24 = d22 * sqrt2; + + float d25 = d24 + d23; + + float d26 = d00 + d01; + float d27 = d21 + d25; + float d28 = d20 + d16; + float d29 = d25 + d17; + float d30 = d00 - d01; + float d31 = d17 + d23; + float d32 = d26 * sqrt2; + + out[16 * i + 0] = t00 + t01; + out[16 * i + 1] = d32 + d27; + out[16 * i + 2] = t21 + t25; + out[16 * i + 3] = d27 + d28; + out[16 * i + 4] = t20 + t16; + out[16 * i + 5] = d28 + d29; + out[16 * i + 6] = t25 + t17; + out[16 * i + 7] = d29 + d30; + out[16 * i + 8] = t00 - t01; + out[16 * i + 9] = d30 + d31; + out[16 * i + 10] = t17 + t23; + out[16 * i + 11] = d31 + d16; + out[16 * i + 12] = t16; + out[16 * i + 13] = d16 + d23; + out[16 * i + 14] = t23; + out[16 * i + 15] = d23; + } +} + +void hls_ReinterpretingIDCT16(float input[4], float output[4]) { +#pragma HLS INLINE off + float resample = 0.901764214038848876953125; + + float t0 = input[0]; + float t1 = input[1] * resample; + float t2 = input[2] * resample; + float t3 = input[3] * resample * resample; + + float t4 = t0 + t2; + float t5 = t1 + t3; + float t6 = t0 - t2; + float t7 = t1 - t3; + + float t8 = t4; + float t9 = t6; + float t10 = t5; + float t11 = t7; + + output[0] = t8 + t10; + output[2] = t8 - t10; + output[1] = t9 + t11; + output[3] = t9 - t11; +} + +// dct 16x16 +void load_dct16(float in[256], hls::stream& opsin16x16_stream) { +#pragma HLS INLINE off + for (int y8 = 0; y8 < 2; y8++) { + for (int x8 = 0; x8 < 2; x8++) { + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { +#pragma HLS PIPELINE II = 1 + int addr = y8 * 16 * 8 + x8 * 8 + m * 16 + n; + in[addr] = opsin16x16_stream.read(); + } + } + } + } +} + +void transposeDct16(float in[256], float out[256]) { +#pragma HLS INLINE off + for (int i = 0; i < 16; i++) { + for (int j = 0; j < 16; j++) { +#pragma HLS PIPELINE II = 1 + out[j * 16 + i] = in[i * 16 + j]; + } + } +} + +void transposeDct16_scale(float in[256], float out[256]) { +#pragma HLS 
INLINE off + for (int i = 0; i < 16; i++) { + for (int j = 0; j < 16; j++) { +#pragma HLS PIPELINE II = 1 + float mul = 1.0f / 16.0f; + out[j * 16 + i] = mul * in[i * 16 + j]; + } + } +} + +void dct16_ac_writeout(float to_ac[256], + hls::stream& stream_recty, + hls::stream& stream_rectx, + hls::stream& ac_coef16x16_stream, + uint32_t x16, + uint32_t y16) { +#pragma HLS INLINE off + uint8_t rect_xsize; + uint8_t rect_ysize; + for (int m = 0; m < 256; m++) { +#pragma HLS PIPELINE II = 1 + if (m == 0) { + rect_xsize = stream_rectx.read(); + rect_ysize = stream_recty.read(); + } + if ((2 * x16 + 1) < rect_xsize && (2 * y16 + 1) < rect_ysize) { + ac_coef16x16_stream.write(to_ac[m]); + } + } +} + +void dct16_dc_writeout(float to_dc[4], + hls::stream& stream_recty, + hls::stream& stream_rectx, + hls::stream& dc_coef16x16_stream, + uint32_t x16, + uint32_t y16) { +#pragma HLS INLINE off + uint8_t rect_xsize; + uint8_t rect_ysize; + for (int m = 0; m < 4; m++) { +#pragma HLS PIPELINE II = 1 + if (m == 0) { + rect_xsize = stream_rectx.read(); + rect_ysize = stream_recty.read(); + } + if ((2 * x16 + 1) < rect_xsize && (2 * y16 + 1) < rect_ysize) { + dc_coef16x16_stream.write(to_dc[m]); + } + } +} + +void dct16_ac_dc_split(float in[256], float ac_out1[256], float dc_out[4]) { +#pragma HLS INLINE off + for (int i = 0; i < 256; i++) { +#pragma HLS PIPELINE II = 1 + ac_out1[i] = in[i]; + if (i == 0) + dc_out[0] = in[i]; + else if (i == 1) + dc_out[1] = in[i]; + else if (i == 16) + dc_out[2] = in[i]; + else if (i == 17) + dc_out[3] = in[i]; + } +} + +void hls_dct16x16_module(unsigned ysize, + unsigned xsize, + hls::stream& stream_recty16, + hls::stream& stream_rectx16, + hls::stream& stream_recty16_1, + hls::stream& stream_rectx16_1, + hls::stream& opsin16x16_stream, + hls::stream& ac_coef16x16_stream, + hls::stream& dc_coef16x16_stream) { +#pragma HLS INLINE off + + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + + uint32_t ysize64 = 
tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + uint32_t ysize16 = tile_ysize / 16; + uint32_t xsize16 = tile_xsize / 16; + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + int tx1 = x64; + int ty1 = y64; + int by = ty1 * 8; + int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + int bx = tx1 * 8; + int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + int rect_ysize = by1 - by; + int rect_xsize = bx1 - bx; + for (uint32_t y16 = 0; y16 < 4; y16++) { + for (uint32_t x16 = 0; x16 < 4; x16++) { + for (int c = 0; c < 3; c++) { +#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 +#pragma HLS DATAFLOW + float from[256]; +#pragma HLS bind_storage variable = from type = ram_2p impl = bram + float temp0[256]; +#pragma HLS bind_storage variable = temp0 type = ram_2p impl = bram + float temp1[256]; +#pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram + float temp2[256]; +#pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram + float temp3[256]; +#pragma HLS bind_storage variable = temp3 type = ram_2p impl = bram + float temp4[256]; +#pragma HLS bind_storage variable = temp4 type = ram_2p impl = bram + float to_ac[256]; +#pragma HLS bind_storage variable = to_ac type = ram_2p impl = bram + float to_dc[4]; +#pragma HLS bind_storage variable = to_dc type = ram_2p impl = bram + float dc_mem[4]; +#pragma HLS bind_storage variable = dc_mem type = ram_2p impl = bram + + dct16_test_load: + for (int m = 0; m < 16; m++) { + for (int n = 0; n < 16; n++) { +#pragma HLS PIPELINE II = 1 + int addr = 16 * m + n; + from[addr] = opsin16x16_stream.read(); + } + } + + transposeDct16(from, temp0); + hls_dct16_block(temp0, temp1); + transposeDct16_scale(temp1, temp2); + hls_dct16_block(temp2, temp3); + transposeDct16_scale(temp3, temp4); + dct16_ac_dc_split(temp4, to_ac, to_dc); + // output ac_coeff_stream + dct16_ac_writeout(to_ac, stream_recty16, 
stream_rectx16, ac_coef16x16_stream, x16, y16); + // output dc_coeff_stream + hls_ReinterpretingIDCT16(to_dc, dc_mem); + dct16_dc_writeout(dc_mem, stream_recty16_1, stream_rectx16_1, dc_coef16x16_stream, x16, y16); + } + } + } + } + } +} + +// template +void hls_DCT1DImpl_32(float in[1024], float out[1024]) { +#pragma HLS INLINE off + + float kMultipliers_N32_0 = 0.5006029982351963; + float kMultipliers_N32_1 = 0.5054709598975436; + float kMultipliers_N32_2 = 0.5154473099226246; + float kMultipliers_N32_3 = 0.531042591089784; + float kMultipliers_N32_4 = 0.553103896034444; + float kMultipliers_N32_5 = 0.5829349682061339; + float kMultipliers_N32_6 = 0.622504123035664; + float kMultipliers_N32_7 = 0.674808341455005; + float kMultipliers_N32_8 = 0.7445362710022986; + float kMultipliers_N32_9 = 0.839349645415526; + float kMultipliers_N32_10 = 0.9725682378619608; + float kMultipliers_N32_11 = 1.169439933432884; + float kMultipliers_N32_12 = 1.4841646163141662; + float kMultipliers_N32_13 = 2.057781009953411; + float kMultipliers_N32_14 = 3.407608418468719; + float kMultipliers_N32_15 = 10.19000812354803; + + float sqrt2 = 1.4142135623730951f; + + float kMultipliers_N8_c1 = 0.5097955791041592; + + float kMultipliers_N8_c2 = 0.6013448869350453; + float kMultipliers_N8_c3 = 0.8999762231364156; + float kMultipliers_N8_c4 = 2.5629154477415055; + float kMultipliers_N4_c1 = 0.541196100146197; + float kMultipliers_N4_c2 = 1.3065629648763764; + + float kMultipliers_N16_0 = 0.5024192861881557; + float kMultipliers_N16_1 = 0.5224986149396889; + float kMultipliers_N16_2 = 0.5669440348163577; + float kMultipliers_N16_3 = 0.6468217833599901; + float kMultipliers_N16_4 = 0.7881546234512502; + float kMultipliers_N16_5 = 1.060677685990347; + float kMultipliers_N16_6 = 1.7224470982383342; + float kMultipliers_N16_7 = 5.101148618689155; + + for (int i = 0; i < 32; i++) { +#pragma HLS PIPELINE II = 30 + float tmp32_b16_0 = in[0 + 32 * i] + in[31 + 32 * i]; + float tmp32_b16_1 = in[1 + 32 
* i] + in[30 + 32 * i]; + float tmp32_b16_2 = in[2 + 32 * i] + in[29 + 32 * i]; + float tmp32_b16_3 = in[3 + 32 * i] + in[28 + 32 * i]; + float tmp32_b16_4 = in[4 + 32 * i] + in[27 + 32 * i]; + float tmp32_b16_5 = in[5 + 32 * i] + in[26 + 32 * i]; + float tmp32_b16_6 = in[6 + 32 * i] + in[25 + 32 * i]; + float tmp32_b16_7 = in[7 + 32 * i] + in[24 + 32 * i]; + float tmp32_b16_8 = in[8 + 32 * i] + in[23 + 32 * i]; + float tmp32_b16_9 = in[9 + 32 * i] + in[22 + 32 * i]; + float tmp32_b16_10 = in[10 + 32 * i] + in[21 + 32 * i]; + float tmp32_b16_11 = in[11 + 32 * i] + in[20 + 32 * i]; + float tmp32_b16_12 = in[12 + 32 * i] + in[19 + 32 * i]; + float tmp32_b16_13 = in[13 + 32 * i] + in[18 + 32 * i]; + float tmp32_b16_14 = in[14 + 32 * i] + in[17 + 32 * i]; + float tmp32_b16_15 = in[15 + 32 * i] + in[16 + 32 * i]; + + float tmp16_0_b16 = tmp32_b16_0 + tmp32_b16_15; + float tmp16_1_b16 = tmp32_b16_1 + tmp32_b16_14; + float tmp16_2_b16 = tmp32_b16_2 + tmp32_b16_13; + float tmp16_3_b16 = tmp32_b16_3 + tmp32_b16_12; + float tmp16_4_b16 = tmp32_b16_4 + tmp32_b16_11; + float tmp16_5_b16 = tmp32_b16_5 + tmp32_b16_10; + float tmp16_6_b16 = tmp32_b16_6 + tmp32_b16_9; + float tmp16_7_b16 = tmp32_b16_7 + tmp32_b16_8; + float tmp16_8_b16 = tmp32_b16_0 - tmp32_b16_15; + float tmp16_9_b16 = tmp32_b16_1 - tmp32_b16_14; + float tmp16_10_b16 = tmp32_b16_2 - tmp32_b16_13; + float tmp16_11_b16 = tmp32_b16_3 - tmp32_b16_12; + float tmp16_12_b16 = tmp32_b16_4 - tmp32_b16_11; + float tmp16_13_b16 = tmp32_b16_5 - tmp32_b16_10; + float tmp16_14_b16 = tmp32_b16_6 - tmp32_b16_9; + float tmp16_15_b16 = tmp32_b16_7 - tmp32_b16_8; + + float tmp8_0_b16 = tmp16_0_b16 + tmp16_7_b16; + float tmp8_1_b16 = tmp16_1_b16 + tmp16_6_b16; + float tmp8_2_b16 = tmp16_2_b16 + tmp16_5_b16; + float tmp8_3_b16 = tmp16_3_b16 + tmp16_4_b16; + float tmp8_4_b16 = tmp16_0_b16 - tmp16_7_b16; + float tmp8_5_b16 = tmp16_1_b16 - tmp16_6_b16; + float tmp8_6_b16 = tmp16_2_b16 - tmp16_5_b16; + float tmp8_7_b16 = tmp16_3_b16 - 
tmp16_4_b16; + + float t00_b16 = tmp8_0_b16 + tmp8_3_b16; + float t01_b16 = tmp8_1_b16 + tmp8_2_b16; + float t02_b16 = tmp8_0_b16 - tmp8_3_b16; + float t03_b16 = tmp8_1_b16 - tmp8_2_b16; + float t04_b16 = tmp8_4_b16 * kMultipliers_N8_c1; + float t05_b16 = tmp8_5_b16 * kMultipliers_N8_c2; + float t06_b16 = tmp8_6_b16 * kMultipliers_N8_c3; + float t07_b16 = tmp8_7_b16 * kMultipliers_N8_c4; + + float t08_b16 = t02_b16 * kMultipliers_N4_c1; + float t09_b16 = t03_b16 * kMultipliers_N4_c2; + float t10_b16 = t04_b16 + t07_b16; + float t11_b16 = t05_b16 + t06_b16; + float t12_b16 = t04_b16 - t07_b16; + float t13_b16 = t05_b16 - t06_b16; + + float t14_b16 = t08_b16 + t09_b16; + float t15_b16 = t10_b16 + t11_b16; + float t16_b16 = t08_b16 - t09_b16; + float t17_b16 = t10_b16 - t11_b16; + + float t18_b16 = t12_b16 * kMultipliers_N4_c1; + float t19_b16 = t13_b16 * kMultipliers_N4_c2; + float t20_b16 = t14_b16 * sqrt2; + float t21_b16 = t15_b16 * sqrt2; + + float t22_b16 = t18_b16 + t19_b16; + float t23_b16 = t18_b16 - t19_b16; + + float t24_b16 = t22_b16 * sqrt2; + + float t25_b16 = t24_b16 + t23_b16; + + float t26_b16 = kMultipliers_N16_0 * tmp16_8_b16; + float t27_b16 = kMultipliers_N16_1 * tmp16_9_b16; + float t28_b16 = kMultipliers_N16_2 * tmp16_10_b16; + float t29_b16 = kMultipliers_N16_3 * tmp16_11_b16; + float t30_b16 = kMultipliers_N16_4 * tmp16_12_b16; + float t31_b16 = kMultipliers_N16_5 * tmp16_13_b16; + float t32_b16 = kMultipliers_N16_6 * tmp16_14_b16; + float t33_b16 = kMultipliers_N16_7 * tmp16_15_b16; + + float dmp8_0_b16 = t26_b16 + t33_b16; + float dmp8_1_b16 = t27_b16 + t32_b16; + float dmp8_2_b16 = t28_b16 + t31_b16; + float dmp8_3_b16 = t29_b16 + t30_b16; + float dmp8_4_b16 = t26_b16 - t33_b16; + float dmp8_5_b16 = t27_b16 - t32_b16; + float dmp8_6_b16 = t28_b16 - t31_b16; + float dmp8_7_b16 = t29_b16 - t30_b16; + + float d00_b16 = dmp8_0_b16 + dmp8_3_b16; + float d01_b16 = dmp8_1_b16 + dmp8_2_b16; + float d02_b16 = dmp8_0_b16 - dmp8_3_b16; + float d03_b16 
= dmp8_1_b16 - dmp8_2_b16; + float d04_b16 = dmp8_4_b16 * kMultipliers_N8_c1; + float d05_b16 = dmp8_5_b16 * kMultipliers_N8_c2; + float d06_b16 = dmp8_6_b16 * kMultipliers_N8_c3; + float d07_b16 = dmp8_7_b16 * kMultipliers_N8_c4; + + float d08_b16 = d02_b16 * kMultipliers_N4_c1; + float d09_b16 = d03_b16 * kMultipliers_N4_c2; + float d10_b16 = d04_b16 + d07_b16; + float d11_b16 = d05_b16 + d06_b16; + float d12_b16 = d04_b16 - d07_b16; + float d13_b16 = d05_b16 - d06_b16; + + float d14_b16 = d08_b16 + d09_b16; + float d15_b16 = d10_b16 + d11_b16; + float d16_b16 = d08_b16 - d09_b16; + float d17_b16 = d10_b16 - d11_b16; + + float d18_b16 = d12_b16 * kMultipliers_N4_c1; + float d19_b16 = d13_b16 * kMultipliers_N4_c2; + float d20_b16 = d14_b16 * sqrt2; + float d21_b16 = d15_b16 * sqrt2; + + float d22_b16 = d18_b16 + d19_b16; + float d23_b16 = d18_b16 - d19_b16; + + float d24_b16 = d22_b16 * sqrt2; + + float d25_b16 = d24_b16 + d23_b16; + + float d26_b16 = d00_b16 + d01_b16; + float d27_b16 = d21_b16 + d25_b16; + float d28_b16 = d20_b16 + d16_b16; + float d29_b16 = d25_b16 + d17_b16; + float d30_b16 = d00_b16 - d01_b16; + float d31_b16 = d17_b16 + d23_b16; + float d32_b16 = d26_b16 * sqrt2; + + float tmp32_b16_out1_0 = t00_b16 + t01_b16; + float tmp32_b16_out1_1 = d32_b16 + d27_b16; + float tmp32_b16_out1_2 = t21_b16 + t25_b16; + float tmp32_b16_out1_3 = d27_b16 + d28_b16; + float tmp32_b16_out1_4 = t20_b16 + t16_b16; + float tmp32_b16_out1_5 = d28_b16 + d29_b16; + float tmp32_b16_out1_6 = t25_b16 + t17_b16; + float tmp32_b16_out1_7 = d29_b16 + d30_b16; + float tmp32_b16_out1_8 = t00_b16 - t01_b16; + float tmp32_b16_out1_9 = d30_b16 + d31_b16; + float tmp32_b16_out1_10 = t17_b16 + t23_b16; + float tmp32_b16_out1_11 = d31_b16 + d16_b16; + float tmp32_b16_out1_12 = t16_b16; + float tmp32_b16_out1_13 = d16_b16 + d23_b16; + float tmp32_b16_out1_14 = t23_b16; + float tmp32_b16_out1_15 = d23_b16; + + float tmp32_b32_add_sub_16 = in[0 + 32 * i] - in[31 + 32 * i]; + float 
tmp32_b32_add_sub_17 = in[1 + 32 * i] - in[30 + 32 * i]; + float tmp32_b32_add_sub_18 = in[2 + 32 * i] - in[29 + 32 * i]; + float tmp32_b32_add_sub_19 = in[3 + 32 * i] - in[28 + 32 * i]; + float tmp32_b32_add_sub_20 = in[4 + 32 * i] - in[27 + 32 * i]; + float tmp32_b32_add_sub_21 = in[5 + 32 * i] - in[26 + 32 * i]; + float tmp32_b32_add_sub_22 = in[6 + 32 * i] - in[25 + 32 * i]; + float tmp32_b32_add_sub_23 = in[7 + 32 * i] - in[24 + 32 * i]; + float tmp32_b32_add_sub_24 = in[8 + 32 * i] - in[23 + 32 * i]; + float tmp32_b32_add_sub_25 = in[9 + 32 * i] - in[22 + 32 * i]; + float tmp32_b32_add_sub_26 = in[10 + 32 * i] - in[21 + 32 * i]; + float tmp32_b32_add_sub_27 = in[11 + 32 * i] - in[20 + 32 * i]; + float tmp32_b32_add_sub_28 = in[12 + 32 * i] - in[19 + 32 * i]; + float tmp32_b32_add_sub_29 = in[13 + 32 * i] - in[18 + 32 * i]; + float tmp32_b32_add_sub_30 = in[14 + 32 * i] - in[17 + 32 * i]; + float tmp32_b32_add_sub_31 = in[15 + 32 * i] - in[16 + 32 * i]; + + float tmp32_b32_mul_16 = tmp32_b32_add_sub_16 * kMultipliers_N32_0; + float tmp32_b32_mul_17 = tmp32_b32_add_sub_17 * kMultipliers_N32_1; + float tmp32_b32_mul_18 = tmp32_b32_add_sub_18 * kMultipliers_N32_2; + float tmp32_b32_mul_19 = tmp32_b32_add_sub_19 * kMultipliers_N32_3; + float tmp32_b32_mul_20 = tmp32_b32_add_sub_20 * kMultipliers_N32_4; + float tmp32_b32_mul_21 = tmp32_b32_add_sub_21 * kMultipliers_N32_5; + float tmp32_b32_mul_22 = tmp32_b32_add_sub_22 * kMultipliers_N32_6; + float tmp32_b32_mul_23 = tmp32_b32_add_sub_23 * kMultipliers_N32_7; + float tmp32_b32_mul_24 = tmp32_b32_add_sub_24 * kMultipliers_N32_8; + float tmp32_b32_mul_25 = tmp32_b32_add_sub_25 * kMultipliers_N32_9; + float tmp32_b32_mul_26 = tmp32_b32_add_sub_26 * kMultipliers_N32_10; + float tmp32_b32_mul_27 = tmp32_b32_add_sub_27 * kMultipliers_N32_11; + float tmp32_b32_mul_28 = tmp32_b32_add_sub_28 * kMultipliers_N32_12; + float tmp32_b32_mul_29 = tmp32_b32_add_sub_29 * kMultipliers_N32_13; + float tmp32_b32_mul_30 = 
tmp32_b32_add_sub_30 * kMultipliers_N32_14; + float tmp32_b32_mul_31 = tmp32_b32_add_sub_31 * kMultipliers_N32_15; + + float tmp16_0_b32 = tmp32_b32_mul_16 + tmp32_b32_mul_31; + float tmp16_1_b32 = tmp32_b32_mul_17 + tmp32_b32_mul_30; + float tmp16_2_b32 = tmp32_b32_mul_18 + tmp32_b32_mul_29; + float tmp16_3_b32 = tmp32_b32_mul_19 + tmp32_b32_mul_28; + float tmp16_4_b32 = tmp32_b32_mul_20 + tmp32_b32_mul_27; + float tmp16_5_b32 = tmp32_b32_mul_21 + tmp32_b32_mul_26; + float tmp16_6_b32 = tmp32_b32_mul_22 + tmp32_b32_mul_25; + float tmp16_7_b32 = tmp32_b32_mul_23 + tmp32_b32_mul_24; + float tmp16_8_b32 = tmp32_b32_mul_16 - tmp32_b32_mul_31; + float tmp16_9_b32 = tmp32_b32_mul_17 - tmp32_b32_mul_30; + float tmp16_10_b32 = tmp32_b32_mul_18 - tmp32_b32_mul_29; + float tmp16_11_b32 = tmp32_b32_mul_19 - tmp32_b32_mul_28; + float tmp16_12_b32 = tmp32_b32_mul_20 - tmp32_b32_mul_27; + float tmp16_13_b32 = tmp32_b32_mul_21 - tmp32_b32_mul_26; + float tmp16_14_b32 = tmp32_b32_mul_22 - tmp32_b32_mul_25; + float tmp16_15_b32 = tmp32_b32_mul_23 - tmp32_b32_mul_24; + + float tmp8_0_b32 = tmp16_0_b32 + tmp16_7_b32; + float tmp8_1_b32 = tmp16_1_b32 + tmp16_6_b32; + float tmp8_2_b32 = tmp16_2_b32 + tmp16_5_b32; + float tmp8_3_b32 = tmp16_3_b32 + tmp16_4_b32; + float tmp8_4_b32 = tmp16_0_b32 - tmp16_7_b32; + float tmp8_5_b32 = tmp16_1_b32 - tmp16_6_b32; + float tmp8_6_b32 = tmp16_2_b32 - tmp16_5_b32; + float tmp8_7_b32 = tmp16_3_b32 - tmp16_4_b32; + + float t00_b32 = tmp8_0_b32 + tmp8_3_b32; + float t01_b32 = tmp8_1_b32 + tmp8_2_b32; + float t02_b32 = tmp8_0_b32 - tmp8_3_b32; + float t03_b32 = tmp8_1_b32 - tmp8_2_b32; + float t04_b32 = tmp8_4_b32 * kMultipliers_N8_c1; + float t05_b32 = tmp8_5_b32 * kMultipliers_N8_c2; + float t06_b32 = tmp8_6_b32 * kMultipliers_N8_c3; + float t07_b32 = tmp8_7_b32 * kMultipliers_N8_c4; + + float t08_b32 = t02_b32 * kMultipliers_N4_c1; + float t09_b32 = t03_b32 * kMultipliers_N4_c2; + float t10_b32 = t04_b32 + t07_b32; + float t11_b32 = t05_b32 + 
t06_b32; + float t12_b32 = t04_b32 - t07_b32; + float t13_b32 = t05_b32 - t06_b32; + + float t14_b32 = t08_b32 + t09_b32; + float t15_b32 = t10_b32 + t11_b32; + float t16_b32 = t08_b32 - t09_b32; + float t17_b32 = t10_b32 - t11_b32; + + float t18_b32 = t12_b32 * kMultipliers_N4_c1; + float t19_b32 = t13_b32 * kMultipliers_N4_c2; + float t20_b32 = t14_b32 * sqrt2; + float t21_b32 = t15_b32 * sqrt2; + + float t22_b32 = t18_b32 + t19_b32; + float t23_b32 = t18_b32 - t19_b32; + + float t24_b32 = t22_b32 * sqrt2; + + float t25_b32 = t24_b32 + t23_b32; + + float t26_b32 = kMultipliers_N16_0 * tmp16_8_b32; + float t27_b32 = kMultipliers_N16_1 * tmp16_9_b32; + float t28_b32 = kMultipliers_N16_2 * tmp16_10_b32; + float t29_b32 = kMultipliers_N16_3 * tmp16_11_b32; + float t30_b32 = kMultipliers_N16_4 * tmp16_12_b32; + float t31_b32 = kMultipliers_N16_5 * tmp16_13_b32; + float t32_b32 = kMultipliers_N16_6 * tmp16_14_b32; + float t33_b32 = kMultipliers_N16_7 * tmp16_15_b32; + + float dmp8_0_b32 = t26_b32 + t33_b32; + float dmp8_1_b32 = t27_b32 + t32_b32; + float dmp8_2_b32 = t28_b32 + t31_b32; + float dmp8_3_b32 = t29_b32 + t30_b32; + float dmp8_4_b32 = t26_b32 - t33_b32; + float dmp8_5_b32 = t27_b32 - t32_b32; + float dmp8_6_b32 = t28_b32 - t31_b32; + float dmp8_7_b32 = t29_b32 - t30_b32; + + float d00_b32 = dmp8_0_b32 + dmp8_3_b32; + float d01_b32 = dmp8_1_b32 + dmp8_2_b32; + float d02_b32 = dmp8_0_b32 - dmp8_3_b32; + float d03_b32 = dmp8_1_b32 - dmp8_2_b32; + float d04_b32 = dmp8_4_b32 * kMultipliers_N8_c1; + float d05_b32 = dmp8_5_b32 * kMultipliers_N8_c2; + float d06_b32 = dmp8_6_b32 * kMultipliers_N8_c3; + float d07_b32 = dmp8_7_b32 * kMultipliers_N8_c4; + + float d08_b32 = d02_b32 * kMultipliers_N4_c1; + float d09_b32 = d03_b32 * kMultipliers_N4_c2; + float d10_b32 = d04_b32 + d07_b32; + float d11_b32 = d05_b32 + d06_b32; + float d12_b32 = d04_b32 - d07_b32; + float d13_b32 = d05_b32 - d06_b32; + + float d14_b32 = d08_b32 + d09_b32; + float d15_b32 = d10_b32 + d11_b32; 
+ float d16_b32 = d08_b32 - d09_b32; + float d17_b32 = d10_b32 - d11_b32; + + float d18_b32 = d12_b32 * kMultipliers_N4_c1; + float d19_b32 = d13_b32 * kMultipliers_N4_c2; + float d20_b32 = d14_b32 * sqrt2; + float d21_b32 = d15_b32 * sqrt2; + + float d22_b32 = d18_b32 + d19_b32; + float d23_b32 = d18_b32 - d19_b32; + + float d24_b32 = d22_b32 * sqrt2; + + float d25_b32 = d24_b32 + d23_b32; + + float d26_b32 = d00_b32 + d01_b32; + float d27_b32 = d21_b32 + d25_b32; + float d28_b32 = d20_b32 + d16_b32; + float d29_b32 = d25_b32 + d17_b32; + float d30_b32 = d00_b32 - d01_b32; + float d31_b32 = d17_b32 + d23_b32; + float d32_b32 = d26_b32 * sqrt2; + + float tmp32_b32_add_out2_16 = t00_b32 + t01_b32; + float tmp32_b32_add_out2_17 = d32_b32 + d27_b32; + float tmp32_b32_add_out2_18 = t21_b32 + t25_b32; + float tmp32_b32_add_out2_19 = d27_b32 + d28_b32; + float tmp32_b32_add_out2_20 = t20_b32 + t16_b32; + float tmp32_b32_add_out2_21 = d28_b32 + d29_b32; + float tmp32_b32_add_out2_22 = t25_b32 + t17_b32; + float tmp32_b32_add_out2_23 = d29_b32 + d30_b32; + float tmp32_b32_add_out2_24 = t00_b32 - t01_b32; + float tmp32_b32_add_out2_25 = d30_b32 + d31_b32; + float tmp32_b32_add_out2_26 = t17_b32 + t23_b32; + float tmp32_b32_add_out2_27 = d31_b32 + d16_b32; + float tmp32_b32_add_out2_28 = t16_b32; + float tmp32_b32_add_out2_29 = d16_b32 + d23_b32; + float tmp32_b32_add_out2_30 = t23_b32; + float tmp32_b32_add_out2_31 = d23_b32; + + float tmp32_b32_out2_16 = tmp32_b32_add_out2_16 * sqrt2 + tmp32_b32_add_out2_17; + float tmp32_b32_out2_17 = tmp32_b32_add_out2_17 + tmp32_b32_add_out2_18; + float tmp32_b32_out2_18 = tmp32_b32_add_out2_18 + tmp32_b32_add_out2_19; + float tmp32_b32_out2_19 = tmp32_b32_add_out2_19 + tmp32_b32_add_out2_20; + float tmp32_b32_out2_20 = tmp32_b32_add_out2_20 + tmp32_b32_add_out2_21; + float tmp32_b32_out2_21 = tmp32_b32_add_out2_21 + tmp32_b32_add_out2_22; + float tmp32_b32_out2_22 = tmp32_b32_add_out2_22 + tmp32_b32_add_out2_23; + float 
tmp32_b32_out2_23 = tmp32_b32_add_out2_23 + tmp32_b32_add_out2_24; + float tmp32_b32_out2_24 = tmp32_b32_add_out2_24 + tmp32_b32_add_out2_25; + float tmp32_b32_out2_25 = tmp32_b32_add_out2_25 + tmp32_b32_add_out2_26; + float tmp32_b32_out2_26 = tmp32_b32_add_out2_26 + tmp32_b32_add_out2_27; + float tmp32_b32_out2_27 = tmp32_b32_add_out2_27 + tmp32_b32_add_out2_28; + float tmp32_b32_out2_28 = tmp32_b32_add_out2_28 + tmp32_b32_add_out2_29; + float tmp32_b32_out2_29 = tmp32_b32_add_out2_29 + tmp32_b32_add_out2_30; + float tmp32_b32_out2_30 = tmp32_b32_add_out2_30 + tmp32_b32_add_out2_31; + float tmp32_b32_out2_31 = tmp32_b32_add_out2_31; + + out[0 + 32 * i] = tmp32_b16_out1_0; + out[2 + 32 * i] = tmp32_b16_out1_1; + out[4 + 32 * i] = tmp32_b16_out1_2; + out[6 + 32 * i] = tmp32_b16_out1_3; + out[8 + 32 * i] = tmp32_b16_out1_4; + out[10 + 32 * i] = tmp32_b16_out1_5; + out[12 + 32 * i] = tmp32_b16_out1_6; + out[14 + 32 * i] = tmp32_b16_out1_7; + out[16 + 32 * i] = tmp32_b16_out1_8; + out[18 + 32 * i] = tmp32_b16_out1_9; + out[20 + 32 * i] = tmp32_b16_out1_10; + out[22 + 32 * i] = tmp32_b16_out1_11; + out[24 + 32 * i] = tmp32_b16_out1_12; + out[26 + 32 * i] = tmp32_b16_out1_13; + out[28 + 32 * i] = tmp32_b16_out1_14; + out[30 + 32 * i] = tmp32_b16_out1_15; + + out[1 + 32 * i] = tmp32_b32_out2_16; + out[3 + 32 * i] = tmp32_b32_out2_17; + out[5 + 32 * i] = tmp32_b32_out2_18; + out[7 + 32 * i] = tmp32_b32_out2_19; + out[9 + 32 * i] = tmp32_b32_out2_20; + out[11 + 32 * i] = tmp32_b32_out2_21; + out[13 + 32 * i] = tmp32_b32_out2_22; + out[15 + 32 * i] = tmp32_b32_out2_23; + out[17 + 32 * i] = tmp32_b32_out2_24; + out[19 + 32 * i] = tmp32_b32_out2_25; + out[21 + 32 * i] = tmp32_b32_out2_26; + out[23 + 32 * i] = tmp32_b32_out2_27; + out[25 + 32 * i] = tmp32_b32_out2_28; + out[27 + 32 * i] = tmp32_b32_out2_29; + out[29 + 32 * i] = tmp32_b32_out2_30; + out[31 + 32 * i] = tmp32_b32_out2_31; + + // if (transpose_scale) { + // for (int i = 0; i < 32; i++) { + // for (int j = 0; j < 
32; j++) { + // #pragma HLS PIPELINE II = 1 + // float mul = 1.0f / 32.0f; + // out[32 * j + i] = out[i * 32 + j]; + // } + // } + // } + } +} + +void hls_IDCT1D_32(float from[16], float to[16]) { +#pragma HLS INLINE off + float IDCT_kMUltipliers_N4_0 = 0.541196100146197; + float IDCT_kMUltipliers_N4_1 = 1.3065629648763764; + float sqrt2 = 1.4142135623730951f; + + for (int i = 0; i < 4; i++) { +#pragma HLS PIPELINE II = 128 + float* from_addr = &from[i]; + float* to_addr = &to[i]; + + float tmp_IDCT_in_0 = from[i + 0]; + float tmp_IDCT_in_1 = from[i + 8]; + float tmp_IDCT_in_2 = from[i + 4]; + float tmp_IDCT_in_3 = from[i + 12]; + + float tmp_IDCT_add_0 = tmp_IDCT_in_0 + tmp_IDCT_in_1; + float in1_dct = tmp_IDCT_in_2 * sqrt2; + float tmp_IDCT_add_1 = tmp_IDCT_in_0 - tmp_IDCT_in_1; + float in2_dct = tmp_IDCT_in_3 + tmp_IDCT_in_2; + + float tmp_IDCT_add_2 = in1_dct + in2_dct; + float tmp_IDCT_add_3 = in1_dct - in2_dct; + + to[i + 0] = IDCT_kMUltipliers_N4_0 * tmp_IDCT_add_2 + tmp_IDCT_add_0; + to[i + 4] = IDCT_kMUltipliers_N4_1 * tmp_IDCT_add_3 + tmp_IDCT_add_1; + to[i + 8] = tmp_IDCT_add_1 - IDCT_kMUltipliers_N4_1 * tmp_IDCT_add_3; + to[i + 12] = tmp_IDCT_add_0 - IDCT_kMUltipliers_N4_0 * tmp_IDCT_add_2; + } +} + +void hls_idct32_scale_2d(float in[16], float out[16]) { +#pragma HLS INLINE off + +Loop_idct32_1: + for (int y = 0; y < 4; y++) { + Loop_idct32_2: + for (int x = 0; x < 4; x++) { +#pragma HLS PIPELINE + float resampley; + float resamplex; + if (x == 0) { + resamplex = 1; + } else if (x == 1) { + resamplex = 0.974886834621429443359375; + } else if (x == 2) { + resamplex = 0.901764214038848876953125; + } else if (x == 3) { + resamplex = 0.78705489635467529296875; + } + if (y == 0) { + resampley = 1; + } else if (y == 1) { + resampley = 0.974886834621429443359375; + } else if (y == 2) { + resampley = 0.901764214038848876953125; + } else if (y == 3) { + resampley = 0.78705489635467529296875; + } + out[y * 4 + x] = in[y * 4 + x] * resampley * resamplex; + } + } 
+} + +void hls_idct_transpose4x4(float in[16], float out[16]) { +#pragma HLS INLINE off +Loop_idct_transpose: + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { +#pragma HLS PIPELINE + out[i * 4 + j] = in[j * 4 + i]; + } + } +} + +// void hls_ReinterpretingIDCT32(float input[16], float output[16]) { +// #pragma HLS INLINE +// float temp0[16]; +// #pragma HLS bind_storage variable = temp0 type = ram_2p impl = bram +// float temp1[16]; +// #pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram +// float temp2[16]; +// #pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram +// hls_idct32_scale_2d(input, temp0); +// hls_IDCT1D_32(temp0, temp1); +// hls_idct_transpose4x4(temp1, temp2); +// hls_IDCT1D_32(temp2, output); +// } + +void load_dct32(hls::stream& opsin32x32_stream, float from[1024]) { +#pragma HLS INLINE off +load_dct32: + for (int m = 0; m < 32; m++) { + for (int n = 0; n < 32; n++) { +#pragma HLS PIPELINE II = 1 + int addr = n * 32 + m; // m * 32 + n; + from[addr] = opsin32x32_stream.read(); + } + } +} + +void transpose_dct32(float in[1024], float out[1024]) { +#pragma HLS INLINE off + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { +#pragma HLS PIPELINE II = 1 + out[32 * j + i] = in[i * 32 + j]; + } + } +} + +void transpose_scaled_dct32(float in[1024], float out[1024]) { +#pragma HLS INLINE off + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { +#pragma HLS PIPELINE II = 1 + float mul = 1.0f / 32.0f; + float temp = mul * in[i * 32 + j]; + out[32 * j + i] = temp; + } + } +} + +void split_ac_dc_dct32(float in[1024], float to_ac[1024], float to_dc[16]) { +#pragma HLS INLINE off + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { +#pragma HLS PIPELINE II = 1 + to_ac[32 * i + j] = in[32 * i + j]; + + if (j < 4 && i < 4) { + to_dc[i * 4 + j] = in[32 * i + j]; + } + } + } +} + +void scaled_dct32(float in[1024], float out[1024]) { +#pragma HLS INLINE off + for (int i = 0; i < 32; 
i++) { + for (int j = 0; j < 32; j++) { +#pragma HLS PIPELINE II = 1 + float mul = 1.0f / 32.0f; + out[32 * i + j] = mul * in[i * 32 + j]; + } + } +} + +void feed_dct32_ac(uint32_t x32, + uint32_t y32, + hls::stream& stream_rectx32, + hls::stream& stream_recty32, + float in[1024], + hls::stream& ac_coef32x32_stream) { +#pragma HLS INLINE off + uint32_t rect_xsize; + uint32_t rect_ysize; +loop_feed_dct32_ac: + for (int m = 0; m < 1024; m++) { +#pragma HLS PIPELINE II = 1 + if (m == 0) { + rect_xsize = stream_rectx32.read(); + rect_ysize = stream_recty32.read(); + } + if ((4 * x32 + 3) < rect_xsize && (4 * y32 + 3) < rect_ysize) { + // printf("feed ac %d %d %d\n", in[m], rect_xsize, rect_ysize); + ac_coef32x32_stream.write(in[m]); + } + } +} + +void feed_dct32_dc(uint32_t x32, + uint32_t y32, + hls::stream& stream_rectx32, + hls::stream& stream_recty32, + float dc_mem[16], + hls::stream& dc_coef32x32_stream) { +#pragma HLS INLINE off + uint32_t rect_xsize; + uint32_t rect_ysize; +loop_feed_dct32_dc: + for (int m = 0; m < 16; m++) { +#pragma HLS PIPELINE II = 1 + if (m == 0) { + rect_xsize = stream_rectx32.read(); + rect_ysize = stream_recty32.read(); + } + if ((4 * x32 + 3) < rect_xsize && (4 * y32 + 3) < rect_ysize) { + dc_coef32x32_stream.write(dc_mem[m]); + } + } +} + +void hls_dct32x32_module(unsigned ysize, + unsigned xsize, + hls::stream& stream_recty32, + hls::stream& stream_rectx32, + hls::stream& stream_recty32_1, + hls::stream& stream_rectx32_1, + hls::stream& opsin32x32_stream, + hls::stream& ac_coef32x32_stream, + hls::stream& dc_coef32x32_stream) { +#pragma HLS INLINE off + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + + int ysize64 = tile_ysize / 64; + int xsize64 = tile_xsize / 64; +// int xsize_blocks = xsize / 8; +// int ysize_blocks = ysize / 8; +loop_dct32_tile_y: + for (uint32_t y64 = 0; y64 < ysize64; y64++) { +#pragma HLS LOOP_TRIPCOUNT min = 1 max = 1 + loop_dct32_tile_x: + for (uint32_t x64 = 0; x64 < 
xsize64; x64++) { +#pragma HLS LOOP_TRIPCOUNT min = 1 max = 1 + // int tx1 = x64; + // int ty1 = y64; + // int by = ty1 * 8; + // int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + // int bx = tx1 * 8; + // int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + // int rect_ysize = by1 - by; + // int rect_xsize = bx1 - bx; + loop_dct32_y: + for (uint32_t y32 = 0; y32 < 2; y32++) { + loop_dct32_x: + for (uint32_t x32 = 0; x32 < 2; x32++) { + loop_dct32_c: + for (int c = 0; c < 3; c++) { +#pragma HLS DATAFLOW + float from[1024]; +#pragma HLS bind_storage variable = from type = ram_2p impl = bram + float temp0[1024]; +#pragma HLS bind_storage variable = temp0 type = ram_2p impl = bram + float temp1[1024]; +#pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram + float temp2[1024]; +#pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram + float temp3[1024]; +#pragma HLS bind_storage variable = temp3 type = ram_2p impl = bram + float temp4[1024]; +#pragma HLS bind_storage variable = temp4 type = ram_2p impl = bram + float temp5[1024]; +#pragma HLS bind_storage variable = temp4 type = ram_2p impl = bram + float to_ac[1024]; +#pragma HLS bind_storage variable = to_ac type = ram_2p impl = bram + float to_dc[16]; +#pragma HLS bind_storage variable = to_dc type = ram_2p impl = bram + float dc_mem[16]; +#pragma HLS bind_storage variable = to_dc type = ram_2p impl = bram + float dc_temp0[16]; +#pragma HLS bind_storage variable = temp0 type = ram_2p impl = bram + float dc_temp1[16]; +#pragma HLS bind_storage variable = temp1 type = ram_2p impl = bram + float dc_temp2[16]; +#pragma HLS bind_storage variable = temp2 type = ram_2p impl = bram + + load_dct32(opsin32x32_stream, from); + hls_DCT1DImpl_32(from, temp1); + scaled_dct32(temp1, temp2); + transpose_dct32(temp2, temp3); + hls_DCT1DImpl_32(temp3, temp4); + transpose_scaled_dct32(temp4, temp5); + split_ac_dc_dct32(temp5, to_ac, to_dc); + // output ac_coeff dct32 + feed_dct32_ac(x32, y32, 
stream_rectx32, stream_recty32, to_ac, ac_coef32x32_stream); + // feed_dct32_ac(x32, y32, rect_xsize, rect_ysize, to_ac, ac_coef32x32_stream); + // ouput dc_coeff dct32 + hls_idct32_scale_2d(to_dc, dc_temp0); + hls_IDCT1D_32(dc_temp0, dc_temp1); + hls_idct_transpose4x4(dc_temp1, dc_temp2); + hls_IDCT1D_32(dc_temp2, dc_mem); + // feed_dct32_dc(x32, y32, rect_xsize, rect_ysize, dc_mem, dc_coef32x32_stream); + feed_dct32_dc(x32, y32, stream_rectx32_1, stream_recty32_1, dc_mem, dc_coef32x32_stream); + } + } + } + } + } + // printf("count1 = %d\n", count); +} + +int Div_Ceil2(int a, int b) { +#pragma HLS inline + return (a + b - 1) / b; +} + +void GetRectSizeDCT(short xsize, + short ysize, + hls::stream& stream_rectx_dct, + hls::stream& stream_recty_dct, + hls::stream& stream_rectx32, + hls::stream& stream_recty32, + hls::stream& stream_rectx32_1, + hls::stream& stream_recty32_1, + hls::stream& stream_rectx16, + hls::stream& stream_recty16, + hls::stream& stream_rectx16_1, + hls::stream& stream_recty16_1, + hls::stream& stream_rectx8, + hls::stream& stream_recty8, + hls::stream& stream_rectx8_1, + hls::stream& stream_recty8_1) { + uint16_t xsize_blocks = xsize / 8; + uint16_t ysize_blocks = ysize / 8; +LOOP_0: + for (uint16_t y = 0; y < Div_Ceil2(ysize_blocks, 8); y++) { + LOOP_1: + for (uint16_t x = 0; x < Div_Ceil2(xsize_blocks, 8); x++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + // uint16_t by = y * 8; + // uint16_t by1 = ((y + 1) * 8) < ysize_blocks ? ((y + 1) * 8) : ysize_blocks; + // uint16_t bx = x * 8; + // uint16_t bx1 = ((x + 1) * 8) < xsize_blocks ? 
((x + 1) * 8) : xsize_blocks; + // uint8_t rect_ysize = by1 - by; + // uint8_t rect_xsize = bx1 - bx; + uint8_t rect_ysize = stream_recty_dct.read(); + uint8_t rect_xsize = stream_rectx_dct.read(); + // printf("rect_xsize=%d, rect_ysize=%d\n", rect_xsize, rect_ysize); + for (int i = 0; i < 192; i++) { + if (i < 12) { + stream_rectx32.write(rect_xsize); + stream_recty32.write(rect_ysize); + stream_rectx32_1.write(rect_xsize); + stream_recty32_1.write(rect_ysize); + } + if (i < 48) { + stream_rectx16.write(rect_xsize); + stream_recty16.write(rect_ysize); + stream_rectx16_1.write(rect_xsize); + stream_recty16_1.write(rect_ysize); + } + stream_rectx8.write(rect_xsize); + stream_recty8.write(rect_ysize); + stream_rectx8_1.write(rect_xsize); + stream_recty8_1.write(rect_ysize); + } + } + } +} + +void hls_dct_top(unsigned ysize, + unsigned xsize, + hls::stream& stream_rectx_dct, + hls::stream& stream_recty_dct, + hls::stream& opsin8x8_stream, + hls::stream& opsin16x16_stream, + hls::stream& opsin32x32_stream, + hls::stream& ac_coef8x8_stream, + hls::stream& ac_coef16x16_stream, + hls::stream& ac_coef32x32_stream, + hls::stream& dc_coef8x8_stream, + hls::stream& dc_coef16x16_stream, + hls::stream& dc_coef32x32_stream) { + // #pragma HLS INLINE + hls::stream stream_rectx32; + hls::stream stream_recty32; + hls::stream stream_rectx32_1; + hls::stream stream_recty32_1; + hls::stream stream_rectx16; + hls::stream stream_recty16; + hls::stream stream_rectx16_1; + hls::stream stream_recty16_1; + hls::stream stream_rectx8; + hls::stream stream_recty8; + hls::stream stream_rectx8_1; + hls::stream stream_recty8_1; +// #pragma HLS DATAFLOW +#pragma HLS INLINE + GetRectSizeDCT(xsize, ysize, stream_rectx_dct, stream_recty_dct, stream_rectx32, stream_recty32, stream_rectx32_1, + stream_recty32_1, stream_rectx16, stream_recty16, stream_rectx16_1, stream_recty16_1, stream_rectx8, + stream_recty8, stream_rectx8_1, stream_recty8_1); + hls_dct8x8_module(ysize, xsize, stream_recty8, 
stream_rectx8, stream_recty8_1, stream_rectx8_1, opsin8x8_stream, + ac_coef8x8_stream, dc_coef8x8_stream); + hls_dct16x16_module(ysize, xsize, stream_recty16, stream_rectx16, stream_recty16_1, stream_rectx16_1, + opsin16x16_stream, ac_coef16x16_stream, dc_coef16x16_stream); + hls_dct32x32_module(ysize, xsize, stream_recty32, stream_rectx32, stream_recty32_1, stream_rectx32_1, + opsin32x32_stream, ac_coef32x32_stream, dc_coef32x32_stream); +} + +//-----------------------acs_heuristic---------------------// + +int Div_Ceil(int a, int b) { +#pragma HLS inline + return (a + b - 1) / b; +} + +float EvalRationalPolynomial3_2(float x, float p[3], float q[3]) { + float yp = p[2]; + float yq = q[2]; + yp = (yp * x) + p[1]; + yq = (yq * x) + q[1]; + yp = (yp * x) + p[0]; + yq = (yq * x) + q[0]; + return yp / yq; +} + +float FastLog2f_HLS2(float x) { + union { + float x_f; + int x_i; + } u = {x}; + float p[3] = {-1.8503833400518310E-06f, 1.4287160470083755E+00f, 7.4245873327820566E-01f}; + float q[3] = {9.9032814277590719E-01f, 1.0096718572241148E+00f, 1.7409343003366853E-01f}; + int x_bits = u.x_i; + int exp_bits = x_bits - 0x3f2aaaab; // = 2/3 + int exp_shifted = exp_bits >> 23; + int result0 = exp_shifted << 23; + int result = x_bits - result0; + u.x_i = result; + float mantissa = u.x_f; + float exp_val = static_cast(exp_shifted); + float output = EvalRationalPolynomial3_2(mantissa - 1.0f, p, q) + exp_val; + return output; +} + +float FastPow2f_HLS(float x) { + int floorx = floor(x); + int tmp = ((floorx + 127) << 23); + union { + float x_f; + int x_i; + } u; + u.x_i = tmp; + float exp = u.x_f; + float frac = x - floorx; + float num = frac + 1.01749063e+01; + num = num * frac + 4.88687798e+01; + num = num * frac + 9.85506591e+01; + num = num * exp; + float den = frac * 2.10242958e-01 - 2.22328856e-02; + den = den * frac - 1.94414990e+01; + den = den * frac + 9.85506633e+01; + return num / den; +} + +float FastPowf_HLS(float base, float exponent) { + return 
FastPow2f_HLS(FastLog2f_HLS2(base) * exponent); +} + +int CeilLog2NonzeroHLS(ap_int<32> x) { + int leading_zeros = x.countLeadingZeros(); + int floor_log2 = 63 ^ (leading_zeros + 32); + if ((x & (x - 1)) != 0) { + floor_log2 = floor_log2 + 1; + } + return floor_log2; +} + +void GetACSSize(short xsize, + short ysize, + hls::stream& stream_rectx_acs, + hls::stream& stream_recty_acs, + hls::stream& stream_rectx0, + hls::stream& stream_recty0, + hls::stream& stream_rectx1, + hls::stream& stream_recty1, + hls::stream& stream_rectx2, + hls::stream& stream_recty2, + hls::stream& stream_rectx3, + hls::stream& stream_recty3, + hls::stream& stream_rectx10, + hls::stream& stream_recty10) { + uint16_t xsize_blocks = xsize / 8; + uint16_t ysize_blocks = ysize / 8; +LOOP_0: + for (uint16_t y = 0; y < Div_Ceil(ysize_blocks, 8); y++) { + LOOP_1: + for (uint16_t x = 0; x < Div_Ceil(xsize_blocks, 8); x++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + // uint16_t by = y * 8; + // uint16_t by1 = ((y + 1) * 8) < ysize_blocks ? ((y + 1) * 8) : ysize_blocks; + // uint16_t bx = x * 8; + // uint16_t bx1 = ((x + 1) * 8) < xsize_blocks ? 
((x + 1) * 8) : xsize_blocks; + // uint8_t rect_ysize = by1 - by; + // uint8_t rect_xsize = bx1 - bx; + uint8_t rect_ysize = stream_recty_acs.read(); + uint8_t rect_xsize = stream_rectx_acs.read(); + stream_rectx0.write(rect_xsize); + stream_recty0.write(rect_ysize); + stream_rectx1.write(rect_xsize); + stream_recty1.write(rect_ysize); + stream_rectx2.write(rect_xsize); + stream_recty2.write(rect_ysize); + stream_rectx3.write(rect_xsize); + stream_recty3.write(rect_ysize); + stream_rectx10.write(rect_xsize); + stream_recty10.write(rect_ysize); + } + } +} + +void DupQuantAndMask(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_q_org_8, + hls::stream& stream_mask_org_8, + hls::stream& stream_q_org_16, + hls::stream& stream_mask_org_16, + hls::stream& stream_q_org_32, + hls::stream& stream_mask_org_32) { +DUP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + DUP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + DUP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + // do computation once for 16 and 32 + float tmp0 = stream_q_org.read(); + stream_q_org_8.write(tmp0); + tmp0 *= tmp0; + tmp0 *= tmp0; + tmp0 *= tmp0; + stream_q_org_16.write(tmp0); + stream_q_org_32.write(tmp0); + float tmp1 = stream_mask_org.read(); + stream_mask_org_8.write(tmp1); + stream_mask_org_16.write(tmp1); + stream_mask_org_32.write(tmp1); + } + } + } +} + +void GetQAndMask_8(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_q, + hls::stream& stream_mask) { +LOOP_0: + for (uint16_t tid = 
0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS pipeline II = 64 + float quant_norm8 = 0; + float masking = 0; + quant_norm8 = stream_q_org.read(); + stream_q.write(quant_norm8); + masking = 2.0f * stream_mask_org.read(); + stream_mask.write(masking); + } + } + } +} + +template +void GetQAndMask_16_32(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_q, + hls::stream& stream_mask) { + uint8_t block_n = N * N; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + float q_array[64]; +#pragma HLS BIND_STORAGE variable = q_array type = RAM_1P impl = bram + float mask_array[64]; +#pragma HLS BIND_STORAGE variable = mask_array type = RAM_1P impl = bram + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS loop_flatten off + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS pipeline II = 1 +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS loop_flatten off + int index = iy * 8 + ix; + q_array[index] = stream_q_org.read(); + mask_array[index] = stream_mask_org.read(); + } + } + LOOP_3: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS 
loop_flatten off + LOOP_4: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS loop_flatten off + float quant_norm8 = 0; + float masking = 0; + float masking_norm2 = 0; + float masking_max = 0; + LOOP_5: + for (uint8_t dy = 0; dy < N; dy++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS loop_flatten off + LOOP_6: + for (uint8_t dx = 0; dx < N; dx++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS loop_flatten off +#pragma HLS pipeline + uint8_t idx = (iy + dy) * 8 + ix + dx; + float qval = q_array[idx]; + quant_norm8 += qval; + float maskval = mask_array[idx]; + masking_max = fmax(masking_max, maskval); + masking_norm2 += maskval * maskval; + } + } + quant_norm8 /= block_n; + // Change: use 3 sqrtf to replace FastPowf_HLS, and try to only use on sqrtf to do all things + // float tmp = quant_norm8; + // quant_norm8 = sqrtf(quant_norm8); + // quant_norm8 = sqrtf(quant_norm8); + // quant_norm8 = sqrtf(quant_norm8); + LOOP_7: + for (int dx = 0; dx < 3; dx++) { +#pragma HLS pipeline + quant_norm8 = sqrtf(quant_norm8); + } + // quant_norm8 = FastPowf_HLS(quant_norm8, 1.0f / 8.0f); + masking_norm2 = sqrtf(masking_norm2 / block_n); + masking = masking_norm2 + masking_max; + stream_q.write(quant_norm8); + stream_mask.write(masking); + } + } + } +} + +template +void ComputeEntropy1(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_q, + hls::stream& stream_dctin, +#ifdef FIX + hls::stream >& stream_loss, + hls::stream >& stream_loss2, + hls::stream >& stream_entropy, + hls::stream >& stream_nzeros +#else + hls::stream& stream_loss, + hls::stream& stream_loss2, + hls::stream& stream_entropy, + hls::stream& stream_nzeros +#endif + ) { + uint8_t block_n = N * N; + int count_array; + float info_loss = 0.0; + float info_loss2 = 0.0; + float entropy = 0.0; + float zeros_mul = 
7.565053364251793f; + float cost2 = 4.4628149885273363f; + float cost_delta = 5.3359184934516337f; + float cmap_factor; + float q; + float entropy_v[3] = {0.0, 0.0, 0.0}; + float nzeros_v[3] = {0.0, 0.0, 0.0}; + float entropy_array[8]; + float info_loss_array[8]; + float info_loss2_array[8]; + float nzeros_array[8]; + float y_ram[1024]; + float cmap_factors_init[3] = {0.0f, 0.0f, 1.0f}; +#ifdef FIX + ap_int<23> info_loss_fix[8]; + ap_int<45> info_loss2_fix[8]; + ap_int<11> nzeros_fix[8]; + ap_int<32> y_fix_ram[1024]; + ap_int<32> cost2_fix = (int)(cost2 * 1024); + ap_int<32> cost_delta_fix = (int)(cost_delta * 1024); + ap_int<28> info_loss_sum; + ap_int<44> info_loss2_sum; + ap_int<11> nzeros_sum; + ap_int<42> entropy_sum; +#endif +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint16_t rect_ysize = stream_recty.read(); + uint16_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + float q_tmp[64]; + LOOP_1: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_3: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 + LOOP_4: + for (uint16_t i = 0; i < block_n * 64; i += 1) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 +#pragma HLS pipeline II = 1 + float in = stream_dctin.read(); +#ifdef FIX + if (i == 0) { + nzeros_sum = 0; + entropy_sum = 0; + if (c == 0) { + q = stream_q.read(); + info_loss_sum = 0; + info_loss2_sum = 0; + } + } + ap_int<30> in_fix = in * 0x1fffffff; // exp=29 + float in_fix_y_tmp; + if (c == 0) { + y_fix_ram[i] = in_fix; + in_fix_y_tmp = in_fix; + } else { + in_fix_y_tmp = y_fix_ram[i]; + } + ap_int<30> in_fix_y = (c == 2) ? 
in_fix_y_tmp : 0; + ap_int<31> in_fix_m = in_fix - in_fix_y; + + ap_uint<24> im_fix; + if (N == 1) { + im_fix = inv_matrix_8_fix[c][i]; // exp=10 + } + if (N == 2) { + im_fix = inv_matrix_16_fix[c][i]; + } + if (N == 4) { + im_fix = inv_matrix_32_fix[c][i]; + } + + ap_uint<15> rqf_fix = q * 32768; // exp=15 + ap_int<55> val_tmp0 = in_fix_m * im_fix; // exp=29+10=39 + ap_int<28> val_tmp1 = val_tmp0 >> 27; // exp=39-27=12 + ap_int<43> val_tmp2 = val_tmp1 * rqf_fix; // exp=12+15=27 + ap_int<35> val_fix = val_tmp2 >> 11; // exp=27-11=16 + + // actual value is not that large, so just reduce bitwidth + ap_int<11> val_shift0 = val_fix >> 15; + ap_int<10> val_shift1 = val_fix >> 16; + if (val_shift0.range(0, 0) == 1) { + val_shift1 += 1; + } + ap_int<10> rval_fix = val_shift1; // exp=0 + ap_int<32> val_shift_back = val_shift1 * 65536; // exp=16 + ap_uint<16> diff_fix = std::abs(val_shift_back - val_fix); // exp=-16 hls_abs? + ap_uint<32> diff_fix_square = diff_fix * diff_fix; // exp=-32 + ap_uint<10> q_fix = std::abs(rval_fix); // hls_abs? + bool q_fix_is_zero = q_fix == 0; + float entropy_tmp = (q_fix > 1 ? cost2 : 0.0f) + sqrtf(q_fix) * cost_delta; + ap_uint<32> entropy_fix = (uint32_t)(entropy_tmp * 65536); + + info_loss_sum += diff_fix; + info_loss2_sum += diff_fix_square; + nzeros_sum += q_fix_is_zero ? 
0 : 1; + entropy_sum += entropy_fix; + + if (i == block_n * 64 - 1) { + stream_entropy.write(entropy_sum); + stream_nzeros.write(nzeros_sum); + } + if (i == block_n * 64 - 1 && c == 2) { + stream_loss.write(info_loss_sum); + stream_loss2.write(info_loss2_sum); + } +#else + if (c == 0 && i == 0) { + q = stream_q.read(); + count_array = 0; + } + cmap_factor = cmap_factors_init[c]; + float in_y_tmp; + if (c == 0) { + y_ram[i] = in; + in_y_tmp = in; + } else { + in_y_tmp = y_ram[i]; + } + float in_y = in_y_tmp * cmap_factor; + float im; + if (N == 1) { + im = inv_matrix_8[c][i]; + } + if (N == 2) { + im = inv_matrix_16[c][i]; + } + if (N == 4) { + im = inv_matrix_32[c][i]; + } + const float val = (in - in_y) * im * q; + const int rval = roundf(val); + const float diff = fabs(val - rval); + + info_loss_array[count_array] = diff; + info_loss2_array[count_array] = diff * diff; + + const int q = abs(rval); + const bool q_is_zero = q == 0; + float tmp = (q >= 1.5f ? cost2 : 0.0f) + sqrtf(q) * cost_delta; + entropy_array[count_array] = tmp; + nzeros_array[count_array] = q_is_zero ? 
0.0f : 1.0f; + count_array++; + if (count_array == 8) { + float sum0 = entropy_array[0] + entropy_array[1] + entropy_array[2] + entropy_array[3] + + entropy_array[4] + entropy_array[5] + entropy_array[6] + entropy_array[7]; + stream_entropy.write(sum0); + float sum1 = nzeros_array[0] + nzeros_array[1] + nzeros_array[2] + nzeros_array[3] + + nzeros_array[4] + nzeros_array[5] + nzeros_array[6] + nzeros_array[7]; + stream_nzeros.write(sum1); + float sum2 = info_loss_array[0] + info_loss_array[1] + info_loss_array[2] + + info_loss_array[3] + info_loss_array[4] + info_loss_array[5] + + info_loss_array[6] + info_loss_array[7]; + stream_loss.write(sum2); + float sum3 = info_loss2_array[0] + info_loss2_array[1] + info_loss2_array[2] + + info_loss2_array[3] + info_loss2_array[4] + info_loss2_array[5] + + info_loss2_array[6] + info_loss2_array[7]; + stream_loss2.write(sum3); + count_array = 0; + } +#endif + } // loop i + } + } + } + } +} + +template +void ComputeEntropy2(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_loss, + hls::stream& stream_loss2, + hls::stream& stream_entropy, + hls::stream& stream_nzeros, + hls::stream& stream_loss_sum, + hls::stream& stream_loss2_sum, + hls::stream& stream_entropy_sum, + hls::stream& stream_nzeros_sum) { + float entropy_v[3]; + float nzeros_v[3]; + float info_loss; + float info_loss2; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + LOOP_1: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_3: + for (uint8_t c = 0; c < 3; c++) { 
+#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 + LOOP_4: + for (uint8_t i = 0; i < 64 * N * N / 8; i += 1) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS pipeline II = 8 + if (c == 0 && i == 0) { + info_loss = 0.0; + info_loss2 = 0.0; + } + if (i == 0) { + entropy_v[c] = 0; + nzeros_v[c] = 0; + } + entropy_v[c] += stream_entropy.read(); + nzeros_v[c] += stream_nzeros.read(); + info_loss += stream_loss.read(); + info_loss2 += stream_loss2.read(); + if (i == 64 * N * N / 8 - 1) { + stream_entropy_sum.write(entropy_v[c]); + stream_nzeros_sum.write(nzeros_v[c]); + } + if (c == 2 && i == 64 * N * N / 8 - 1) { + stream_loss_sum.write(info_loss); + stream_loss2_sum.write(info_loss2); + } + } + } + } + } + } +} + +template +void ComputeEntropy3(uint16_t num_tile, + float cost1, + float mul, + hls::stream& stream_rectx, + hls::stream& stream_recty, +#ifdef FIX + hls::stream >& stream_loss, + hls::stream >& stream_loss2, + hls::stream >& stream_entropy, + hls::stream >& stream_nzeros, +#else + hls::stream& stream_loss_sum, + hls::stream& stream_loss2_sum, + hls::stream& stream_entropy_sum, + hls::stream& stream_nzeros_sum, +#endif + hls::stream& stream_mask, + hls::stream& stream_entropy_final) { +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + LOOP_1: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#ifdef FIX + float entropy = 0.0; + float zeros_mul = 7.565053364251793f; + float entropy_v[3]; + ap_int<11> nzeros_v[3]; + float entropy_bits[3] = {0.0, 0.0, 0.0}; + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 +#pragma HLS pipeline + ap_int<42> entropy_tmp = stream_entropy.read(); + entropy_v[c] = entropy_tmp / 65536.0; + 
nzeros_v[c] = stream_nzeros.read(); + entropy_v[c] += nzeros_v[c] * cost1; + uint8_t nbits = LUTCeilLog2Nonzero[(nzeros_v[c] + 1)] + 1; + entropy_bits[c] = zeros_mul * (LUTCeilLog2Nonzero[nbits + 17] + nbits); + } + entropy = entropy_v[0] + entropy_v[1] + entropy_v[2]; + entropy += entropy_bits[0] + entropy_bits[1] + entropy_bits[2]; + ap_int<28> tmp_loss = stream_loss.read(); + float loss_f = tmp_loss / 65536.0; + ap_int<44> tmp_loss2 = stream_loss2.read(); + float loss2_f = tmp_loss2 / 65536.0 / 65536.0; + float info_loss_multiplier = 138.0f; + float info_loss_multiplier2 = 50.46839691767866; + float loss = ((info_loss_multiplier * loss_f) + (info_loss_multiplier2 * N * sqrtf(loss2_f))); + float loss_mask = stream_mask.read() * loss; + float ret = entropy + loss_mask; + if (N == 1) { + ret = 3.0f + 0.745f * ret; + } + ret = ret * mul; + stream_entropy_final.write(ret); +#else + float entropy = 0.0; + float zeros_mul = 7.565053364251793f; + float entropy_v[3] = {0.0, 0.0, 0.0}; + float nzeros_v[3] = {0.0, 0.0, 0.0}; + float entropy_bits[3] = {0.0, 0.0, 0.0}; + for (int c = 0; c < 3; c++) { +#pragma HLS LOOP_TRIPCOUNT min = 3 max = 3 +#pragma HLS pipeline + + entropy_v[c] = stream_entropy_sum.read(); + nzeros_v[c] = stream_nzeros_sum.read(); + entropy_v[c] += nzeros_v[c] * cost1; + // TODO: Integer to integer, can we use look up table to implement this? 
+ // int nbits = CeilLog2NonzeroHLS(nzeros_v[c] + 1) + 1; + // entropy_bits[c] = zeros_mul * (CeilLog2NonzeroHLS(nbits + 17) + nbits); + int nbits = LUTCeilLog2Nonzero[(short)(nzeros_v[c] + 1)] + 1; + entropy_bits[c] = zeros_mul * (LUTCeilLog2Nonzero[nbits + 17] + nbits); + } + entropy = entropy_v[0] + entropy_v[1] + entropy_v[2]; + entropy += entropy_bits[0] + entropy_bits[1] + entropy_bits[2]; + float tmp_loss = stream_loss_sum.read(); + float tmp_loss2 = stream_loss2_sum.read(); + float info_loss_multiplier = 138.0f; + float info_loss_multiplier2 = 50.46839691767866; + float ret = entropy + + stream_mask.read() * ((info_loss_multiplier * tmp_loss) + + (info_loss_multiplier2 * sqrtf((float)(N * N * tmp_loss2)))); + if (N == 1) { + ret = 3.0f + 0.745f * ret; + } + stream_entropy_final.write(ret * mul); +#endif + } + } + } +} + +template +void BufferN(uint16_t num_tile, + float* ping, + float* pang, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_dctin, + hls::stream& stream_con, + hls::stream& stream_ok) { + uint8_t block = N; + uint8_t block_n = N * N; + bool flag = true; + uint16_t size = 4096; + uint8_t w = 64; + uint16_t total_size = 4096 * 3; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_con.write(1); + LOOP_1: + for (uint8_t jy = 0; jy < 8 / N; jy += 1) { + LOOP_2: + for (uint8_t jx = 0; jx < 8 / N; jx += 1) { + LOOP_3: + for (uint8_t c = 0; c < 3; c++) { + LOOP_4: + for (uint8_t iy = 0; iy < 8 * N; iy += 1) { + LOOP_5: + for (uint8_t ix = 0; ix < 8 * N; ix += 1) { + uint8_t y = jy * 8 * N + iy; + uint8_t x = jx * 8 * N + ix; + bool read = false; + if (N == 1 && (jy < rect_ysize) && (jx < rect_xsize)) { + read = true; + } + if (N == 2 && (jy * 2 + 1) < rect_ysize && (jx * 2 + 1) < rect_xsize) { + read = true; + } + if (N == 4 && (jy * 4 + 3) < rect_ysize && (jx * 4 + 3) < 
rect_xsize) { + read = true; + } + if (read) { + float tmp = stream_dctin.read(); +#ifdef __SYNTHESIS__ + if (flag) { + ping[c * size + y * w + x] = tmp; + } else { + pang[c * size + y * w + x] = tmp; + } +#else + if (flag) { + ping[total_size * tid + c * size + y * w + x] = tmp; + } else { + pang[total_size * tid + c * size + y * w + x] = tmp; + } +#endif + } + } + } + } + } + } + flag = !flag; + stream_ok.write(1); + } +} + +void EstimateEntropy8(uint16_t num_tile, + float cost1, + float mul8x8, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_dctin, + hls::stream& stream_entropy_8) { +#pragma HLS inline + + hls::stream stream_rectx80("rectx80"); +#pragma HLS stream variable = stream_rectx80 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx80 type = fifo + hls::stream stream_recty80("recty80"); +#pragma HLS stream variable = stream_recty80 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty80 type = fifo + hls::stream stream_rectx81("rectx81"); +#pragma HLS stream variable = stream_rectx81 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx81 type = fifo + hls::stream stream_recty81("recty81"); +#pragma HLS stream variable = stream_recty81 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty81 type = fifo + hls::stream stream_rectx82("rectx82"); +#pragma HLS stream variable = stream_rectx82 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx82 type = fifo + hls::stream stream_recty82("recty82"); +#pragma HLS stream variable = stream_recty82 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty82 type = fifo + +#ifdef FIX + hls::stream > stream_loss("loss_8"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream > stream_loss2("loss2_8"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = 
stream_loss2 type = fifo + hls::stream > stream_entropy("entropy_8"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream > stream_nzeros("nzeros_8"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#else + hls::stream stream_loss("loss_8"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream stream_loss2("loss2_8"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream stream_entropy("entropy_8"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream stream_nzeros("nzeros_8"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#endif + hls::stream stream_loss_sum("loss_8_sum"); + ; +#pragma HLS stream variable = stream_loss_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss_sum type = fifo + hls::stream stream_loss2_sum("loss2_8_sum"); +#pragma HLS stream variable = stream_loss2_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2_sum type = fifo + hls::stream stream_entropy_sum("entropy_8_sum"); +#pragma HLS stream variable = stream_entropy_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_sum type = fifo + hls::stream stream_nzeros_sum("nzeros_8_sum"); +#pragma HLS stream variable = stream_nzeros_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros_sum type = fifo + hls::stream stream_q("q_8"); +#pragma HLS stream variable = stream_q depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q type = fifo + hls::stream stream_mask("mask_8"); +#pragma HLS stream variable = stream_mask depth = 1024 +#pragma HLS BIND_STORAGE variable = 
stream_mask type = fifo + + // #pragma HLS dataflow + GetQAndMask_8(num_tile, stream_rectx, stream_recty, stream_rectx80, stream_recty80, stream_q_org, stream_mask_org, + stream_q, stream_mask); + +#ifdef FIX + ComputeEntropy1<1>(num_tile, stream_rectx80, stream_recty80, stream_rectx81, stream_recty81, stream_q, stream_dctin, + stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy3<1>(num_tile, cost1, mul8x8, stream_rectx81, stream_recty81, stream_loss, stream_loss2, + stream_entropy, stream_nzeros, stream_mask, stream_entropy_8); + +#else + + ComputeEntropy1<1>(num_tile, stream_rectx80, stream_recty80, stream_rectx81, stream_recty81, stream_q, stream_dctin, + stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy2<1>(num_tile, stream_rectx81, stream_recty81, stream_rectx82, stream_recty82, stream_loss, + stream_loss2, stream_entropy, stream_nzeros, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum); + + ComputeEntropy3<1>(num_tile, cost1, mul8x8, stream_rectx82, stream_recty82, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum, stream_mask, stream_entropy_8); +#endif +} + +void EstimateEntropy16(uint16_t num_tile, + float cost1, + float mul16x16, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_dctin, + hls::stream& stream_entropy_16) { +#pragma HLS inline + + hls::stream stream_rectx160("rectx160"); +#pragma HLS stream variable = stream_rectx160 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx160 type = fifo + hls::stream stream_recty160("recty160"); +#pragma HLS stream variable = stream_recty160 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty160 type = fifo + hls::stream stream_rectx161("rectx161"); +#pragma HLS stream variable = stream_rectx161 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx161 type = fifo + hls::stream 
stream_recty161("recty161"); +#pragma HLS stream variable = stream_recty161 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty161 type = fifo + hls::stream stream_rectx162("rectx162"); +#pragma HLS stream variable = stream_rectx162 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx162 type = fifo + hls::stream stream_recty162("recty162"); +#pragma HLS stream variable = stream_recty162 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty162 type = fifo + +#ifdef FIX + hls::stream > stream_loss("loss_16"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream > stream_loss2("loss2_16"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream > stream_entropy("entropy_16"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream > stream_nzeros("nzeros_16"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#else + hls::stream stream_loss("loss_16"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream stream_loss2("loss2_16"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream stream_entropy("entropy_16"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream stream_nzeros("nzeros_16"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#endif + hls::stream stream_loss_sum("loss_16_sum"); + ; +#pragma HLS stream variable = stream_loss_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss_sum type = fifo + 
hls::stream stream_loss2_sum("loss2_16_sum"); +#pragma HLS stream variable = stream_loss2_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2_sum type = fifo + hls::stream stream_entropy_sum("entropy_16_sum"); +#pragma HLS stream variable = stream_entropy_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_sum type = fifo + hls::stream stream_nzeros_sum("nzeros_16_sum"); +#pragma HLS stream variable = stream_nzeros_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros_sum type = fifo + hls::stream stream_q("q_16"); +#pragma HLS stream variable = stream_q depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q type = fifo + hls::stream stream_mask("mask_16"); +#pragma HLS stream variable = stream_mask depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask type = fifo + + // #pragma HLS dataflow + GetQAndMask_16_32<2>(num_tile, stream_rectx, stream_recty, stream_rectx160, stream_recty160, stream_q_org, + stream_mask_org, stream_q, stream_mask); + +#ifdef FIX + + ComputeEntropy1<2>(num_tile, stream_rectx160, stream_recty160, stream_rectx161, stream_recty161, stream_q, + stream_dctin, stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy3<2>(num_tile, cost1, mul16x16, stream_rectx161, stream_recty161, stream_loss, stream_loss2, + stream_entropy, stream_nzeros, stream_mask, stream_entropy_16); + +#else + + ComputeEntropy1<2>(num_tile, stream_rectx160, stream_recty160, stream_rectx161, stream_recty161, stream_q, + stream_dctin, stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy2<2>(num_tile, stream_rectx161, stream_recty161, stream_rectx162, stream_recty162, stream_loss, + stream_loss2, stream_entropy, stream_nzeros, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum); + + ComputeEntropy3<2>(num_tile, cost1, mul16x16, stream_rectx162, stream_recty162, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum, stream_mask, 
stream_entropy_16); +#endif +} + +void EstimateEntropy32(uint16_t num_tile, + float cost1, + float mul32x32, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_dctin, + hls::stream& stream_entropy_32) { +#pragma HLS inline + + hls::stream stream_rectx320("rectx320"); +#pragma HLS stream variable = stream_rectx320 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx320 type = fifo + hls::stream stream_recty320("recty320"); +#pragma HLS stream variable = stream_recty320 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty320 type = fifo + hls::stream stream_rectx321("rectx321"); +#pragma HLS stream variable = stream_rectx321 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx321 type = fifo + hls::stream stream_recty321("recty321"); +#pragma HLS stream variable = stream_recty321 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty321 type = fifo + hls::stream stream_rectx322("rectx322"); +#pragma HLS stream variable = stream_rectx322 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx322 type = fifo + hls::stream stream_recty322("recty322"); +#pragma HLS stream variable = stream_recty322 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty322 type = fifo + +#ifdef FIX + hls::stream > stream_loss("loss_32"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream > stream_loss2("loss2_32"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream > stream_entropy("entropy_32"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream > stream_nzeros("nzeros_32"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#else + 
hls::stream stream_loss("loss_32"); +#pragma HLS stream variable = stream_loss depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss type = fifo + hls::stream stream_loss2("loss2_32"); +#pragma HLS stream variable = stream_loss2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2 type = fifo + hls::stream stream_entropy("entropy_32"); +#pragma HLS stream variable = stream_entropy depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy type = fifo + hls::stream stream_nzeros("nzeros_32"); +#pragma HLS stream variable = stream_nzeros depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros type = fifo +#endif + hls::stream stream_loss_sum("loss_32_sum"); + ; +#pragma HLS stream variable = stream_loss_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss_sum type = fifo + hls::stream stream_loss2_sum("loss2_32_sum"); +#pragma HLS stream variable = stream_loss2_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_loss2_sum type = fifo + hls::stream stream_entropy_sum("entropy_32_sum"); +#pragma HLS stream variable = stream_entropy_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_sum type = fifo + hls::stream stream_nzeros_sum("nzeros_32_sum"); +#pragma HLS stream variable = stream_nzeros_sum depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_nzeros_sum type = fifo + hls::stream stream_q("q_32"); +#pragma HLS stream variable = stream_q depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q type = fifo + hls::stream stream_mask("mask_32"); +#pragma HLS stream variable = stream_mask depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask type = fifo + + // #pragma HLS dataflow + GetQAndMask_16_32<4>(num_tile, stream_rectx, stream_recty, stream_rectx320, stream_recty320, stream_q_org, + stream_mask_org, stream_q, stream_mask); + +#ifdef FIX + ComputeEntropy1<4>(num_tile, stream_rectx320, stream_recty320, stream_rectx321, stream_recty321, stream_q, + stream_dctin, stream_loss, 
stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy3<4>(num_tile, cost1, mul32x32, stream_rectx321, stream_recty321, stream_loss, stream_loss2, + stream_entropy, stream_nzeros, stream_mask, stream_entropy_32); + +#else + + ComputeEntropy1<4>(num_tile, stream_rectx320, stream_recty320, stream_rectx321, stream_recty321, stream_q, + stream_dctin, stream_loss, stream_loss2, stream_entropy, stream_nzeros); + + ComputeEntropy2<4>(num_tile, stream_rectx321, stream_recty321, stream_rectx322, stream_recty322, stream_loss, + stream_loss2, stream_entropy, stream_nzeros, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum); + + ComputeEntropy3<4>(num_tile, cost1, mul32x32, stream_rectx322, stream_recty322, stream_loss_sum, stream_loss2_sum, + stream_entropy_sum, stream_nzeros_sum, stream_mask, stream_entropy_32); +#endif +} + +void CompareEntropy(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_entropy_8, + hls::stream& stream_entropy_16, + hls::stream& stream_entropy_32, + uint8_t* strategy_ping, + uint8_t* strategy_pang, + hls::stream& stream_con, + hls::stream& stream_ok) { + bool flag = true; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + float entropy_32; + float entropy_16; + float entropy_8; + float entropy_sum[16] = {0}; + stream_con.write(1); + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS pipeline + uint8_t idx = iy * 8 + ix; + uint8_t idx_8_sum = (iy / 2) * 4 + ix / 2; + entropy_sum[idx_8_sum] += stream_entropy_8.read(); +#ifdef 
__SYNTHESIS__ + if (flag) { + strategy_ping[idx] = 0; + } else { + strategy_pang[idx] = 0; + } +#else + int idx1 = 64 * tid + idx; + if (flag) { + strategy_ping[idx1] = 0; + } else { + strategy_pang[idx1] = 0; + } +#endif + if (iy % 2 == 1 && ix % 2 == 1) { + entropy_16 = stream_entropy_16.read(); + entropy_8 = entropy_sum[(iy / 2) * 4 + ix / 2]; + if (entropy_16 < entropy_8) { + LOOP_3: + for (uint8_t y = iy - 1; y < iy + 1; y++) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + LOOP_4: + for (uint8_t x = ix - 1; x < ix + 1; x++) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + uint8_t idx = y * 8 + x; +#ifdef __SYNTHESIS__ + if (flag) { + strategy_ping[idx] = 4; + } else { + strategy_pang[idx] = 4; + } +#else + int idx1 = 64 * tid + idx; + if (flag) { + strategy_ping[idx1] = 4; + } else { + strategy_pang[idx1] = 4; + } +#endif + entropy_sum[(y / 2) * 4 + x / 2] = entropy_16; + } + } + } + if (iy % 4 == 3 && ix % 4 == 3) { + entropy_32 = stream_entropy_32.read(); + entropy_16 = 0; + LOOP_5: + for (uint8_t y = iy - 3; y < iy + 1; y += 2) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + LOOP_6: + for (uint8_t x = ix - 3; x < ix + 1; x += 2) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + uint8_t idx_16 = y * 8 + x; + entropy_16 += entropy_sum[(y / 2) * 4 + x / 2]; + } + } + if (entropy_32 < entropy_16) { + LOOP_7: + for (uint8_t y = iy - 3; y < iy + 1; y++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS unroll + LOOP_8: + for (uint8_t x = ix - 3; x < ix + 1; x++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS unroll + uint8_t idx = y * 8 + x; +#ifdef __SYNTHESIS__ + if (flag) { + strategy_ping[idx] = 5; + } else { + strategy_pang[idx] = 5; + } +#else + int idx1 = 64 * tid + idx; + if (flag) { + strategy_ping[idx1] = 5; + } else { + strategy_pang[idx1] = 5; + } +#endif + } + } + } + } + } + } + } + flag = !flag; + stream_ok.write(1); + } +} + +void Reorder(uint16_t 
num_tile, + float* ping8, + float* pang8, + float* ping16, + float* pang16, + float* ping32, + float* pang32, + uint8_t* strategy_ping, + uint8_t* strategy_pang, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out, + hls::stream& stream_recty_out, + hls::stream& stream_con, + hls::stream& stream_ok, + hls::stream& stream_con8, + hls::stream& stream_ok8, + hls::stream& stream_con16, + hls::stream& stream_ok16, + hls::stream& stream_con32, + hls::stream& stream_ok32, + hls::stream& stream_strategy, + hls::stream& stream_strategy1, + hls::stream& stream_select) { + bool flag = true; + uint16_t size = 4096; + uint8_t w = 64; + uint16_t total_size = 4096 * 3; + ap_uint<64> visited; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + float entropy_32; + float entropy_16; + float entropy_8; + float entropy_sum[16] = {0}; + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out.write(rect_ysize); + stream_rectx_out.write(rect_xsize); + stream_ok.read(); + stream_ok8.read(); + stream_ok16.read(); + stream_ok32.read(); + visited = 0; + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + uint8_t strategy; + uint8_t idx = iy * 8 + ix; + if (visited.range(idx, idx) == 0) { +#ifdef __SYNTHESIS__ + if (flag) { + strategy = strategy_ping[idx]; + } else { + strategy = strategy_pang[idx]; + } +#else + if (flag) { + strategy = strategy_ping[tid * 64 + idx]; + } else { + strategy = strategy_pang[tid * 64 + idx]; + } +#endif + stream_strategy.write(strategy); + stream_strategy1.write(strategy); + if (strategy == 4) { + LOOP_3: + for (uint8_t y = 0; y < 2; y++) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + LOOP_4: + for (uint8_t x = 0; x < 2; x++) { +#pragma HLS 
LOOP_TRIPCOUNT min = 2 max = 2 +#pragma HLS unroll + uint8_t idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + } + } + LOOP_5: + for (uint8_t y = 0; y < 16; y++) { + LOOP_6: + for (uint8_t x = 0; x < 16; x++) { + LOOP_7: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS pipeline + uint16_t sy = iy * 8 + y; + uint16_t sx = ix * 8 + x; +#ifdef __SYNTHESIS__ + uint16_t idx = c * size + sy * w + sx; + if (flag) { + float tmp = ping16[idx]; + stream_select.write(tmp); + } else { + float tmp = pang16[idx]; + stream_select.write(tmp); + } +#else + int idx = total_size * tid + c * size + sy * w + sx; + if (flag) { + float tmp = ping16[idx]; + stream_select.write(tmp); + } else { + float tmp = pang16[idx]; + stream_select.write(tmp); + } +#endif + } + } + } + } else if (strategy == 5) { + LOOP_8: + for (uint8_t y = 0; y < 4; y++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS unroll + LOOP_9: + for (uint8_t x = 0; x < 4; x++) { +#pragma HLS LOOP_TRIPCOUNT min = 4 max = 4 +#pragma HLS unroll + uint8_t idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + } + } + LOOP_10: + for (uint8_t y = 0; y < 32; y++) { + LOOP_11: + for (uint8_t x = 0; x < 32; x++) { + LOOP_12: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS pipeline + uint16_t sy = iy * 8 + y; + uint16_t sx = ix * 8 + x; +#ifdef __SYNTHESIS__ + uint16_t idx = c * size + sy * w + sx; + if (flag) { + float tmp = ping32[idx]; + stream_select.write(tmp); + } else { + float tmp = pang32[idx]; + stream_select.write(tmp); + } +#else + int idx = total_size * tid + c * size + sy * w + sx; + if (flag) { + float tmp = ping32[idx]; + stream_select.write(tmp); + } else { + float tmp = pang32[idx]; + stream_select.write(tmp); + } +#endif + } + } + } + } else { + visited.range(idx, idx) = 1; + LOOP_13: + for (uint8_t y = 0; y < 8; y++) { + LOOP_14: + for (uint8_t x = 0; x < 8; x++) { + LOOP_15: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS pipeline + uint16_t sy = iy * 8 + y; + uint16_t sx = ix * 
8 + x; +#ifdef __SYNTHESIS__ + uint16_t idx = c * size + sy * w + sx; + if (flag) { + float tmp = ping8[idx]; + stream_select.write(tmp); + } else { + float tmp = pang8[idx]; + stream_select.write(tmp); + } +#else + int idx = total_size * tid + c * size + sy * w + sx; + float tmp; + if (flag) { + tmp = ping8[idx]; + stream_select.write(tmp); + } else { + tmp = pang8[idx]; + stream_select.write(tmp); + } +#endif + } + } + } + } + } + } + } + flag = !flag; + stream_con.read(); + stream_con8.read(); + stream_con16.read(); + stream_con32.read(); + } +} + +void ConsumeStrategyDCT(int xsize, + int ysize, + hls::stream& stream_strategy, + hls::stream& stream_select, + float* dctx_8x8, + float* dcty_8x8, + float* dctb_8x8, + float* dctx_16x16, + float* dcty_16x16, + float* dctb_16x16, + float* dctx_32x32, + float* dcty_32x32, + float* dctb_32x32) { + int xsize_blocks = xsize / 8; + int ysize_blocks = ysize / 8; + int n_enc_tiles = Div_Ceil(xsize_blocks, 8); + int count = 0; + int count_s = 0; + ap_uint<64> visited; +LOOP_0: + for (int tid = 0; tid < Div_Ceil(xsize_blocks, 8) * Div_Ceil(ysize_blocks, 8); tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + int tx1 = tid % n_enc_tiles; + int ty1 = tid / n_enc_tiles; + int by = ty1 * 8; + int by1 = fmin((int)((ty1 + 1) * 8), ysize_blocks); + int bx = tx1 * 8; + int bx1 = fmin((int)((tx1 + 1) * 8), xsize_blocks); + int rect_ysize = by1 - by; + int rect_xsize = bx1 - bx; + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + visited = 0; + for (int iy = 0; iy < rect_ysize; iy++) { + for (int ix = 0; ix < rect_xsize; ix++) { + char strategy; + int idx = iy * 8 + ix; + if (visited.range(idx, idx) == 0) { + strategy = stream_strategy.read(); + if (strategy == 0) { + visited.range(idx, idx) = 1; + for (int y = 0; y < 8; y++) { + for (int x = 0; x < 8; x++) { + for (int c = 0; c < 3; c++) { + float tmp = stream_select.read(); + int idx = ((ty1 * 64 + iy * 8 + y) * xsize) + (tx1 * 64 + ix * 8 + 
x); + if (c == 0) { + dcty_8x8[idx] = tmp; + } else if (c == 1) { + dctx_8x8[idx] = tmp; + } else if (c == 2) { + dctb_8x8[idx] = tmp; + } + } + } + } + } else if (strategy == 4) { + for (int y = 0; y < 2; y++) { + for (int x = 0; x < 2; x++) { + int idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + } + } + for (int y = 0; y < 16; y++) { + for (int x = 0; x < 16; x++) { + for (int c = 0; c < 3; c++) { + float tmp = stream_select.read(); + int idx = ((ty1 * 64 + iy * 8 + y) * xsize) + (tx1 * 64 + ix * 8 + x); + if (c == 0) { + dcty_16x16[idx] = tmp; + } else if (c == 1) { + dctx_16x16[idx] = tmp; + } else if (c == 2) { + dctb_16x16[idx] = tmp; + } + } + } + } + } else if (strategy == 5) { + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + int idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + } + } + for (int y = 0; y < 32; y++) { + for (int x = 0; x < 32; x++) { + for (int c = 0; c < 3; c++) { + float tmp = stream_select.read(); + int idx = ((ty1 * 64 + iy * 8 + y) * xsize) + (tx1 * 64 + ix * 8 + x); + if (c == 0) { + dcty_32x32[idx] = tmp; + } else if (c == 1) { + dctx_32x32[idx] = tmp; + } else if (c == 2) { + dctb_32x32[idx] = tmp; + } + } + } + } + } + } + } + } + } + // std::cout << "use count_s=" << count_s << ", count=" << count << std::endl; +} + +void SetQuantField(uint16_t num_tile, + float inv_global_scale, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rqf_org, + hls::stream& stream_strategy1, + hls::stream& stream_rqf) { + ap_uint<64> visited; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + visited = 0; + float rqf_array[64]; + LOOP_1: + for (uint8_t iy = 0; iy < rect_ysize; iy++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_2: + for (uint8_t ix = 0; ix < rect_xsize; ix++) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma 
HLS pipeline + uint16_t index = iy * 8 + ix; + rqf_array[index] = stream_rqf_org.read(); + } + } + LOOP_3: + for (uint8_t y = 0; y < rect_ysize; ++y) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 + LOOP_4: + for (uint8_t x = 0; x < rect_xsize; ++x) { +#pragma HLS LOOP_TRIPCOUNT min = 8 max = 8 +#pragma HLS pipeline + float max = -3.40282e+038; + uint8_t idx = y * 8 + x; + if (visited.range(idx, idx) == 0) { + uint8_t strategy = stream_strategy1.read(); + uint8_t b = strategy_block[strategy]; + LOOP_5: + for (uint8_t iy = 0; iy < b; iy++) { + LOOP_6: + for (uint8_t ix = 0; ix < b; ix++) { +#pragma HLS pipeline + uint16_t idx = (iy + y) * 8 + (ix + x); + visited.range(idx, idx) = 1; + float tmp = rqf_array[idx]; + max = fmax(tmp, max); + } + } + float tmp = max; + tmp = tmp * inv_global_scale + 0.5f; + int16_t tmp_i = (int16_t)tmp; + tmp_i = tmp_i > 256 ? 256 : tmp_i; + int16_t val = tmp_i > 1 ? tmp_i : 1; + stream_rqf.write(val); + } + } + } + } +} + +template +void DupDCT(uint16_t num_tile, + hls::stream& stream_rectx, + hls::stream& stream_recty, + hls::stream& stream_rectx_out0, + hls::stream& stream_recty_out0, + hls::stream& stream_rectx_out1, + hls::stream& stream_recty_out1, + hls::stream& stream_dctin, + hls::stream& stream_dctout0, + hls::stream& stream_dctout1) { + uint8_t block_n = N * N; + const uint16_t size = 64 * block_n; +LOOP_0: + for (uint16_t tid = 0; tid < num_tile; tid++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint8_t rect_ysize = stream_recty.read(); + uint8_t rect_xsize = stream_rectx.read(); + stream_recty_out0.write(rect_ysize); + stream_rectx_out0.write(rect_xsize); + stream_recty_out1.write(rect_ysize); + stream_rectx_out1.write(rect_xsize); + LOOP_1: + for (uint8_t iy = 0; iy + N - 1 < rect_ysize; iy += N) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 + LOOP_2: + for (uint8_t ix = 0; ix + N - 1 < rect_xsize; ix += N) { +#pragma HLS LOOP_TRIPCOUNT min = 2 max = 2 + LOOP_3: + for (uint8_t c = 0; c < 3; c++) { +#pragma HLS 
LOOP_TRIPCOUNT min = 3 max = 3 + LOOP_4: + for (uint16_t i = 0; i < size; i += 1) { +#pragma HLS LOOP_TRIPCOUNT min = 1024 max = 1024 +#pragma HLS pipeline II = 1 + float tmp = stream_dctin.read(); + stream_dctout0.write(tmp); + stream_dctout1.write(tmp); + } + } + } + } + } +} + +void ComputeTileACSHLS(uint16_t num_tile, + short ysize, + short xsize, + float cost1, + float butteraugli_target, + float inv_global_scale, + float mul8x8, + float mul16x16, + float mul32x32, + hls::stream& stream_rectx_acs, + hls::stream& stream_recty_acs, + hls::stream& stream_rqf_org, + hls::stream& stream_q_org, + hls::stream& stream_mask_org, + hls::stream& stream_dctin8, + hls::stream& stream_dctin16, + hls::stream& stream_dctin32, + hls::stream& stream_strategy, + hls::stream& stream_select, + hls::stream& stream_rqf) { +#pragma HLS INLINE + hls::stream stream_rectx0("rectx0"); +#pragma HLS stream variable = stream_rectx0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx0 type = fifo + hls::stream stream_recty0("recty0"); +#pragma HLS stream variable = stream_recty0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty0 type = fifo + + hls::stream stream_rectx1("rectx1"); +#pragma HLS stream variable = stream_rectx1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx1 type = fifo + hls::stream stream_recty1("recty1"); +#pragma HLS stream variable = stream_recty1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty1 type = fifo + + hls::stream stream_rectx2("rectx2"); +#pragma HLS stream variable = stream_rectx2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx2 type = fifo + hls::stream stream_recty2("recty2"); +#pragma HLS stream variable = stream_recty2 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty2 type = fifo + + hls::stream stream_rectx3("rectx3"); +#pragma HLS stream variable = stream_rectx3 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx3 type = fifo + hls::stream 
stream_recty3("recty3"); +#pragma HLS stream variable = stream_recty3 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty3 type = fifo + + hls::stream stream_rectx4("rectx4"); +#pragma HLS stream variable = stream_rectx4 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx4 type = fifo + hls::stream stream_recty4("recty4"); +#pragma HLS stream variable = stream_recty4 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty4 type = fifo + + hls::stream stream_rectx5("rectx5"); +#pragma HLS stream variable = stream_rectx5 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx5 type = fifo + hls::stream stream_recty5("recty5"); +#pragma HLS stream variable = stream_recty5 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty5 type = fifo + + hls::stream stream_rectx6("rectx6"); +#pragma HLS stream variable = stream_rectx6 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx6 type = fifo + hls::stream stream_recty6("recty6"); +#pragma HLS stream variable = stream_recty6 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty6 type = fifo + + hls::stream stream_rectx7("rectx7"); +#pragma HLS stream variable = stream_rectx7 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx7 type = fifo + hls::stream stream_recty7("recty7"); +#pragma HLS stream variable = stream_recty7 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty7 type = fifo + + hls::stream stream_rectx8("rectx8"); +#pragma HLS stream variable = stream_rectx8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx8 type = fifo + hls::stream stream_recty8("recty8"); +#pragma HLS stream variable = stream_recty8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty8 type = fifo + + hls::stream stream_rectx9("rectx9"); +#pragma HLS stream variable = stream_rectx9 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx9 type = fifo + hls::stream stream_recty9("recty9"); +#pragma HLS stream variable = 
stream_recty9 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty9 type = fifo + + hls::stream stream_rectx10("rectx10"); +#pragma HLS stream variable = stream_rectx10 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx10 type = fifo + hls::stream stream_recty10("recty10"); +#pragma HLS stream variable = stream_recty10 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty10 type = fifo + + hls::stream stream_rectx11("rectx11"); +#pragma HLS stream variable = stream_rectx11 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx11 type = fifo + hls::stream stream_recty11("recty11"); +#pragma HLS stream variable = stream_recty11 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty11 type = fifo + + hls::stream stream_rectx12("rectx12"); +#pragma HLS stream variable = stream_rectx12 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_rectx12 type = fifo + hls::stream stream_recty12("recty12"); +#pragma HLS stream variable = stream_recty12 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_recty12 type = fifo + + hls::stream stream_dctin8_0("dctin8_0"); +#pragma HLS stream variable = stream_dctin8_0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin8_0 type = fifo + hls::stream stream_dctin16_0("dctin16_0"); +#pragma HLS stream variable = stream_dctin16_0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin16_0 type = fifo + hls::stream stream_dctin32_0("dctin32_0"); +#pragma HLS stream variable = stream_dctin32_0 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin32_0 type = fifo + hls::stream stream_dctin8_1("dctin8_1"); +#pragma HLS stream variable = stream_dctin8_1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin8_1 type = fifo + hls::stream stream_dctin16_1("dctin16_1"); +#pragma HLS stream variable = stream_dctin16_1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin16_1 type = fifo + hls::stream stream_dctin32_1("dctin32_1"); +#pragma HLS stream 
variable = stream_dctin32_1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_dctin32_1 type = fifo + + hls::stream stream_entropy_8("entropy_8"); +#pragma HLS stream variable = stream_entropy_8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_8 type = fifo + hls::stream stream_entropy_16("entropy_16"); +#pragma HLS stream variable = stream_entropy_16 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_16 type = fifo + hls::stream stream_entropy_32("entropy_32"); +#pragma HLS stream variable = stream_entropy_32 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_entropy_32 type = fifo + + hls::stream stream_con("con"); +#pragma HLS stream variable = stream_con depth = 2 +#pragma HLS BIND_STORAGE variable = stream_con type = fifo + hls::stream stream_ok("ok"); +#pragma HLS stream variable = stream_ok depth = 2 +#pragma HLS BIND_STORAGE variable = stream_ok type = fifo + hls::stream stream_con8("con8"); +#pragma HLS stream variable = stream_con8 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_con8 type = fifo + hls::stream stream_ok8("ok"); +#pragma HLS stream variable = stream_ok8 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_ok8 type = fifo + hls::stream stream_con16("con16"); +#pragma HLS stream variable = stream_con16 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_con16 type = fifo + hls::stream stream_ok16("ok16"); +#pragma HLS stream variable = stream_ok16 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_ok16 type = fifo + hls::stream stream_con32("con32"); +#pragma HLS stream variable = stream_con32 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_con32 type = fifo + hls::stream stream_ok32("ok32"); +#pragma HLS stream variable = stream_ok32 depth = 2 +#pragma HLS BIND_STORAGE variable = stream_ok32 type = fifo + + hls::stream stream_strategy1("strategy1"); +#pragma HLS stream variable = stream_strategy1 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_strategy1 type = fifo + + 
hls::stream stream_q_org_8("q_org_8"); +#pragma HLS stream variable = stream_q_org_8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q_org_8 type = fifo + hls::stream stream_mask_org_8("mask_org_8"); +#pragma HLS stream variable = stream_mask_org_8 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask_org_8 type = fifo + hls::stream stream_q_org_16("q_org_8"); +#pragma HLS stream variable = stream_q_org_16 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q_org_16 type = fifo + hls::stream stream_mask_org_16("mask_org_8"); +#pragma HLS stream variable = stream_mask_org_16 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask_org_16 type = fifo + hls::stream stream_q_org_32("q_org_8"); +#pragma HLS stream variable = stream_q_org_32 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_q_org_32 type = fifo + hls::stream stream_mask_org_32("mask_org_8"); +#pragma HLS stream variable = stream_mask_org_32 depth = 1024 +#pragma HLS BIND_STORAGE variable = stream_mask_org_32 type = fifo + +// #pragma HLS dataflow + +#ifdef __SYNTHESIS__ + uint8_t strategy_ping[64]; + uint8_t strategy_pang[64]; +#pragma HLS bind_storage variable = strategy_ping type = RAM_T2P impl = bram +#pragma HLS shared variable = strategy_ping +#pragma HLS stable variable = strategy_ping +#pragma HLS bind_storage variable = strategy_pang type = RAM_T2P impl = bram +#pragma HLS shared variable = strategy_pang +#pragma HLS stable variable = strategy_pang +#else + uint8_t* strategy_ping = (uint8_t*)malloc(sizeof(uint8_t) * 64 * 32 * 32); + uint8_t* strategy_pang = (uint8_t*)malloc(sizeof(uint8_t) * 64 * 32 * 32); +#endif + +#ifdef __SYNTHESIS__ + float ping8[3 * 64 * 64]; + float pang8[3 * 64 * 64]; + float ping16[3 * 64 * 64]; + float pang16[3 * 64 * 64]; + float ping32[3 * 64 * 64]; + float pang32[3 * 64 * 64]; +#pragma HLS bind_storage variable = ping8 type = RAM_T2P impl = uram +#pragma HLS shared variable = ping8 +#pragma HLS stable variable = ping8 +#pragma HLS 
bind_storage variable = pang8 type = RAM_T2P impl = uram +#pragma HLS shared variable = pang8 +#pragma HLS stable variable = pang8 +#pragma HLS bind_storage variable = ping16 type = RAM_T2P impl = uram +#pragma HLS shared variable = ping16 +#pragma HLS stable variable = ping16 +#pragma HLS bind_storage variable = pang16 type = RAM_T2P impl = uram +#pragma HLS shared variable = pang16 +#pragma HLS stable variable = pang16 +#pragma HLS bind_storage variable = ping32 type = RAM_T2P impl = uram +#pragma HLS shared variable = ping32 +#pragma HLS stable variable = ping32 +#pragma HLS bind_storage variable = pang32 type = RAM_T2P impl = uram +#pragma HLS shared variable = pang32 +#pragma HLS stable variable = pang32 +#else + float* ping8 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* pang8 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* ping16 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* pang16 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* ping32 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); + float* pang32 = (float*)malloc(sizeof(float) * 2048 * 2048 * 3); +#endif + + // #pragma HLS dataflow + GetACSSize(xsize, ysize, stream_rectx_acs, stream_recty_acs, stream_rectx0, stream_recty0, stream_rectx1, + stream_recty1, stream_rectx2, stream_recty2, stream_rectx3, stream_recty3, stream_rectx10, + stream_recty10); + + DupQuantAndMask(num_tile, stream_rectx0, stream_recty0, stream_q_org, stream_mask_org, stream_q_org_8, + stream_mask_org_8, stream_q_org_16, stream_mask_org_16, stream_q_org_32, stream_mask_org_32); + + DupDCT<1>(num_tile, stream_rectx1, stream_recty1, stream_rectx4, stream_recty4, stream_rectx7, stream_recty7, + stream_dctin8, stream_dctin8_0, stream_dctin8_1); + + DupDCT<2>(num_tile, stream_rectx2, stream_recty2, stream_rectx5, stream_recty5, stream_rectx8, stream_recty8, + stream_dctin16, stream_dctin16_0, stream_dctin16_1); + + DupDCT<4>(num_tile, stream_rectx3, stream_recty3, stream_rectx6, 
stream_recty6, stream_rectx9, stream_recty9, + stream_dctin32, stream_dctin32_0, stream_dctin32_1); + + EstimateEntropy8(num_tile, cost1, mul8x8, stream_rectx4, stream_recty4, stream_q_org_8, stream_mask_org_8, + stream_dctin8_0, stream_entropy_8); + + EstimateEntropy16(num_tile, cost1, mul16x16, stream_rectx5, stream_recty5, stream_q_org_16, stream_mask_org_16, + stream_dctin16_0, stream_entropy_16); + + EstimateEntropy32(num_tile, cost1, mul32x32, stream_rectx6, stream_recty6, stream_q_org_32, stream_mask_org_32, + stream_dctin32_0, stream_entropy_32); + + CompareEntropy(num_tile, stream_rectx10, stream_recty10, stream_rectx11, stream_recty11, stream_entropy_8, + stream_entropy_16, stream_entropy_32, strategy_ping, strategy_pang, stream_con, stream_ok); + + BufferN<1>(num_tile, ping8, pang8, stream_rectx7, stream_recty7, stream_dctin8_1, stream_con8, stream_ok8); + + BufferN<2>(num_tile, ping16, pang16, stream_rectx8, stream_recty8, stream_dctin16_1, stream_con16, stream_ok16); + + BufferN<4>(num_tile, ping32, pang32, stream_rectx9, stream_recty9, stream_dctin32_1, stream_con32, stream_ok32); + + Reorder(num_tile, ping8, pang8, ping16, pang16, ping32, pang32, strategy_ping, strategy_pang, stream_rectx11, + stream_recty11, stream_rectx12, stream_recty12, stream_con, stream_ok, stream_con8, stream_ok8, + stream_con16, stream_ok16, stream_con32, stream_ok32, stream_strategy, stream_strategy1, stream_select); + + SetQuantField(num_tile, inv_global_scale, stream_rectx12, stream_recty12, stream_rqf_org, stream_strategy1, + stream_rqf); +} + +void GetSourceSize(short xsize, + short ysize, + hls::stream& stream_rectx_dct, + hls::stream& stream_recty_dct, + hls::stream& stream_rectx_acs, + hls::stream& stream_recty_acs, + hls::stream& stream_rectx_dc, + hls::stream& stream_recty_dc) { + uint16_t xsize_blocks = xsize / 8; + uint16_t ysize_blocks = ysize / 8; +LOOP_0: + for (uint16_t y = 0; y < Div_Ceil(ysize_blocks, 8); y++) { + LOOP_1: + for (uint16_t x = 0; x < 
Div_Ceil(xsize_blocks, 8); x++) { +#pragma HLS LOOP_TRIPCOUNT min = 64 max = 64 + uint16_t by = y * 8; + uint16_t by1 = ((y + 1) * 8) < ysize_blocks ? ((y + 1) * 8) : ysize_blocks; + uint16_t bx = x * 8; + uint16_t bx1 = ((x + 1) * 8) < xsize_blocks ? ((x + 1) * 8) : xsize_blocks; + uint8_t rect_ysize = by1 - by; + uint8_t rect_xsize = bx1 - bx; + stream_rectx_dct.write(rect_xsize); + stream_recty_dct.write(rect_ysize); + stream_rectx_acs.write(rect_xsize); + stream_recty_acs.write(rect_ysize); + stream_rectx_dc.write(rect_xsize); + stream_recty_dc.write(rect_ysize); + } + } +} + +//=========================================================// +// data flow region +//=========================================================// +void hls_lossy_enc_compute_dataflow( + // config + uint32_t ysize, + uint32_t xsize, + int masking_field_stride, + int quant_field_stride, + float butteraugli_target, + float cost1, + float inv_global_scale, + float* hls_opsin_1, // mm1, input + float* hls_opsin_2, // mm2, input + float* hls_opsin_3, // mm3, input + float* quant_field_row, // mm4, input + float* masking_field_row, // mm5, input + float* aq_map_f, // mm6, input + int8_t* cmap_axi, // mm7, output + int* ac_coef_axiout, // mm8, output + unsigned char* strategy_all, // mm9, output + int* raw_quant_field_i, // mm10, output + float* hls_dc8x8, // mm11, output + float* hls_dc16x16, // mm12, output + float* hls_dc32x32, // mm13, output + int32_t num_zeros[3][320], + hls::stream, 2>& used_orders_strm) { +#pragma HLS DATAFLOW + + int tile_xsize = (xsize + 63) / 64 * 64; + int tile_ysize = (ysize + 63) / 64 * 64; + int xnum_tile = (xsize + 63) / 64; + int ynum_tile = (ysize + 63) / 64; + int num_tile = xnum_tile * ynum_tile; + hls::stream stream_rectx_dct; + hls::stream stream_recty_dct; + hls::stream stream_rectx_acs; + hls::stream stream_recty_acs; + hls::stream stream_rectx_dc; + hls::stream stream_recty_dc; + GetSourceSize(xsize, ysize, stream_rectx_dct, stream_recty_dct, 
stream_rectx_acs, stream_recty_acs, stream_rectx_dc, + stream_recty_dc); + + // load data + hls::stream stream_q_org("q_org"); + hls::stream stream_mask_org("mask_org"); + hls::stream stream_rqf_org("rqf_org"); + load_rqf_mask(xsize, ysize, aq_map_f, masking_field_row, quant_field_row, quant_field_stride, stream_q_org, + stream_mask_org, stream_rqf_org); + + // load pixel + hls::stream opsin8x8_stream; + hls::stream opsin16x16_stream; + hls::stream opsin32x32_stream; + loadPixel(ysize, xsize, hls_opsin_1, hls_opsin_2, hls_opsin_3, opsin8x8_stream, opsin16x16_stream, + opsin32x32_stream); + + // 1. dct8x8, dct16x16, dct32x32 + hls::stream ac_coef8x8_stream("ac_coef8"); + hls::stream ac_coef16x16_stream("ac_coef16"); + hls::stream ac_coef32x32_stream("ac_coef32"); + + hls::stream dc_coef8x8_stream("dc_coef8"); + hls::stream dc_coef16x16_stream("dc_coef16"); + hls::stream dc_coef32x32_stream("dc_coef32"); + hls_dct_top(ysize, xsize, stream_rectx_dct, stream_recty_dct, opsin8x8_stream, opsin16x16_stream, opsin32x32_stream, + ac_coef8x8_stream, ac_coef16x16_stream, ac_coef32x32_stream, dc_coef8x8_stream, dc_coef16x16_stream, + dc_coef32x32_stream); + + // 2. 
ac strategy + float k8x8mul1 = -0.55; + float k8x8mul2 = 1.0735757687292623f; + float k8x8base = 1.4; + float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); + float k16X16mul1 = -0.35; + float k16X16mul2 = 0.82098067020252011; + float k16X16base = 2.0; + float entropy_mul16X16 = k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base); + float entropy_mul32X32 = 0.9188333021616017f; + hls::stream acs_stream; + hls::stream dct_select_stream; + hls::stream acs_out_stream("acs_out_stream"); + hls::stream rqf_out_stream("rqf_out_stream"); + hls::stream rqf_out_stream2("rqf_out_stream2"); + ComputeTileACSHLS((uint16_t)num_tile, (short)ysize, (short)xsize, cost1, butteraugli_target, inv_global_scale, + mul8x8, entropy_mul16X16, entropy_mul32X32, stream_rectx_acs, stream_recty_acs, stream_rqf_org, + stream_q_org, stream_mask_org, ac_coef8x8_stream, ac_coef16x16_stream, ac_coef32x32_stream, + acs_stream, dct_select_stream, rqf_out_stream); + + // 3. cfl heuristic + hls::stream cmapx_stream("cmapx_stream"); + hls::stream cmapb_stream("cmapb_stream"); + hls::stream cmapx_axi_stream("cmpax_axi_stream"); + hls::stream cmapb_axi_stream("cmapb_axi_stream"); + + hls::stream rqf_cfl_stream("rqf_cfl_stream"); +#pragma HLS stream variable = rqf_cfl_stream depth = 4096 * 6 +#pragma HLS BIND_STORAGE variable = rqf_cfl_stream type = fifo impl = uram + hls::stream acs_cfl_stream("acs_cfl_stream"); +#pragma HLS stream variable = acs_cfl_stream depth = 4096 * 6 +#pragma HLS BIND_STORAGE variable = acs_cfl_stream type = fifo impl = uram + hls::stream ac_coef_cfl_stream("ac_coef_cfl_stream"); +#pragma HLS stream variable = ac_coef_cfl_stream depth = 4096 * 6 +#pragma HLS BIND_STORAGE variable = ac_coef_cfl_stream type = fifo impl = uram + + hls_CFLComputeTile(xsize, ysize, dct_select_stream, rqf_out_stream, acs_stream, cmapx_stream, cmapb_stream, + cmapx_axi_stream, cmapb_axi_stream, ac_coef_cfl_stream, rqf_cfl_stream, acs_cfl_stream); + + // 4. 
ComputeCoefficients + hls::stream acs_coeff_stream1("acs_coeff_stream1"); + hls::stream ac_quant_coeff_stream("ac_quant_coeff_stream"); + hls::stream coeff_axi_stream("coeff_axi_stream"); + hls::stream acs_axi_stream("acs_axi_stream"); + hls::stream qf_axi_stream("qf_axi_stream"); + hls_ComputeCoefficients(xsize, ysize, acs_cfl_stream, ac_coef_cfl_stream, rqf_cfl_stream, cmapx_stream, + cmapb_stream, acs_coeff_stream1, ac_quant_coeff_stream, coeff_axi_stream, acs_axi_stream, + qf_axi_stream); + + // 5. ComputeAllCoeffOrders + count_numzeros(xsize, ysize, acs_coeff_stream1, ac_quant_coeff_stream, used_orders_strm, num_zeros); + + // 6. axi writeout + dc_writeout(ysize, xsize, hls_dc8x8, hls_dc16x16, hls_dc32x32, stream_rectx_dc, stream_recty_dc, dc_coef8x8_stream, + dc_coef16x16_stream, dc_coef32x32_stream); + cfl_writeout(xsize, ysize, cmapx_axi_stream, cmapb_axi_stream, cmap_axi); + ac_coeff_writeout(xsize, ysize, coeff_axi_stream, ac_coef_axiout); + acs_rqf_writeout(xsize, ysize, strategy_all, raw_quant_field_i, acs_axi_stream, qf_axi_stream); +} + +void lossy_acc::lossyEncComp(int config[MAX_NUM_CONFIG], // mm15, input + float config_fl[MAX_NUM_CONFIG], // mm16, input + float* hls_opsin_1, // mm1, input + float* hls_opsin_2, // mm2, input + float* hls_opsin_3, // mm3, input + float* quant_field_row, // mm4, input + float* masking_field_row, // mm5, input + float* aq_map_f, // mm6, input + int8_t* cmap_axi, // mm7, output + int* ac_coef_axiout, // mm8, output + unsigned char* strategy_all, // mm9, output + int* raw_quant_field_i, // mm10, output + uint32_t* hls_order, // mm11, output + float* hls_dc8x8, // mm12, output + float* hls_dc16x16, // mm13, output + float* hls_dc32x32 // mm14, output + ) { +#pragma HLS INTERFACE mode = m_axi bundle = mm1 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_opsin_1 +#pragma HLS INTERFACE mode = m_axi 
bundle = mm2 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_opsin_2 +#pragma HLS INTERFACE mode = m_axi bundle = mm3 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_opsin_3 +#pragma HLS INTERFACE mode = m_axi bundle = mm4 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = quant_field_row +#pragma HLS INTERFACE mode = m_axi bundle = mm5 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = masking_field_row +#pragma HLS INTERFACE mode = m_axi bundle = mm6 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = aq_map_f +#pragma HLS INTERFACE mode = m_axi bundle = mm7 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + TILE_W* TILE_H* 2 port = cmap_axi +#pragma HLS INTERFACE mode = m_axi bundle = mm8 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + ac_coef_axiout +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_W* BLOCK8_H port = strategy_all +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 
max_write_burst_length = 64 max_read_burst_length = 64 depth = \ + BLOCK8_H* BLOCK8_W port = raw_quant_field_i +#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_ORDER port = \ + hls_order +#pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_dc8x8 +#pragma HLS INTERFACE mode = m_axi bundle = mm13 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_dc16x16 +#pragma HLS INTERFACE mode = m_axi bundle = mm14 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + hls_dc32x32 +#pragma HLS INTERFACE mode = m_axi bundle = mm15 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_NUM_CONFIG port = \ + config +#pragma HLS INTERFACE mode = m_axi bundle = mm16 latency = 32 offset = direct num_write_outstanding = \ + 1 num_read_outstanding = 64 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_NUM_CONFIG port = \ + config_fl + + // global config + uint32_t ysize = config[0]; + uint32_t xsize = config[1]; + int masking_field_stride = config[2]; + int quant_field_stride = config[3]; + float butteraugli_target = config_fl[0]; + float cost1 = config_fl[1]; + float inv_global_scale = config_fl[2]; + int32_t num_zeros[3][320]; +#pragma HLS BIND_STORAGE type = ram_2p variable = num_zeros impl = BRAM + + // Non-Dataflow region: initialization zeros + init_numzeros(num_zeros); + + // Dataflow region: enc_compute + hls::stream, 
2> used_orders_strm; + hls_lossy_enc_compute_dataflow(ysize, xsize, masking_field_stride, quant_field_stride, butteraugli_target, cost1, + inv_global_scale, hls_opsin_1, hls_opsin_2, hls_opsin_3, quant_field_row, + masking_field_row, aq_map_f, cmap_axi, ac_coef_axiout, strategy_all, + raw_quant_field_i, hls_dc8x8, hls_dc16x16, hls_dc32x32, num_zeros, used_orders_strm); + + // Non-Dataflow region: compute orders + order_finalize_dataflow(used_orders_strm, num_zeros, hls_order); +} + +// ------------------------------------------------------------ +/** + * @brief Level 2 : kernel implementation for JXL lossy frame encode computing + * + * @param config the int config signal, such as image size, field stride, etc. + * @param config_fl the floating config signal, such as cost, inv_global_scale, etc. + * @param hls_opsin_1 the input RGB image data for channel-1. + * @param hls_opsin_2 the input RGB image data for channel-2. + * @param hls_opsin_3 the input RGB image data for channel-3. + * @param quant_field_row the initial quant_field data. + * @param masking_field_row the initial masking_field data. + * @param aq_map_f the initial adjust quant map data. + * @param cmap_axi the output of color correlation map. + * @param ac_coef_axiout the output of quantized AC coefficients. 
+ * @param strategy_all the output of strategy for each block in image + * @param raw_quant_field_i the output of computed raw_quant_field + * @param hls_order the output of orders for each block in image + * @param hls_dc8x8 the DC coefficients output for 8x8 blocks + * @param hls_dc16x16 the DC coefficients output for 16x16 blocks + * @param hls_dc32x32 the DC coefficients output for 32x32 blocks + */ +// ------------------------------------------------------------ + +void lossy_acc::compute(int* config, + float* config_fl, + float* hls_opsin_1, + float* hls_opsin_2, + float* hls_opsin_3, + float* quant_field_row, + float* masking_field_row, + float* aq_map_f, + int8_t* cmap_axi, + int* ac_coef_axiout, + unsigned char* strategy_all, + int* raw_quant_field_i, + uint32_t* hls_order, + float* hls_dc8x8, + float* hls_dc16x16, + float* hls_dc32x32) { + lossyEncComp(config, config_fl, hls_opsin_1, hls_opsin_2, hls_opsin_3, quant_field_row, masking_field_row, aq_map_f, + cmap_axi, ac_coef_axiout, strategy_all, raw_quant_field_i, hls_order, hls_dc8x8, hls_dc16x16, + hls_dc32x32); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel/hls_lossy_enc_compute.hpp b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel/hls_lossy_enc_compute.hpp new file mode 100644 index 0000000000..3263f1841d --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/kernel/hls_lossy_enc_compute.hpp @@ -0,0 +1,4011 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HLS_LOSSY_ENC_COMPUTE_HPP +#define HLS_LOSSY_ENC_COMPUTE_HPP + +#include "vpp_acc.hpp" + +#include +#include +#include +#include "stddef.h" +#include +#include + +typedef ap_fixed<38, 24> ca_x_t; +typedef ap_fixed<38, 24> cb_x_t; +typedef ap_fixed<38, 18> ca_b_t; +typedef ap_fixed<38, 21> cb_b_t; + +enum Type { + // Regular block size DCT + DCT = 0, + // Encode pixels without transforming + IDENTITY = 1, + // Use 2-by-2 DCT + DCT2X2 = 2, + // Use 4-by-4 DCT + DCT4X4 = 3, + // Use 16-by-16 DCT + DCT16X16 = 4, + // Use 32-by-32 DCT + DCT32X32 = 5 +}; + +template +T DivCeil(T a, size_t b) { + return (a + b - 1) / b; // 8 +} + +template +inline F bitsToF(I in) { + union { + I __I; + F __F; + } __T; + __T.__I = in; + return __T.__F; +} + +template +inline I fToBits(F in) { + union { + I __I; + F __F; + } __T; + __T.__F = in; + return __T.__I; +} + +template +union cast; + +template +union cast { + DT f; + int8_t i; +}; + +template +union cast { + DT f; + int32_t i; +}; + +template +union cast { + DT f; + uint32_t i; +}; + +template +union cast { + DT f; + int64_t i; +}; + +const int PIXEL_W = 2048; +const int PIXEL_H = 2048; +const int FRAME_DIM = 3; +const int ALL_PIXEL = PIXEL_W * PIXEL_H * FRAME_DIM; +const int BLOCK8_W = PIXEL_W / 8; +const int BLOCK8_H = PIXEL_H / 8; +const int BLOCK8_NUM = BLOCK8_W * BLOCK8_H * FRAME_DIM; +const int TILE_W = PIXEL_W / 64; +const int TILE_H = PIXEL_H / 64; +const int MAX_ORDER = 320 * 3 + 1; +const int MAX_NUM_CONFIG = 32; + +const size_t kBlockDim = 8; +const size_t kColorTileDim = 64; +const size_t kDCTBlockSize = 64; +const size_t kEncTileDimInBlocks = 8; +const int kGlobalScaleDenom = 1 << 16; +const size_t kColorTileDimInBlocks = 8; // kColorTileDim / kBlockDim +const int global_scale = 4587; // global_scale_(global_scale) +const float global_scale_float = global_scale * (1.0 / kGlobalScaleDenom); +const 
float inv_global_scale = 1.0 * kGlobalScaleDenom / global_scale; + +static const uint8_t kDefaultColorFactor = 84; +static float color_scale = 1.0f / (uint32_t)kDefaultColorFactor; +static const float kYToBRatio = 1.0f; +static float base_correlation_x = 0.0f; +static float base_correlation_b = kYToBRatio; + +static const float kDefaultQuantBias[4] = { + 1.0f - 0.05465007330715401f, 1.0f - 0.07005449891748593f, 1.0f - 0.049935103337343655f, 0.145f, +}; + +const float qmx8x8[64] = {0, + 3150, + 3139.258544921875, + 2648.63037109375, + 2234.68115234375, + 1885.427490234375, + 1590.758056640625, + 1342.1417236328125, + 3150, + 3150, + 3015.8095703125, + 2576.583984375, + 2188.4150390625, + 1853.965576171875, + 1568.5406494140625, + 1326.029296875, + 3139.258544921875, + 3015.8095703125, + 2726.995361328125, + 2389.616455078125, + 2062.382568359375, + 1765.966552734375, + 1505.3934326171875, + 1279.74853515625, + 2648.63037109375, + 2576.583984375, + 2389.616455078125, + 2144.407470703125, + 1885.427490234375, + 1637.12109375, + 1410.3748779296875, + 1208.7896728515625, + 2234.68115234375, + 2188.4150390625, + 2062.382568359375, + 1885.427490234375, + 1686.2821044921875, + 1485.4266357421875, + 1294.8450927734375, + 1060.5933837890625, + 1885.427490234375, + 1853.965576171875, + 1765.966552734375, + 1637.12109375, + 1485.4266357421875, + 1326.029296875, + 1169.4920654296875, + 785.9630126953125, + 1590.758056640625, + 1568.5406494140625, + 1505.3934326171875, + 1410.3748779296875, + 1294.8450927734375, + 1169.4920654296875, + 838.70172119140625, + 558.03729248046875, + 1342.1417236328125, + 1326.029296875, + 1279.74853515625, + 1208.7896728515625, + 1060.5933837890625, + 785.9630126953125, + 558.03729248046875, + 382.654693603515625}; +const float qmb8x8[64] = {0, + 293.959503173828125, + 169.4699554443359375, + 119.41248321533203125, + 85.33333587646484375, + 85.33333587646484375, + 83.5508270263671875, + 58.871856689453125, + 293.959503173828125, + 
233.598114013671875, + 156.02716064453125, + 112.8175048828125, + 85.33333587646484375, + 85.33333587646484375, + 81.16471099853515625, + 57.425174713134765625, + 169.4699554443359375, + 156.02716064453125, + 126.80493927001953125, + 96.60062408447265625, + 85.33333587646484375, + 85.33333587646484375, + 74.5768890380859375, + 53.37267303466796875, + 119.41248321533203125, + 112.8175048828125, + 96.60062408447265625, + 85.33333587646484375, + 85.33333587646484375, + 85.33333587646484375, + 65.20384979248046875, + 47.455181121826171875, + 85.33333587646484375, + 85.33333587646484375, + 85.33333587646484375, + 85.33333587646484375, + 85.33333587646484375, + 72.55352020263671875, + 54.6778106689453125, + 39.419506072998046875, + 85.33333587646484375, + 85.33333587646484375, + 85.33333587646484375, + 85.33333587646484375, + 72.55352020263671875, + 57.425174713134765625, + 44.331756591796875, + 29.2122058868408203125, + 83.5508270263671875, + 81.16471099853515625, + 74.5768890380859375, + 65.20384979248046875, + 54.6778106689453125, + 44.331756591796875, + 31.1723690032958984375, + 20.7407989501953125, + 58.871856689453125, + 57.425174713134765625, + 53.37267303466796875, + 47.455181121826171875, + 39.419506072998046875, + 29.2122058868408203125, + 20.7407989501953125, + 14.22228240966796875}; +const float qmx16x16[256] = {0, + 0, + 5616.41552734375, + 4437.5478515625, + 3710.523681640625, + 3312.083740234375, + 2956.42822265625, + 2638.9638671875, + 2378.979736328125, + 2146.23095703125, + 1936.2532958984375, + 1722.1861572265625, + 1498.605712890625, + 1304.0516357421875, + 1134.7548828125, + 951.88201904296875, + 0, + 0, + 5312.58251953125, + 4271.09716796875, + 3658.995849609375, + 3275.037109375, + 2928.763916015625, + 2617.745361328125, + 2363.779541015625, + 2134.027099609375, + 1926.335693359375, + 1711.357177734375, + 1489.962646484375, + 1297.1055908203125, + 1129.140380859375, + 946.136962890625, + 5616.41552734375, + 5312.58251953125, + 4620.5927734375, + 
3880.564697265625, + 3516.761474609375, + 3170.294189453125, + 2849.415283203125, + 2562.00634765625, + 2319.431640625, + 2098.26171875, + 1897.1728515625, + 1679.534423828125, + 1464.5052490234375, + 1276.60888671875, + 1112.54638671875, + 929.18414306640625, + 4437.5478515625, + 4271.09716796875, + 3880.564697265625, + 3609.647705078125, + 3312.083740234375, + 3013.74951171875, + 2727.90283203125, + 2474.977294921875, + 2249.396484375, + 2041.3057861328125, + 1850.436279296875, + 1628.6099853515625, + 1423.5849609375, + 1243.5428466796875, + 1077.57275390625, + 901.83697509765625, + 3710.523681640625, + 3658.995849609375, + 3516.761474609375, + 3312.083740234375, + 3073.944580078125, + 2824.097412109375, + 2580.273681640625, + 2363.779541015625, + 2158.580810546875, + 1966.6195068359375, + 1778.0765380859375, + 1561.4259033203125, + 1369.259765625, + 1199.417236328125, + 1031.115478515625, + 865.35723876953125, + 3312.083740234375, + 3275.037109375, + 3170.294189453125, + 3013.74951171875, + 2824.097412109375, + 2617.7451171875, + 2425.913330078125, + 2235.929931640625, + 2052.44384765625, + 1878.2061767578125, + 1679.534423828125, + 1481.3988037109375, + 1304.0516357421875, + 1146.111572265625, + 975.34478759765625, + 821.329833984375, + 2956.42822265625, + 2928.763916015625, + 2849.415283203125, + 2727.90283203125, + 2580.273681640625, + 2425.913330078125, + 2263.03759765625, + 2098.26171875, + 1936.2532958984375, + 1766.65966796875, + 1570.745849609375, + 1392.13525390625, + 1230.6845703125, + 1077.57275390625, + 912.64251708984375, + 771.521240234375, + 2638.9638671875, + 2617.745361328125, + 2562.00634765625, + 2474.977294921875, + 2363.779541015625, + 2235.929931640625, + 2098.26171875, + 1956.3931884765625, + 1813.078369140625, + 1628.6099853515625, + 1456.1728515625, + 1297.1055908203125, + 1151.8544921875, + 993.46435546875, + 845.40533447265625, + 717.73773193359375, + 2378.979736328125, + 2363.779541015625, + 2319.431640625, + 2249.396484375, + 
2158.580810546875, + 2052.44384765625, + 1936.2532958984375, + 1813.078369140625, + 1648.672119140625, + 1489.962646484375, + 1339.6640625, + 1199.41748046875, + 1057.3155517578125, + 907.21795654296875, + 775.87847900390625, + 661.70928955078125, + 2146.23095703125, + 2134.027099609375, + 2098.26171875, + 2041.3057861328125, + 1966.6195068359375, + 1878.2061767578125, + 1766.65966796875, + 1628.6099853515625, + 1489.962646484375, + 1354.3355712890625, + 1224.331787109375, + 1098.37109375, + 951.88201904296875, + 821.329833984375, + 706.04150390625, + 604.9959716796875, + 1936.2532958984375, + 1926.335693359375, + 1897.1728515625, + 1850.436279296875, + 1778.0765380859375, + 1679.534423828125, + 1570.745849609375, + 1456.1728515625, + 1339.6640625, + 1224.331787109375, + 1112.546142578125, + 975.344482421875, + 850.33416748046875, + 737.81219482421875, + 637.54150390625, + 531.86663818359375, + 1722.1861572265625, + 1711.357177734375, + 1679.534423828125, + 1628.6099853515625, + 1561.4259033203125, + 1481.3988037109375, + 1392.13525390625, + 1297.1055908203125, + 1199.41748046875, + 1098.37109375, + 975.344482421875, + 860.30999755859375, + 754.41485595703125, + 658.18359375, + 565.16876220703125, + 455.065155029296875, + 1498.605712890625, + 1489.962646484375, + 1464.5052490234375, + 1423.5849609375, + 1369.259765625, + 1304.0516357421875, + 1230.6845703125, + 1151.8544921875, + 1057.3155517578125, + 951.88201904296875, + 850.33416748046875, + 754.41485595703125, + 665.2603759765625, + 582.76104736328125, + 475.56475830078125, + 385.666412353515625, + 1304.0516357421875, + 1297.1055908203125, + 1276.60888671875, + 1243.5428466796875, + 1199.417236328125, + 1146.111572265625, + 1077.57275390625, + 993.46435546875, + 907.21795654296875, + 821.329833984375, + 737.81219482421875, + 658.18359375, + 582.76104736328125, + 482.643035888671875, + 396.77593994140625, + 324.0394287109375, + 1134.7548828125, + 1129.140380859375, + 1112.54638671875, + 1077.57275390625, + 
1031.115478515625, + 975.34478759765625, + 912.64251708984375, + 845.40533447265625, + 775.87847900390625, + 706.04150390625, + 637.54150390625, + 565.16876220703125, + 475.56475830078125, + 396.77593994140625, + 328.516326904296875, + 270.136077880859375, + 951.88201904296875, + 946.136962890625, + 929.18414306640625, + 901.83697509765625, + 865.35723876953125, + 821.329833984375, + 771.521240234375, + 717.73773193359375, + 661.70928955078125, + 604.9959716796875, + 531.86663818359375, + 455.065155029296875, + 385.666412353515625, + 324.0394287109375, + 270.136077880859375, + 223.608489990234375}; +const float qmb16x16[256] = {0, + 0, + 615.61383056640625, + 448.953399658203125, + 337.930267333984375, + 263.80755615234375, + 205.943115234375, + 160.7708892822265625, + 141.832733154296875, + 126.30164337158203125, + 112.47124481201171875, + 100.76338958740234375, + 91.12081146240234375, + 82.40099334716796875, + 74.5156097412109375, + 58.896236419677734375, + 0, + 0, + 571.40203857421875, + 426.5322265625, + 327.784393310546875, + 257.417816162109375, + 201.76556396484375, + 157.9664306640625, + 140.8123321533203125, + 125.4929656982421875, + 111.822540283203125, + 100.30467987060546875, + 90.7403564453125, + 82.08327484130859375, + 74.2487335205078125, + 58.3933258056640625, + 615.61383056640625, + 571.40203857421875, + 473.94189453125, + 372.602783203125, + 300.644775390625, + 239.809600830078125, + 190.039825439453125, + 154.1826629638671875, + 137.8400421142578125, + 123.12636566162109375, + 109.9174652099609375, + 98.95200347900390625, + 89.61621856689453125, + 81.14296722412109375, + 73.45781707763671875, + 56.916744232177734375, + 448.953399658203125, + 426.5322265625, + 372.602783203125, + 318.224456787109375, + 263.80755615234375, + 214.746795654296875, + 172.8172607421875, + 148.2958526611328125, + 133.160797119140625, + 119.3681488037109375, + 106.87210845947265625, + 96.77252197265625, + 87.79785919189453125, + 79.6171722412109375, + 70.20831298828125, 
+ 54.558437347412109375, + 337.930267333984375, + 327.784393310546875, + 300.644775390625, + 263.80755615234375, + 224.2069549560546875, + 186.3783111572265625, + 155.42156982421875, + 140.8123321533203125, + 127.12058258056640625, + 114.4600982666015625, + 103.11833953857421875, + 93.86804962158203125, + 85.36130523681640625, + 77.5634307861328125, + 65.95937347412109375, + 51.458751678466796875, + 263.80755615234375, + 257.417816162109375, + 239.809600830078125, + 214.746795654296875, + 186.3783111572265625, + 157.966400146484375, + 144.9885406494140625, + 132.263153076171875, + 120.10205078125, + 108.6804351806640625, + 98.95200347900390625, + 90.36280059814453125, + 82.40099334716796875, + 75.0543060302734375, + 60.963199615478515625, + 47.78974151611328125, + 205.943115234375, + 201.76556396484375, + 190.039825439453125, + 172.8172607421875, + 155.42156982421875, + 144.9885406494140625, + 134.070770263671875, + 123.12636566162109375, + 112.47124481201171875, + 102.63896942138671875, + 94.27301025390625, + 86.3905029296875, + 79.02082061767578125, + 70.20831298828125, + 55.486751556396484375, + 43.7368011474609375, + 160.7708892822265625, + 157.9664306640625, + 154.1826629638671875, + 148.2958526611328125, + 140.8123321533203125, + 132.263153076171875, + 123.12636566162109375, + 113.789886474609375, + 104.58271026611328125, + 96.77252197265625, + 89.247100830078125, + 82.08327484130859375, + 75.3261566162109375, + 62.573711395263671875, + 49.786182403564453125, + 39.48137664794921875, + 141.832733154296875, + 140.8123321533203125, + 137.8400421142578125, + 133.160797119140625, + 127.12058258056640625, + 120.10205078125, + 112.47124481201171875, + 104.58271026611328125, + 97.63336944580078125, + 90.7403564453125, + 84.02266693115234375, + 77.563446044921875, + 68.3460235595703125, + 55.02014923095703125, + 44.087116241455078125, + 35.18759918212890625, + 126.30164337158203125, + 125.4929656982421875, + 123.12636566162109375, + 119.3681488037109375, + 
114.4600982666015625, + 108.6804351806640625, + 102.63896942138671875, + 96.77252197265625, + 90.7403564453125, + 84.68727874755859375, + 78.725555419921875, + 72.135589599609375, + 58.896236419677734375, + 47.78974151611328125, + 38.57308197021484375, + 30.9930629730224609375, + 112.47124481201171875, + 111.822540283203125, + 109.9174652099609375, + 106.87210845947265625, + 103.11833953857421875, + 98.95200347900390625, + 94.27301025390625, + 89.247100830078125, + 84.02266693115234375, + 78.725555419921875, + 73.4578094482421875, + 60.96317291259765625, + 50.197849273681640625, + 41.054691314697265625, + 33.38103485107421875, + 24.7806758880615234375, + 100.76338958740234375, + 100.30467987060546875, + 98.95200347900390625, + 96.77252197265625, + 93.86804962158203125, + 90.36280059814453125, + 86.3905029296875, + 82.08327484130859375, + 77.563446044921875, + 72.135589599609375, + 60.96317291259765625, + 51.034107208251953125, + 42.369472503662109375, + 34.922313690185546875, + 27.726070404052734375, + 18.572216033935546875, + 91.12081146240234375, + 90.7403564453125, + 89.61621856689453125, + 87.79785919189453125, + 85.36130523681640625, + 82.40099334716796875, + 79.02082061767578125, + 75.3261566162109375, + 68.3460235595703125, + 58.896236419677734375, + 50.197849273681640625, + 42.369472503662109375, + 35.455394744873046875, + 29.34313201904296875, + 20.1489048004150390625, + 13.67640781402587890625, + 82.40099334716796875, + 82.08327484130859375, + 81.14296722412109375, + 79.6171722412109375, + 77.5634307861328125, + 75.0543060302734375, + 70.20831298828125, + 62.573711395263671875, + 55.02014923095703125, + 47.78974151611328125, + 41.054691314697265625, + 34.922313690185546875, + 29.34313201904296875, + 20.706996917724609375, + 14.41384983062744140625, + 9.911548614501953125, + 74.5156097412109375, + 74.2487335205078125, + 73.45781707763671875, + 70.20831298828125, + 65.95937347412109375, + 60.963199615478515625, + 55.486751556396484375, + 
49.786182403564453125, + 44.087116241455078125, + 38.57308197021484375, + 33.38103485107421875, + 27.726070404052734375, + 20.1489048004150390625, + 14.41384983062744140625, + 10.16626739501953125, + 7.0798015594482421875, + 58.896236419677734375, + 58.3933258056640625, + 56.916744232177734375, + 54.558437347412109375, + 51.458751678466796875, + 47.78974151611328125, + 43.7368011474609375, + 39.48137664794921875, + 35.18759918212890625, + 30.9930629730224609375, + 24.7806758880615234375, + 18.572216033935546875, + 13.67640781402587890625, + 9.911548614501953125, + 7.0798015594482421875, + 4.99121952056884765625}; +const float qmx32x32[1024] = {0, + 0, + 0, + 0, + 10016.1787109375, + 8949.0185546875, + 7995.55859375, + 7162.60107421875, + 6422.4755859375, + 5758.8291015625, + 5163.75830078125, + 4630.1767578125, + 4151.732421875, + 3734.188232421875, + 3370.10986328125, + 3041.52880859375, + 2744.98388671875, + 2477.35107421875, + 2235.813232421875, + 2038.7496337890625, + 1932.1097412109375, + 1831.0474853515625, + 1735.2716064453125, + 1644.505615234375, + 1558.4873046875, + 1476.968017578125, + 1386.82666015625, + 1301.5286865234375, + 1221.4771728515625, + 1146.34912109375, + 1075.8421630859375, + 1009.6715087890625, + 0, + 0, + 0, + 0, + 9878.224609375, + 8849.744140625, + 7921.35595703125, + 7107.29541015625, + 6379.01171875, + 5724.1455078125, + 5135.74365234375, + 4607.32568359375, + 4132.939453125, + 3719.505126953125, + 3357.800537109375, + 3031.1572265625, + 2736.20654296875, + 2469.894287109375, + 2229.455810546875, + 2035.871337890625, + 1929.51806640625, + 1828.7081298828125, + 1733.1553955078125, + 1642.5870361328125, + 1556.7445068359375, + 1475.38232421875, + 1385.1351318359375, + 1300, + 1220.09375, + 1145.095703125, + 1074.7049560546875, + 1008.638671875, + 0, + 0, + 0, + 0, + 9497.3408203125, + 8569.009765625, + 7710.1953125, + 6947.08251953125, + 6252.30078125, + 5622.568359375, + 5053.4169921875, + 4539.9931640625, + 4077.45068359375, + 
3676.055419921875, + 3321.326416015625, + 3000.390625, + 2710.1435546875, + 2447.7333984375, + 2210.550537109375, + 2027.2841796875, + 1921.7830810546875, + 1821.7237548828125, + 1726.834716796875, + 1636.8553466796875, + 1551.537353515625, + 1470.240966796875, + 1380.0811767578125, + 1295.431396484375, + 1215.958251953125, + 1141.3475341796875, + 1071.3038330078125, + 1005.5491943359375, + 0, + 0, + 0, + 0, + 8949.0185546875, + 8149.28955078125, + 7394.22412109375, + 6697.34423828125, + 6052.48828125, + 5461.01953125, + 4921.63427734375, + 4431.66796875, + 3987.818603515625, + 3605.5732421875, + 3262.003173828125, + 2950.239990234375, + 2667.58154296875, + 2411.486328125, + 2179.5849609375, + 2013.1302490234375, + 1909.0233154296875, + 1810.194091796875, + 1716.393798828125, + 1627.38232421875, + 1542.9271240234375, + 1460.985595703125, + 1371.7230224609375, + 1287.8731689453125, + 1209.1142578125, + 1135.1424560546875, + 1065.6719970703125, + 1000.4320068359375, + 10016.1787109375, + 9878.224609375, + 9497.3408203125, + 8949.0185546875, + 8310.703125, + 7644.4052734375, + 6999.5673828125, + 6379.01171875, + 5793.93896484375, + 5249.58837890625, + 4747.62841796875, + 4287.62841796875, + 3871.052001953125, + 3510.74609375, + 3181.8896484375, + 2882.297607421875, + 2609.764404296875, + 2362.1328125, + 2137.337890625, + 1993.63818359375, + 1891.4306640625, + 1794.2806396484375, + 1701.970703125, + 1614.285400390625, + 1531.013916015625, + 1448.1861572265625, + 1360.1573486328125, + 1277.407958984375, + 1199.633056640625, + 1126.5428466796875, + 1057.8629150390625, + 993.33349609375, + 8949.0185546875, + 8849.744140625, + 8569.009765625, + 8149.28955078125, + 7644.4052734375, + 7107.29541015625, + 6556.77978515625, + 6014.109375, + 5492.59033203125, + 4999.91259765625, + 4539.9931640625, + 4114.296875, + 3734.188232421875, + 3394.959228515625, + 3083.605224609375, + 2798.61328125, + 2538.306396484375, + 2300.954345703125, + 2084.83349609375, + 1969.1136474609375, + 
1869.26220703125, + 1774.201171875, + 1683.7496337890625, + 1597.7220458984375, + 1515.93359375, + 1431.994873046875, + 1345.5145263671875, + 1264.1485595703125, + 1187.612060546875, + 1115.6322021484375, + 1047.949462890625, + 984.94073486328125, + 7995.55859375, + 7921.35595703125, + 7710.1953125, + 7394.22412109375, + 6999.5673828125, + 6556.77978515625, + 6091.37744140625, + 5622.568359375, + 5163.75830078125, + 4723.701171875, + 4307.68896484375, + 3918.65771484375, + 3578.03271484375, + 3262.003173828125, + 2970.1337890625, + 2701.544189453125, + 2455.08642578125, + 2229.455810546875, + 2041.63623046875, + 1939.925048828125, + 1842.8297119140625, + 1750.22119140625, + 1661.9576416015625, + 1577.8870849609375, + 1497.8533935546875, + 1412.5999755859375, + 1327.9566650390625, + 1248.234619140625, + 1173.171875, + 1102.515625, + 1036.0238037109375, + 975.33502197265625, + 7162.60107421875, + 7107.29541015625, + 6947.08251953125, + 6697.34423828125, + 6379.01171875, + 6014.109375, + 5622.568359375, + 5220.6748046875, + 4820.7763671875, + 4431.66796875, + 4059.243408203125, + 3719.505126953125, + 3407.500244140625, + 3115.7998046875, + 2844.60986328125, + 2593.611328125, + 2362.133056640625, + 2149.279541015625, + 2004.73095703125, + 1906.490966796875, + 1812.4891357421875, + 1722.6444091796875, + 1636.8553466796875, + 1555.00537109375, + 1476.9681396484375, + 1390.2191162109375, + 1307.6717529296875, + 1229.8292236328125, + 1156.454345703125, + 1087.3167724609375, + 1022.19244384765625, + 964.1702880859375, + 6422.4755859375, + 6379.01171875, + 6252.30078125, + 6052.48828125, + 5793.93896484375, + 5492.59033203125, + 5163.75830078125, + 4820.7763671875, + 4474.4306640625, + 4132.939453125, + 3809.151123046875, + 3510.74560546875, + 3227.259765625, + 2960.15966796875, + 2710.1435546875, + 2477.3515625, + 2261.5234375, + 2062.12646484375, + 1963.7435302734375, + 1869.26220703125, + 1778.6273193359375, + 1691.8035888671875, + 1608.7301025390625, + 1529.3251953125, + 
1450.00341796875, + 1365.0950927734375, + 1284.86962890625, + 1209.1142578125, + 1137.61865234375, + 1070.173828125, + 1006.57757568359375, + 951.533935546875, + 5758.8291015625, + 5724.1455078125, + 5622.568359375, + 5461.01953125, + 5249.58837890625, + 4999.91259765625, + 4723.701171875, + 4431.66796875, + 4132.939453125, + 3839.8818359375, + 3564.3984375, + 3297.380859375, + 3041.52880859375, + 2798.61328125, + 2569.677490234375, + 2355.211669921875, + 2155.288330078125, + 2013.1302490234375, + 1919.218017578125, + 1828.7081298828125, + 1741.64990234375, + 1658.050537109375, + 1577.8870849609375, + 1501.1123046875, + 1419.6033935546875, + 1337.488037109375, + 1259.777099609375, + 1186.287841796875, + 1116.836181640625, + 1051.2381591796875, + 989.35601806640625, + 937.5218505859375, + 5163.75830078125, + 5135.74365234375, + 5053.4169921875, + 4921.63427734375, + 4747.62841796875, + 4539.9931640625, + 4307.68896484375, + 4059.243408203125, + 3809.151123046875, + 3564.3984375, + 3321.326416015625, + 3083.605224609375, + 2853.95654296875, + 2634.296142578125, + 2425.8837890625, + 2229.455810546875, + 2053.26318359375, + 1961.06884765625, + 1871.700927734375, + 1785.30419921875, + 1701.970703125, + 1621.7459716796875, + 1544.6424560546875, + 1470.240966796875, + 1386.82666015625, + 1307.6715087890625, + 1232.6331787109375, + 1161.558837890625, + 1094.29150390625, + 1030.6707763671875, + 972.74078369140625, + 922.236572265625, + 4630.1767578125, + 4607.32568359375, + 4539.9931640625, + 4431.66796875, + 4287.62841796875, + 4114.296875, + 3918.65771484375, + 3719.505126953125, + 3510.74560546875, + 3297.380859375, + 3083.605224609375, + 2872.7998046875, + 2667.58154296875, + 2469.89453125, + 2281.107177734375, + 2102.114501953125, + 1993.638427734375, + 1906.490966796875, + 1821.723876953125, + 1739.5189208984375, + 1660.0018310546875, + 1583.2529296875, + 1509.314697265625, + 1431.994873046875, + 1351.9913330078125, + 1275.923828125, + 1203.682373046875, + 
1135.1424560546875, + 1070.173828125, + 1008.638671875, + 954.87835693359375, + 905.78668212890625, + 4151.732421875, + 4132.939453125, + 4077.45068359375, + 3987.818603515625, + 3871.052001953125, + 3734.188232421875, + 3578.03271484375, + 3407.500244140625, + 3227.259765625, + 3041.52880859375, + 2853.95654296875, + 2667.58154296875, + 2484.843994140625, + 2307.630126953125, + 2137.337890625, + 2015.9456787109375, + 1932.1097412109375, + 1849.968017578125, + 1769.796142578125, + 1691.8035888671875, + 1616.1446533203125, + 1542.9271240234375, + 1472.10400390625, + 1391.9202880859375, + 1315.4149169921875, + 1242.5257568359375, + 1173.171875, + 1107.2579345703125, + 1044.676513671875, + 985.8211669921875, + 935.8944091796875, + 888.28326416015625, + 3734.188232421875, + 3719.505126953125, + 3676.055419921875, + 3605.5732421875, + 3510.74609375, + 3394.959228515625, + 3262.003173828125, + 3115.7998046875, + 2960.15966796875, + 2798.61328125, + 2634.296142578125, + 2469.89453125, + 2307.630126953125, + 2149.279541015625, + 2027.2845458984375, + 1947.802001953125, + 1869.26220703125, + 1792.02880859375, + 1716.393798828125, + 1642.5870361328125, + 1570.7830810546875, + 1501.1123046875, + 1426.662841796875, + 1350.36767578125, + 1277.407958984375, + 1207.7525634765625, + 1141.3475341796875, + 1078.122314453125, + 1017.99298095703125, + 964.1702880859375, + 915.9161376953125, + 869.84039306640625, + 3370.10986328125, + 3357.800537109375, + 3321.326416015625, + 3262.003173828125, + 3181.8896484375, + 3083.605224609375, + 2970.1337890625, + 2844.60986328125, + 2710.1435546875, + 2569.677490234375, + 2425.8837890625, + 2281.107177734375, + 2137.337890625, + 2027.2845458984375, + 1953.087890625, + 1879.05322265625, + 1805.62060546875, + 1733.1553955078125, + 1661.9576416015625, + 1592.2684326171875, + 1524.27880859375, + 1455.4775390625, + 1380.0811767578125, + 1307.6715087890625, + 1238.270751953125, + 1171.8729248046875, + 1108.4483642578125, + 1047.949462890625, + 
990.31439208984375, + 941.60955810546875, + 895.07000732421875, + 850.57257080078125, + 3041.52880859375, + 3031.1572265625, + 3000.390625, + 2950.239990234375, + 2882.297607421875, + 2798.61328125, + 2701.544189453125, + 2593.611328125, + 2477.3515625, + 2355.211669921875, + 2229.455810546875, + 2102.114501953125, + 2015.9456787109375, + 1947.802001953125, + 1879.05322265625, + 1810.194091796875, + 1741.64990234375, + 1673.7799072265625, + 1606.8861083984375, + 1541.215087890625, + 1476.9681396484375, + 1403.9237060546875, + 1332.708740234375, + 1264.1485595703125, + 1198.2880859375, + 1135.1424560546875, + 1074.7049560546875, + 1016.9471435546875, + 965.0220947265625, + 918.27838134765625, + 873.48193359375, + 830.5924072265625, + 2744.98388671875, + 2736.20654296875, + 2710.1435546875, + 2667.58154296875, + 2609.764404296875, + 2538.306396484375, + 2455.08642578125, + 2362.133056640625, + 2261.5234375, + 2155.288330078125, + 2053.26318359375, + 1993.638427734375, + 1932.1097412109375, + 1869.26220703125, + 1805.62060546875, + 1741.64990234375, + 1677.7550048828125, + 1614.285400390625, + 1551.537353515625, + 1489.759765625, + 1421.3629150390625, + 1351.9913330078125, + 1284.869384765625, + 1220.09375, + 1157.7274169921875, + 1097.8046875, + 1040.3365478515625, + 985.8211669921875, + 939.15362548828125, + 894.31201171875, + 851.27447509765625, + 810.0118408203125, + 2477.35107421875, + 2469.894287109375, + 2447.7333984375, + 2411.486328125, + 2362.1328125, + 2300.954345703125, + 2229.455810546875, + 2149.279541015625, + 2062.12646484375, + 2013.1302490234375, + 1961.06884765625, + 1906.490966796875, + 1849.968017578125, + 1792.02880859375, + 1733.1553955078125, + 1673.7799072265625, + 1614.285400390625, + 1555.00537109375, + 1496.228515625, + 1431.994873046875, + 1365.0950927734375, + 1300, + 1236.8575439453125, + 1175.777099609375, + 1116.836181640625, + 1060.0849609375, + 1005.5491943359375, + 957.398681640625, + 912.78082275390625, + 869.84039306640625, + 
828.56634521484375, + 788.9383544921875, + 2235.813232421875, + 2229.455810546875, + 2210.550537109375, + 2179.5849609375, + 2137.337890625, + 2084.83349609375, + 2041.63623046875, + 2004.73095703125, + 1963.7435302734375, + 1919.218017578125, + 1871.700927734375, + 1821.723876953125, + 1769.796142578125, + 1716.393798828125, + 1661.9576416015625, + 1606.8861083984375, + 1551.537353515625, + 1496.228515625, + 1435.5677490234375, + 1371.7230224609375, + 1309.21435546875, + 1248.234619140625, + 1188.9384765625, + 1131.4447021484375, + 1075.8421630859375, + 1022.19244384765625, + 972.74078369140625, + 928.625244140625, + 886.03875732421875, + 844.98834228515625, + 805.47149658203125, + 767.47625732421875, + 2038.7496337890625, + 2035.871337890625, + 2027.2841796875, + 2013.1302490234375, + 1993.63818359375, + 1969.1136474609375, + 1939.925048828125, + 1906.490966796875, + 1869.26220703125, + 1828.7081298828125, + 1785.30419921875, + 1739.5189208984375, + 1691.8035888671875, + 1642.5870361328125, + 1592.2684326171875, + 1541.215087890625, + 1489.759765625, + 1431.994873046875, + 1371.7230224609375, + 1312.3087158203125, + 1253.9849853515625, + 1196.9451904296875, + 1141.3475341796875, + 1087.3167724609375, + 1034.9495849609375, + 984.94073486328125, + 941.60955810546875, + 899.6387939453125, + 859.05474853515625, + 819.87261962890625, + 782.09710693359375, + 745.7244873046875, + 1932.1097412109375, + 1929.51806640625, + 1921.7830810546875, + 1909.0233154296875, + 1891.4306640625, + 1869.26220703125, + 1842.8297119140625, + 1812.4891357421875, + 1778.6273193359375, + 1741.64990234375, + 1701.970703125, + 1660.0018310546875, + 1616.1446533203125, + 1570.7830810546875, + 1524.27880859375, + 1476.9681396484375, + 1421.3629150390625, + 1365.0950927734375, + 1309.21435546875, + 1253.9849853515625, + 1199.633056640625, + 1146.34912109375, + 1094.29150390625, + 1043.5887451171875, + 994.34295654296875, + 951.5340576171875, + 910.440185546875, + 870.5670166015625, + 
831.9473876953125, + 794.60296630859375, + 758.54522705078125, + 723.7767333984375, + 1831.0474853515625, + 1828.7081298828125, + 1821.7237548828125, + 1810.194091796875, + 1794.2806396484375, + 1774.201171875, + 1750.22119140625, + 1722.6444091796875, + 1691.8035888671875, + 1658.050537109375, + 1621.7459716796875, + 1583.2529296875, + 1542.9271240234375, + 1501.1123046875, + 1455.4775390625, + 1403.9237060546875, + 1351.9913330078125, + 1300, + 1248.234619140625, + 1196.9451904296875, + 1146.34912109375, + 1096.6314697265625, + 1047.949462890625, + 1000.4320068359375, + 958.2410888671875, + 918.27825927734375, + 879.35675048828125, + 841.5264892578125, + 804.8258056640625, + 769.28125, + 734.91033935546875, + 701.72100830078125, + 1735.2716064453125, + 1733.1553955078125, + 1726.834716796875, + 1716.393798828125, + 1701.970703125, + 1683.7496337890625, + 1661.9576416015625, + 1636.8553466796875, + 1608.7301025390625, + 1577.8870849609375, + 1544.6424560546875, + 1509.314697265625, + 1472.10400390625, + 1426.662841796875, + 1380.0811767578125, + 1332.708740234375, + 1284.869384765625, + 1236.8575439453125, + 1188.9384765625, + 1141.3475341796875, + 1094.29150390625, + 1047.949462890625, + 1002.47418212890625, + 961.62213134765625, + 923.03155517578125, + 885.292236328125, + 848.471923828125, + 812.6236572265625, + 777.789794921875, + 744.00146484375, + 711.28009033203125, + 684.97052001953125, + 1644.505615234375, + 1642.5870361328125, + 1636.8553466796875, + 1627.38232421875, + 1614.285400390625, + 1597.7220458984375, + 1577.8870849609375, + 1555.00537109375, + 1529.3251953125, + 1501.1123046875, + 1470.240966796875, + 1431.994873046875, + 1391.9202880859375, + 1350.36767578125, + 1307.6715087890625, + 1264.1485595703125, + 1220.09375, + 1175.777099609375, + 1131.4447021484375, + 1087.3167724609375, + 1043.5887451171875, + 1000.4320068359375, + 961.62213134765625, + 924.62445068359375, + 888.28326416015625, + 852.68096923828125, + 817.8857421875, + 
783.95404052734375, + 750.92999267578125, + 718.84820556640625, + 690.50970458984375, + 669.78717041015625, + 1558.4873046875, + 1556.7445068359375, + 1551.537353515625, + 1542.9271240234375, + 1531.013916015625, + 1515.93359375, + 1497.8533935546875, + 1476.9681396484375, + 1450.00341796875, + 1419.6033935546875, + 1386.82666015625, + 1351.9913330078125, + 1315.4149169921875, + 1277.407958984375, + 1238.270751953125, + 1198.2880859375, + 1157.7274169921875, + 1116.836181640625, + 1075.8421630859375, + 1034.9495849609375, + 994.34295654296875, + 958.2410888671875, + 923.03155517578125, + 888.28326416015625, + 854.0911865234375, + 820.53631591796875, + 787.68792724609375, + 755.60260009765625, + 724.32769775390625, + 694.7138671875, + 674.44976806640625, + 654.522705078125, + 1476.968017578125, + 1475.38232421875, + 1470.240966796875, + 1460.985595703125, + 1448.1861572265625, + 1431.994873046875, + 1412.5999755859375, + 1390.2191162109375, + 1365.0950927734375, + 1337.488037109375, + 1307.6715087890625, + 1275.923828125, + 1242.5257568359375, + 1207.7525634765625, + 1171.8729248046875, + 1135.1424560546875, + 1097.8046875, + 1060.0849609375, + 1022.19244384765625, + 984.94073486328125, + 951.5340576171875, + 918.27825927734375, + 885.292236328125, + 852.68096923828125, + 820.53631591796875, + 788.9383544921875, + 757.955322265625, + 727.6448974609375, + 698.0548095703125, + 677.8135986328125, + 658.3643798828125, + 639.217041015625, + 1386.82666015625, + 1385.1351318359375, + 1380.0811767578125, + 1371.7230224609375, + 1360.1573486328125, + 1345.5145263671875, + 1327.9566650390625, + 1307.6717529296875, + 1284.86962890625, + 1259.777099609375, + 1232.6331787109375, + 1203.682373046875, + 1173.171875, + 1141.3475341796875, + 1108.4483642578125, + 1074.7049560546875, + 1040.3365478515625, + 1005.5491943359375, + 972.74078369140625, + 941.60955810546875, + 910.440185546875, + 879.35675048828125, + 848.471923828125, + 817.8857421875, + 787.68792724609375, + 
757.955322265625, + 728.75579833984375, + 700.14642333984375, + 679.84564208984375, + 660.9462890625, + 642.2919921875, + 623.90673828125, + 1301.5286865234375, + 1300, + 1295.431396484375, + 1287.8731689453125, + 1277.407958984375, + 1264.1485595703125, + 1248.234619140625, + 1229.8292236328125, + 1209.1142578125, + 1186.287841796875, + 1161.558837890625, + 1135.1424560546875, + 1107.2579345703125, + 1078.122314453125, + 1047.949462890625, + 1016.9471435546875, + 985.8211669921875, + 957.398681640625, + 928.625244140625, + 899.6387939453125, + 870.5670166015625, + 841.5264892578125, + 812.6236572265625, + 783.95404052734375, + 755.60260009765625, + 727.6448974609375, + 700.14642333984375, + 680.52508544921875, + 662.2437744140625, + 644.1488037109375, + 626.26806640625, + 608.6259765625, + 1221.4771728515625, + 1220.09375, + 1215.958251953125, + 1209.1142578125, + 1199.633056640625, + 1187.612060546875, + 1173.171875, + 1156.454345703125, + 1137.61865234375, + 1116.836181640625, + 1094.29150390625, + 1070.173828125, + 1044.676513671875, + 1017.99298095703125, + 990.31439208984375, + 965.0220947265625, + 939.15362548828125, + 912.78082275390625, + 886.03875732421875, + 859.05474853515625, + 831.9473876953125, + 804.8258056640625, + 777.789794921875, + 750.92999267578125, + 724.32769775390625, + 698.0548095703125, + 679.84564208984375, + 662.2437744140625, + 644.769775390625, + 627.45428466796875, + 610.32489013671875, + 593.4061279296875, + 1146.34912109375, + 1145.095703125, + 1141.3475341796875, + 1135.1424560546875, + 1126.5428466796875, + 1115.6322021484375, + 1102.515625, + 1087.3167724609375, + 1070.173828125, + 1051.2381591796875, + 1030.6707763671875, + 1008.638671875, + 985.8211669921875, + 964.1702880859375, + 941.60955810546875, + 918.27838134765625, + 894.31201171875, + 869.84039306640625, + 844.98834228515625, + 819.87261962890625, + 794.60296630859375, + 769.28125, + 744.00146484375, + 718.84820556640625, + 694.7138671875, + 677.8135986328125, + 
660.9462890625, + 644.1488037109375, + 627.45428466796875, + 610.89300537109375, + 594.491943359375, + 578.27581787109375, + 1075.8421630859375, + 1074.7049560546875, + 1071.3038330078125, + 1065.6719970703125, + 1057.8629150390625, + 1047.949462890625, + 1036.0238037109375, + 1022.19244384765625, + 1006.57757568359375, + 989.35601806640625, + 972.74078369140625, + 954.87835693359375, + 935.8944091796875, + 915.9161376953125, + 895.07000732421875, + 873.48193359375, + 851.27447509765625, + 828.56634521484375, + 805.47149658203125, + 782.09710693359375, + 758.54522705078125, + 734.91033935546875, + 711.28009033203125, + 690.50970458984375, + 674.44976806640625, + 658.3643798828125, + 642.2919921875, + 626.26806640625, + 610.32489013671875, + 594.491943359375, + 578.7960205078125, + 563.26104736328125, + 1009.6715087890625, + 1008.638671875, + 1005.5491943359375, + 1000.4320068359375, + 993.33349609375, + 984.94073486328125, + 975.33502197265625, + 964.1702880859375, + 951.533935546875, + 937.5218505859375, + 922.236572265625, + 905.78668212890625, + 888.28326416015625, + 869.84039306640625, + 850.57257080078125, + 830.5924072265625, + 810.0118408203125, + 788.9383544921875, + 767.47625732421875, + 745.7244873046875, + 723.7767333984375, + 701.72100830078125, + 684.97052001953125, + 669.78717041015625, + 654.522705078125, + 639.217041015625, + 623.90673828125, + 608.6259765625, + 593.4061279296875, + 578.27581787109375, + 563.26104736328125, + 548.38555908203125}; + +const float qmb32x32[1024] = {0, + 0, + 0, + 0, + 1554.123779296875, + 1242.53955078125, + 993.424560546875, + 821.7386474609375, + 688.02374267578125, + 576.06719970703125, + 482.32843017578125, + 403.842987060546875, + 338.12896728515625, + 283.2335205078125, + 237.367095947265625, + 198.92822265625, + 166.714080810546875, + 139.71661376953125, + 117.09114837646484375, + 100.3662261962890625, + 93.5875701904296875, + 87.26671600341796875, + 81.37277984619140625, + 75.876922607421875, + 
70.75225830078125, + 65.97368621826171875, + 62.470378875732421875, + 59.2027587890625, + 56.10607147216796875, + 53.17134857177734375, + 50.390140533447265625, + 47.75440216064453125, + 0, + 0, + 0, + 0, + 1511.89892578125, + 1215.3125, + 975.19708251953125, + 811.43212890625, + 680.45819091796875, + 570.4285888671875, + 478.07489013671875, + 400.602691650390625, + 335.640289306640625, + 281.318328857421875, + 235.8760223388671875, + 197.7614898681640625, + 165.797119140625, + 138.9931640625, + 116.51837158203125, + 100.18183135986328125, + 93.42420196533203125, + 87.1216278076171875, + 81.24362945556640625, + 75.76171875, + 70.6492919921875, + 65.8815155029296875, + 62.4058837890625, + 59.143909454345703125, + 56.05228424072265625, + 53.12213897705078125, + 50.345058441162109375, + 47.71305084228515625, + 0, + 0, + 0, + 0, + 1398.31689453125, + 1139.9393310546875, + 926.46905517578125, + 781.85968017578125, + 658.5869140625, + 554.03814697265625, + 465.6590576171875, + 391.11358642578125, + 328.3336181640625, + 275.68292236328125, + 231.480926513671875, + 194.317352294921875, + 163.086822509765625, + 136.85247802734375, + 114.822052001953125, + 99.632171630859375, + 92.93701934814453125, + 86.6887664794921875, + 80.85819244384765625, + 75.4178009033203125, + 70.34185791015625, + 65.63607025146484375, + 62.2131195068359375, + 58.967945098876953125, + 55.891448974609375, + 52.97493743896484375, + 50.21018218994140625, + 47.589336395263671875, + 0, + 0, + 0, + 0, + 1242.53955078125, + 1031.720703125, + 865.44610595703125, + 736.6123046875, + 624.660888671875, + 528.352294921875, + 446.04833984375, + 376.0323486328125, + 316.66259765625, + 266.643341064453125, + 224.4067230224609375, + 188.7577972412109375, + 158.70111083984375, + 133.38116455078125, + 112.06615447998046875, + 98.72772216796875, + 92.1346588134765625, + 85.97531890869140625, + 80.2224273681640625, + 74.85018157958984375, + 69.8341827392578125, + 65.2862091064453125, + 61.8940887451171875, + 
58.6766357421875, + 55.625087738037109375, + 52.731090545654296875, + 49.9866943359375, + 47.384288787841796875, + 1554.123779296875, + 1511.89892578125, + 1398.31689453125, + 1242.53955078125, + 1072.7047119140625, + 913.631103515625, + 791.500732421875, + 680.45819091796875, + 581.7969970703125, + 495.450836181640625, + 420.656097412109375, + 356.335296630859375, + 301.346527099609375, + 254.681488037109375, + 214.9998931884765625, + 181.3341522216796875, + 152.824005126953125, + 128.7149505615234375, + 108.35161590576171875, + 97.48529052734375, + 91.031036376953125, + 84.99285125732421875, + 79.34609222412109375, + 74.0670623779296875, + 69.13317108154296875, + 64.80181121826171875, + 61.4521331787109375, + 58.2728424072265625, + 55.255706787109375, + 52.392795562744140625, + 49.676509857177734375, + 47.09958648681640625, + 1242.53955078125, + 1215.3125, + 1139.9393310546875, + 1031.720703125, + 913.631103515625, + 811.43212890625, + 711.60491943359375, + 618.22406005859375, + 533.33489990234375, + 457.657806396484375, + 391.11358642578125, + 333.17852783203125, + 283.2335205078125, + 240.389190673828125, + 203.69110107421875, + 172.3627471923828125, + 145.6892547607421875, + 123.02779388427734375, + 103.80861663818359375, + 95.92728424072265625, + 89.64476776123046875, + 83.756927490234375, + 78.2422027587890625, + 73.07941436767578125, + 68.24814605712890625, + 64.18810272216796875, + 60.89176177978515625, + 57.760517120361328125, + 54.786739349365234375, + 51.96302032470703125, + 49.28223419189453125, + 46.73752593994140625, + 993.424560546875, + 975.19708251953125, + 926.46905517578125, + 865.44610595703125, + 791.500732421875, + 711.60491943359375, + 631.20953369140625, + 554.03814697265625, + 482.32843017578125, + 417.209503173828125, + 359.053955078125, + 307.756866455078125, + 263.14556884765625, + 224.4067230224609375, + 190.9550628662109375, + 162.196685791015625, + 137.5612030029296875, + 116.51837158203125, + 100.55123138427734375, + 
94.08060455322265625, + 87.99832916259765625, + 82.28644561767578125, + 76.92669677734375, + 71.90074920654296875, + 67.1905670166015625, + 63.451557159423828125, + 60.218593597412109375, + 57.14452362060546875, + 54.22241973876953125, + 51.44549560546875, + 48.807163238525390625, + 46.300994873046875, + 821.7386474609375, + 811.43212890625, + 781.85968017578125, + 736.6123046875, + 680.45819091796875, + 618.22406005859375, + 554.03814697265625, + 491.01513671875, + 431.260406494140625, + 376.0323486328125, + 325.949676513671875, + 281.318328857421875, + 241.92047119140625, + 207.3672332763671875, + 177.27020263671875, + 151.198699951171875, + 128.7149810791015625, + 109.3962554931640625, + 98.19190216064453125, + 91.9756011962890625, + 86.117218017578125, + 80.60289764404296875, + 75.4178009033203125, + 70.54657745361328125, + 65.97369384765625, + 62.5996856689453125, + 59.439167022705078125, + 56.430583953857421875, + 53.567768096923828125, + 50.844631195068359375, + 48.255126953125, + 45.7933807373046875, + 688.02374267578125, + 680.45819091796875, + 658.5869140625, + 624.660888671875, + 581.7969970703125, + 533.33489990234375, + 482.32843017578125, + 431.260406494140625, + 381.958526611328125, + 335.640289306640625, + 293.0960693359375, + 254.681427001953125, + 220.30645751953125, + 189.852081298828125, + 163.086822509765625, + 139.7166595458984375, + 119.4195404052734375, + 101.8691864013671875, + 95.58690643310546875, + 89.64476776123046875, + 84.02899932861328125, + 78.7296905517578125, + 73.73546600341796875, + 69.03392791748046875, + 64.8706207275390625, + 61.6408843994140625, + 58.560794830322265625, + 55.625087738037109375, + 52.82842254638671875, + 50.165355682373046875, + 47.630523681640625, + 45.21855926513671875, + 576.06719970703125, + 570.4285888671875, + 554.03814697265625, + 528.352294921875, + 495.450836181640625, + 457.657806396484375, + 417.209503173828125, + 376.0323486328125, + 335.640289306640625, + 297.17999267578125, + 
261.421112060546875, + 228.6143646240234375, + 198.92822265625, + 172.3627471923828125, + 148.803924560546875, + 128.066162109375, + 109.923492431640625, + 98.72772216796875, + 92.77559661865234375, + 87.1216278076171875, + 81.76230621337890625, + 76.69138336181640625, + 71.90074920654296875, + 67.38091278076171875, + 63.7176971435546875, + 60.584194183349609375, + 57.59142303466796875, + 54.7350311279296875, + 52.010478973388671875, + 49.41309356689453125, + 46.93811798095703125, + 44.580780029296875, + 482.32843017578125, + 478.07489013671875, + 465.6590576171875, + 446.04833984375, + 420.656097412109375, + 391.11358642578125, + 359.053955078125, + 325.949676513671875, + 293.0960693359375, + 261.421112060546875, + 231.480926513671875, + 203.69110107421875, + 178.2744598388671875, + 155.306304931640625, + 134.7555084228515625, + 116.51837158203125, + 101.29721832275390625, + 95.41748046875, + 89.79701995849609375, + 84.4398193359375, + 79.34609222412109375, + 74.51293182373046875, + 69.9352569580078125, + 65.63607025146484375, + 62.470378875732421875, + 59.439159393310546875, + 56.53945159912109375, + 53.767810821533203125, + 51.120525360107421875, + 48.59365081787109375, + 46.18306732177734375, + 43.884586334228515625, + 403.842987060546875, + 400.602691650390625, + 391.11358642578125, + 376.0323486328125, + 356.335296630859375, + 333.17852783203125, + 307.756866455078125, + 281.318328857421875, + 254.681427001953125, + 228.6143646240234375, + 203.69110107421875, + 180.3063201904296875, + 158.70111083984375, + 138.993194580078125, + 121.20597076416015625, + 105.29486083984375, + 97.48529815673828125, + 91.9756011962890625, + 86.68878173828125, + 81.632110595703125, + 76.80889129638671875, + 72.21916961669921875, + 67.86054229736328125, + 64.18810272216796875, + 61.139739990234375, + 58.215541839599609375, + 55.413524627685546875, + 52.731090545654296875, + 50.165355682373046875, + 47.71305084228515625, + 45.3707275390625, + 43.134796142578125, + 
338.12896728515625, + 335.640289306640625, + 328.3336181640625, + 316.66259765625, + 301.346527099609375, + 283.2335205078125, + 263.14556884765625, + 241.92047119140625, + 220.30645751953125, + 198.92822265625, + 178.2744598388671875, + 158.70111083984375, + 140.44512939453125, + 123.64312744140625, + 108.35161590576171875, + 98.907470703125, + 93.5875701904296875, + 88.4422607421875, + 83.48635101318359375, + 78.7296905517578125, + 74.17812347412109375, + 69.8341827392578125, + 65.70645904541015625, + 62.66451263427734375, + 59.736907958984375, + 56.923252105712890625, + 54.22241973876953125, + 51.632717132568359375, + 49.151935577392578125, + 46.77752685546875, + 44.50667572021484375, + 42.336353302001953125, + 283.2335205078125, + 281.318328857421875, + 275.68292236328125, + 266.643341064453125, + 254.681488037109375, + 240.389190673828125, + 224.4067230224609375, + 207.3672332763671875, + 189.852081298828125, + 172.3627471923828125, + 155.306304931640625, + 138.993194580078125, + 123.64312744140625, + 109.3962554931640625, + 99.63219451904296875, + 94.5781402587890625, + 89.64476776123046875, + 84.85404205322265625, + 80.2224273681640625, + 75.76171875, + 71.4796905517578125, + 67.38091278076171875, + 63.985767364501953125, + 61.0775909423828125, + 58.2728424072265625, + 55.5720672607421875, + 52.97493743896484375, + 50.480510711669921875, + 48.087291717529296875, + 45.7933807373046875, + 43.59656524658203125, + 41.49433135986328125, + 237.367095947265625, + 235.8760223388671875, + 231.480926513671875, + 224.4067230224609375, + 214.9998931884765625, + 203.69110107421875, + 190.9550628662109375, + 177.27020263671875, + 163.086822509765625, + 148.803924560546875, + 134.7555084228515625, + 121.20597076416015625, + 108.35161590576171875, + 99.63219451904296875, + 94.912353515625, + 90.2564239501953125, + 85.69269561767578125, + 81.24362945556640625, + 76.92669677734375, + 72.75490570068359375, + 68.737579345703125, + 65.0778350830078125, + 62.2131195068359375, + 
59.439159393310546875, + 56.75823211669921875, + 54.171604156494140625, + 51.679691314697265625, + 49.28223419189453125, + 46.97840118408203125, + 44.766880035400390625, + 42.64601898193359375, + 40.613834381103515625, + 198.92822265625, + 197.7614898681640625, + 194.317352294921875, + 188.7577972412109375, + 181.3341522216796875, + 172.3627471923828125, + 162.196685791015625, + 151.198699951171875, + 139.7166595458984375, + 128.066162109375, + 116.51837158203125, + 105.29486083984375, + 98.907470703125, + 94.5781402587890625, + 90.2564239501953125, + 85.97531890869140625, + 81.76230621337890625, + 77.6397247314453125, + 73.62548065185546875, + 69.73333740234375, + 65.97369384765625, + 63.1215667724609375, + 60.4009246826171875, + 57.760517120361328125, + 55.203277587890625, + 52.731090545654296875, + 50.345058441162109375, + 48.045475006103515625, + 45.832118988037109375, + 43.7042236328125, + 41.660648345947265625, + 39.699886322021484375, + 166.714080810546875, + 165.797119140625, + 163.086822509765625, + 158.70111083984375, + 152.824005126953125, + 145.6892547607421875, + 137.5612030029296875, + 128.7149810791015625, + 119.4195404052734375, + 109.923492431640625, + 101.29721832275390625, + 97.48529815673828125, + 93.5875701904296875, + 89.64476776123046875, + 85.69269561767578125, + 81.76230621337890625, + 77.87981414794921875, + 74.0670623779296875, + 70.34185791015625, + 66.7183837890625, + 63.7845306396484375, + 61.139739990234375, + 58.560787200927734375, + 56.05228424072265625, + 53.617671966552734375, + 51.25939178466796875, + 48.979061126708984375, + 46.77752685546875, + 44.6550750732421875, + 42.611438751220703125, + 40.645923614501953125, + 38.757503509521484375, + 139.71661376953125, + 138.9931640625, + 136.85247802734375, + 133.38116455078125, + 128.7149505615234375, + 123.02779388427734375, + 116.51837158203125, + 109.3962554931640625, + 101.8691864013671875, + 98.72772216796875, + 95.41748046875, + 91.9756011962890625, + 88.4422607421875, + 
84.85404205322265625, + 81.24362945556640625, + 77.6397247314453125, + 74.0670623779296875, + 70.54657745361328125, + 67.09571075439453125, + 64.18810272216796875, + 61.6408843994140625, + 59.143909454345703125, + 56.703399658203125, + 54.324310302734375, + 52.010478973388671875, + 49.764812469482421875, + 47.589336395263671875, + 45.485385894775390625, + 43.453662872314453125, + 41.49433135986328125, + 39.607158660888671875, + 37.7914886474609375, + 117.09114837646484375, + 116.51837158203125, + 114.822052001953125, + 112.06615447998046875, + 108.35161590576171875, + 103.80861663818359375, + 100.55123138427734375, + 98.19190216064453125, + 95.58690643310546875, + 92.77559661865234375, + 89.79701995849609375, + 86.68878173828125, + 83.48635101318359375, + 80.2224273681640625, + 76.92669677734375, + 73.62548065185546875, + 70.34185791015625, + 67.09571075439453125, + 64.32361602783203125, + 61.8940887451171875, + 59.498504638671875, + 57.14452362060546875, + 54.838520050048828125, + 52.585674285888671875, + 50.390140533447265625, + 48.255126953125, + 46.18306732177734375, + 44.175624847412109375, + 42.233917236328125, + 40.358489990234375, + 38.54946136474609375, + 36.8065338134765625, + 100.3662261962890625, + 100.18183135986328125, + 99.632171630859375, + 98.72772216796875, + 97.48529052734375, + 95.92728424072265625, + 94.08060455322265625, + 91.9756011962890625, + 89.64476776123046875, + 87.1216278076171875, + 84.4398193359375, + 81.632110595703125, + 78.7296905517578125, + 75.76171875, + 72.75490570068359375, + 69.73333740234375, + 66.7183837890625, + 64.18810272216796875, + 61.8940887451171875, + 59.61750030517578125, + 57.367244720458984375, + 55.150913238525390625, + 52.97493743896484375, + 50.844631195068359375, + 48.76433563232421875, + 46.73752593994140625, + 44.766880035400390625, + 42.854427337646484375, + 41.001552581787109375, + 39.2091522216796875, + 37.477649688720703125, + 35.807086944580078125, + 93.5875701904296875, + 93.42420196533203125, + 
92.93701934814453125, + 92.1346588134765625, + 91.031036376953125, + 89.64476776123046875, + 87.99832916259765625, + 86.117218017578125, + 84.02899932861328125, + 81.76230621337890625, + 79.34609222412109375, + 76.80889129638671875, + 74.17812347412109375, + 71.4796905517578125, + 68.737579345703125, + 65.97369384765625, + 63.7845306396484375, + 61.6408843994140625, + 59.498504638671875, + 57.367244720458984375, + 55.255706787109375, + 53.17134857177734375, + 51.120525360107421875, + 49.10861968994140625, + 47.1400909423828125, + 45.21856689453125, + 43.34696197509765625, + 41.52751922607421875, + 39.761898040771484375, + 38.05126190185546875, + 36.396320343017578125, + 34.797382354736328125, + 87.26671600341796875, + 87.1216278076171875, + 86.6887664794921875, + 85.97531890869140625, + 84.99285125732421875, + 83.756927490234375, + 82.28644561767578125, + 80.60289764404296875, + 78.7296905517578125, + 76.69138336181640625, + 74.51293182373046875, + 72.21916961669921875, + 69.8341827392578125, + 67.38091278076171875, + 65.0778350830078125, + 63.1215667724609375, + 61.139739990234375, + 59.143909454345703125, + 57.14452362060546875, + 55.150913238525390625, + 53.17134857177734375, + 51.2130279541015625, + 49.28223419189453125, + 47.384288787841796875, + 45.52370452880859375, + 43.70421600341796875, + 41.92890167236328125, + 40.20015716552734375, + 38.519870758056640625, + 36.88941192626953125, + 35.309741973876953125, + 33.78139495849609375, + 81.37277984619140625, + 81.24362945556640625, + 80.85819244384765625, + 80.2224273681640625, + 79.34609222412109375, + 78.2422027587890625, + 76.92669677734375, + 75.4178009033203125, + 73.73546600341796875, + 71.90074920654296875, + 69.9352569580078125, + 67.86054229736328125, + 65.70645904541015625, + 63.985767364501953125, + 62.2131195068359375, + 60.4009246826171875, + 58.560787200927734375, + 56.703399658203125, + 54.838520050048828125, + 52.97493743896484375, + 51.120525360107421875, + 49.28223419189453125, + 
47.4661407470703125, + 45.677494049072265625, + 43.920803070068359375, + 42.199848175048828125, + 40.517787933349609375, + 38.877155303955078125, + 37.279994964599609375, + 35.72786712646484375, + 34.221893310546875, + 32.9320831298828125, + 75.876922607421875, + 75.76171875, + 75.4178009033203125, + 74.85018157958984375, + 74.0670623779296875, + 73.07941436767578125, + 71.90074920654296875, + 70.54657745361328125, + 69.03392791748046875, + 67.38091278076171875, + 65.63607025146484375, + 64.18810272216796875, + 62.66451263427734375, + 61.0775909423828125, + 59.439159393310546875, + 57.760517120361328125, + 56.05228424072265625, + 54.324310302734375, + 52.585674285888671875, + 50.844631195068359375, + 49.10861968994140625, + 47.384288787841796875, + 45.677494049072265625, + 43.993377685546875, + 42.336353302001953125, + 40.710224151611328125, + 39.118167877197265625, + 37.562847137451171875, + 36.04637908935546875, + 34.570468902587890625, + 33.22452545166015625, + 32.131664276123046875, + 70.75225830078125, + 70.6492919921875, + 70.34185791015625, + 69.8341827392578125, + 69.13317108154296875, + 68.24814605712890625, + 67.1905670166015625, + 65.97369384765625, + 64.8706207275390625, + 63.7176971435546875, + 62.470378875732421875, + 61.139739990234375, + 59.736907958984375, + 58.2728424072265625, + 56.75823211669921875, + 55.203277587890625, + 53.617671966552734375, + 52.010478973388671875, + 50.390140533447265625, + 48.76433563232421875, + 47.1400909423828125, + 45.52370452880859375, + 43.920803070068359375, + 42.336353302001953125, + 40.774688720703125, + 39.239543914794921875, + 37.734134674072265625, + 36.26111602783203125, + 34.822742462158203125, + 33.446636199951171875, + 32.3772735595703125, + 31.328754425048828125, + 65.97368621826171875, + 65.8815155029296875, + 65.63607025146484375, + 65.2862091064453125, + 64.80181121826171875, + 64.18810272216796875, + 63.451557159423828125, + 62.5996856689453125, + 61.6408843994140625, + 60.584194183349609375, + 
59.439159393310546875, + 58.215541839599609375, + 56.923252105712890625, + 55.5720672607421875, + 54.171604156494140625, + 52.731090545654296875, + 51.25939178466796875, + 49.764812469482421875, + 48.255126953125, + 46.73752593994140625, + 45.21856689453125, + 43.70421600341796875, + 42.199848175048828125, + 40.710224151611328125, + 39.239543914794921875, + 37.7914886474609375, + 36.3692169189453125, + 34.975429534912109375, + 33.612384796142578125, + 32.55457305908203125, + 31.5306549072265625, + 30.5255126953125, + 62.470378875732421875, + 62.4058837890625, + 62.2131195068359375, + 61.8940887451171875, + 61.4521331787109375, + 60.89176177978515625, + 60.218593597412109375, + 59.439167022705078125, + 58.560794830322265625, + 57.59142303466796875, + 56.53945159912109375, + 55.413524627685546875, + 54.22241973876953125, + 52.97493743896484375, + 51.679691314697265625, + 50.345058441162109375, + 48.979061126708984375, + 47.589336395263671875, + 46.18306732177734375, + 44.766880035400390625, + 43.34696197509765625, + 41.92890167236328125, + 40.517787933349609375, + 39.118167877197265625, + 37.734134674072265625, + 36.3692169189453125, + 35.026554107666015625, + 33.708812713623046875, + 32.66172027587890625, + 31.6664142608642578125, + 30.6867351531982421875, + 29.723903656005859375, + 59.2027587890625, + 59.143909454345703125, + 58.967945098876953125, + 58.6766357421875, + 58.2728424072265625, + 57.760517120361328125, + 57.14452362060546875, + 56.430583953857421875, + 55.625087738037109375, + 54.7350311279296875, + 53.767810821533203125, + 52.731090545654296875, + 51.632717132568359375, + 50.480510711669921875, + 49.28223419189453125, + 48.045475006103515625, + 46.77752685546875, + 45.485385894775390625, + 44.175624847412109375, + 42.854427337646484375, + 41.52751922607421875, + 40.20015716552734375, + 38.877155303955078125, + 37.562847137451171875, + 36.26111602783203125, + 34.975429534912109375, + 33.708812713623046875, + 32.697551727294921875, + 
31.7346553802490234375, + 30.7841281890869140625, + 29.8474140167236328125, + 28.925754547119140625, + 56.10607147216796875, + 56.05228424072265625, + 55.891448974609375, + 55.625087738037109375, + 55.255706787109375, + 54.786739349365234375, + 54.22241973876953125, + 53.567768096923828125, + 52.82842254638671875, + 52.010478973388671875, + 51.120525360107421875, + 50.165355682373046875, + 49.151935577392578125, + 48.087291717529296875, + 46.97840118408203125, + 45.832118988037109375, + 44.6550750732421875, + 43.453662872314453125, + 42.233917236328125, + 41.001552581787109375, + 39.761898040771484375, + 38.519870758056640625, + 37.279994964599609375, + 36.04637908935546875, + 34.822742462158203125, + 33.612384796142578125, + 32.66172027587890625, + 31.7346553802490234375, + 30.81670379638671875, + 29.9094753265380859375, + 29.0143985748291015625, + 28.132732391357421875, + 53.17134857177734375, + 53.12213897705078125, + 52.97493743896484375, + 52.731090545654296875, + 52.392795562744140625, + 51.96302032470703125, + 51.44549560546875, + 50.844631195068359375, + 50.165355682373046875, + 49.41309356689453125, + 48.59365081787109375, + 47.71305084228515625, + 46.77752685546875, + 45.7933807373046875, + 44.766880035400390625, + 43.7042236328125, + 42.611438751220703125, + 41.49433135986328125, + 40.358489990234375, + 39.2091522216796875, + 38.05126190185546875, + 36.88941192626953125, + 35.72786712646484375, + 34.570468902587890625, + 33.446636199951171875, + 32.55457305908203125, + 31.6664142608642578125, + 30.7841281890869140625, + 29.9094753265380859375, + 29.0440425872802734375, + 28.189243316650390625, + 27.346340179443359375, + 50.390140533447265625, + 50.345058441162109375, + 50.21018218994140625, + 49.9866943359375, + 49.676509857177734375, + 49.28223419189453125, + 48.807163238525390625, + 48.255126953125, + 47.630523681640625, + 46.93811798095703125, + 46.18306732177734375, + 45.3707275390625, + 44.50667572021484375, + 43.59656524658203125, + 
42.64601898193359375, + 41.660648345947265625, + 40.645923614501953125, + 39.607158660888671875, + 38.54946136474609375, + 37.477649688720703125, + 36.396320343017578125, + 35.309741973876953125, + 34.221893310546875, + 33.22452545166015625, + 32.3772735595703125, + 31.5306549072265625, + 30.6867351531982421875, + 29.8474140167236328125, + 29.0143985748291015625, + 28.189243316650390625, + 27.3733463287353515625, + 26.56793975830078125, + 47.75440216064453125, + 47.71305084228515625, + 47.589336395263671875, + 47.384288787841796875, + 47.09958648681640625, + 46.73752593994140625, + 46.300994873046875, + 45.7933807373046875, + 45.21855926513671875, + 44.580780029296875, + 43.884586334228515625, + 43.134796142578125, + 42.336353302001953125, + 41.49433135986328125, + 40.613834381103515625, + 39.699886322021484375, + 38.757503509521484375, + 37.7914886474609375, + 36.8065338134765625, + 35.807086944580078125, + 34.797382354736328125, + 33.78139495849609375, + 32.9320831298828125, + 32.131664276123046875, + 31.328754425048828125, + 30.5255126953125, + 29.723903656005859375, + 28.925754547119140625, + 28.132732391357421875, + 27.346340179443359375, + 26.56793975830078125, + 25.7987575531005859375}; + +static float dequant_table[] = { + 0.00031746, 0.00031746, 0.000318547, 0.000377554, 0.000447491, 0.000530384, 0.000628631, 0.000745078, 0.00031746, + 0.00031746, 0.000331586, 0.000388111, 0.000456952, 0.000539384, 0.000637535, 0.000754131, 0.000318547, 0.000331586, + 0.000366704, 0.000418477, 0.000484876, 0.000566262, 0.000664278, 0.000781404, 0.000377554, 0.000388111, 0.000418477, + 0.000466329, 0.000530384, 0.000610828, 0.000709031, 0.000827274, 0.000447491, 0.000456952, 0.000484876, 0.000530384, + 0.000593021, 0.000673207, 0.000772293, 0.000942868, 0.000530384, 0.000539384, 0.000566262, 0.000610828, 0.000673207, + 0.000754131, 0.000855072, 0.00127232, 0.000628631, 0.000637535, 0.000664278, 0.000709031, 0.000772293, 0.000855072, + 0.00119232, 0.00179199, 0.000745078, 
0.000754131, 0.000781404, 0.000827274, 0.000942868, 0.00127232, 0.00179199, + 0.00261332, 0.00178571, 0.00178571, 0.00179048, 0.00204418, 0.00233383, 0.00266452, 0.00304207, 0.00347311, + 0.00178571, 0.00178571, 0.00184737, 0.00208861, 0.00237221, 0.00269971, 0.00307561, 0.00350598, 0.00179048, + 0.00184737, 0.00199823, 0.00221497, 0.00248451, 0.00280405, 0.00317576, 0.00360445, 0.00204418, 0.00208861, + 0.00221497, 0.00241009, 0.00266452, 0.00297468, 0.00334138, 0.0037684, 0.00233383, 0.00237221, 0.00248451, + 0.00266452, 0.00290684, 0.00320899, 0.00357164, 0.00399808, 0.00266452, 0.00269971, 0.00280405, 0.00297468, + 0.00320899, 0.00350598, 0.00386677, 0.0042947, 0.00304207, 0.00307561, 0.00317576, 0.00334138, 0.00357164, + 0.00386677, 0.0042286, 0.00466073, 0.00347311, 0.00350598, 0.00360445, 0.0037684, 0.00399808, 0.0042947, + 0.00466073, 0.00510017, 0.00195312, 0.00340183, 0.00590075, 0.00837433, 0.0117188, 0.0117188, 0.0119688, + 0.016986, 0.00340183, 0.00428086, 0.00640914, 0.00886387, 0.0117188, 0.0117188, 0.0123206, 0.017414, + 0.00590075, 0.00640914, 0.00788613, 0.0103519, 0.0117188, 0.0117188, 0.013409, 0.0187362, 0.00837433, + 0.00886387, 0.0103519, 0.0117188, 0.0117188, 0.0117188, 0.0153365, 0.0210725, 0.0117188, 0.0117188, + 0.0117188, 0.0117188, 0.0117188, 0.0137829, 0.018289, 0.0253682, 0.0117188, 0.0117188, 0.0117188, + 0.0117188, 0.0137829, 0.017414, 0.0225572, 0.0342323, 0.0119688, 0.0123206, 0.013409, 0.0153365, + 0.018289, 0.0225572, 0.0320797, 0.0482141, 0.016986, 0.017414, 0.0187362, 0.0210725, 0.0253682, + 0.0342323, 0.0482141, 0.0703122, 0.00357143, 0.000316456, 0.00357143, 0.00357143, 0.00357143, 0.00357143, + 0.00357143, 0.00357143, 0.000316456, 0.000316456, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, + 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, + 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, + 
0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, + 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, + 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.00357143, + 0.00357143, 0.00357143, 0.00357143, 0.00357143, 0.0166667, 0.00115741, 0.0166667, 0.0166667, 0.0166667, + 0.0166667, 0.0166667, 0.0166667, 0.00115741, 0.00115741, 0.0166667, 0.0166667, 0.0166667, 0.0166667, + 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, + 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, + 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, + 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, + 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, + 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0166667, 0.0555556, 0.005, 0.0555556, 0.0555556, + 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.005, 0.005, 0.0555556, 0.0555556, 0.0555556, + 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, + 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, + 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, + 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, + 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, + 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.0555556, 0.00033456, 0.000260417, 0.00078125, + 0.00078125, 0.00208333, 0.00208333, 0.00208333, 0.00208333, 0.000260417, 0.000390625, 0.00078125, 0.00078125, + 0.00208333, 0.00208333, 0.00208333, 
0.00208333, 0.00078125, 0.00078125, 0.0015625, 0.0015625, 0.00208333, + 0.00208333, 0.00208333, 0.00208333, 0.00078125, 0.00078125, 0.0015625, 0.0015625, 0.00208333, 0.00208333, + 0.00208333, 0.00208333, 0.00208333, 0.00208333, 0.00208333, 0.00208333, 0.00333333, 0.00333333, 0.00333333, + 0.00333333, 0.00208333, 0.00208333, 0.00208333, 0.00208333, 0.00333333, 0.00333333, 0.00333333, 0.00333333, + 0.00208333, 0.00208333, 0.00208333, 0.00208333, 0.00333333, 0.00333333, 0.00333333, 0.00333333, 0.00208333, + 0.00208333, 0.00208333, 0.00208333, 0.00333333, 0.00333333, 0.00333333, 0.00333333, 0.00033456, 0.00104167, + 0.003125, 0.003125, 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.00104167, 0.0015625, 0.003125, + 0.003125, 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.003125, 0.003125, 0.00555556, 0.00555556, + 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.003125, 0.003125, 0.00555556, 0.00555556, 0.00714286, + 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.00833333, 0.00833333, + 0.00833333, 0.00833333, 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.00833333, 0.00833333, 0.00833333, + 0.00833333, 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.00833333, 0.00833333, 0.00833333, 0.00833333, + 0.00714286, 0.00714286, 0.00714286, 0.00714286, 0.00833333, 0.00833333, 0.00833333, 0.00833333, 0.00033456, + 0.0015625, 0.0078125, 0.0078125, 0.03125, 0.03125, 0.03125, 0.03125, 0.0015625, 0.003125, + 0.0078125, 0.0078125, 0.03125, 0.03125, 0.03125, 0.03125, 0.0078125, 0.0078125, 0.015625, + 0.015625, 0.03125, 0.03125, 0.03125, 0.03125, 0.0078125, 0.0078125, 0.015625, 0.015625, + 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.0625, + 0.0625, 0.0625, 0.0625, 0.03125, 0.03125, 0.03125, 0.03125, 0.0625, 0.0625, + 0.0625, 0.0625, 0.03125, 0.03125, 0.03125, 0.03125, 0.0625, 0.0625, 0.0625, + 0.0625, 0.03125, 0.03125, 0.03125, 0.03125, 0.0625, 0.0625, 0.0625, 0.0625, + 0.000454545, 0.000454545, 
0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, + 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, + 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, + 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, + 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, + 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, + 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, 0.000454545, + 0.000454545, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, + 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, + 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, + 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, + 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, + 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, + 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, 0.00255102, + 0.00255102, 0.00255102, 0.00892857, 0.00892857, 0.0104546, 0.0104546, 0.0122415, 0.0122415, 0.0146543, + 0.0146543, 0.00892857, 0.00892857, 0.0104546, 0.0104546, 0.0122415, 0.0122415, 0.0146543, 0.0146543, + 0.0104546, 0.0104546, 0.0111607, 0.0111607, 0.012706, 0.012706, 0.0153522, 0.0153522, 0.0104546, + 0.0104546, 0.0111607, 0.0111607, 0.012706, 0.012706, 0.0153522, 0.0153522, 0.0122415, 0.0122415, + 0.012706, 
0.012706, 0.0139509, 0.0139509, 0.0174327, 0.0174327, 0.0122415, 0.0122415, 0.012706, + 0.012706, 0.0139509, 0.0139509, 0.0174327, 0.0174327, 0.0146543, 0.0146543, 0.0153522, 0.0153522, + 0.0174327, 0.0174327, 0.0209263, 0.0209263, 0.0146543, 0.0146543, 0.0153522, 0.0153522, 0.0174327, + 0.0174327, 0.0209263, 0.0209263, 0.00011115, 0.000140677, 0.000178049, 0.00022535, 0.000269504, 0.000301925, + 0.000338246, 0.000378937, 0.000420348, 0.000465933, 0.000516461, 0.000580657, 0.000667287, 0.000766841, 0.000881248, + 0.00105055, 0.000140677, 0.000155098, 0.000188232, 0.000234132, 0.000273299, 0.00030534, 0.000341441, 0.000382008, + 0.000423051, 0.000468598, 0.00051912, 0.000584332, 0.000671158, 0.000770947, 0.000885629, 0.00105693, 0.000178049, + 0.000188232, 0.000216422, 0.000257694, 0.000284353, 0.000315428, 0.000350949, 0.000390319, 0.00043114, 0.000476585, + 0.0005271, 0.000595403, 0.000682824, 0.000783325, 0.000898839, 0.00107621, 0.00022535, 0.000234132, 0.000257694, + 0.000277035, 0.000301925, 0.000331813, 0.000366582, 0.000404044, 0.000444564, 0.000489882, 0.000540413, 0.000614021, + 0.000702452, 0.000804154, 0.000928012, 0.00110885, 0.000269504, 0.000273299, 0.000284353, 0.000301925, 0.000325315, + 0.000354095, 0.000387556, 0.000423051, 0.000463267, 0.000508487, 0.000562406, 0.00064044, 0.000730322, 0.000833738, + 0.000969823, 0.00115559, 0.000301925, 0.00030534, 0.000315428, 0.000331813, 0.000354095, 0.000382008, 0.000412216, + 0.000447241, 0.000487224, 0.000532423, 0.000595403, 0.000675038, 0.000766841, 0.000872515, 0.00102528, 0.00121754, + 0.000338246, 0.000341441, 0.000350949, 0.000366582, 0.000387556, 0.000412216, 0.000441884, 0.000476585, 0.000516461, + 0.00056604, 0.00063664, 0.000718321, 0.000812556, 0.000928012, 0.00109572, 0.00129614, 0.000378937, 0.000382008, + 0.000390319, 0.000404044, 0.000423051, 0.000447241, 0.000476585, 0.000511145, 0.000551548, 0.000614021, 0.000686732, + 0.000770947, 0.000868165, 0.00100658, 0.00118286, 0.00139327, 
0.000420348, 0.000423051, 0.00043114, 0.000444564, + 0.000463267, 0.000487224, 0.000516461, 0.000551548, 0.000606549, 0.000671158, 0.000746456, 0.000833738, 0.000945791, + 0.00110227, 0.00128886, 0.00151124, 0.000465933, 0.000468598, 0.000476585, 0.000489882, 0.000508487, 0.000532423, + 0.00056604, 0.000614021, 0.000671158, 0.000738369, 0.000816772, 0.000910439, 0.00105055, 0.00121754, 0.00141635, + 0.0016529, 0.000516461, 0.00051912, 0.0005271, 0.000540413, 0.000562406, 0.000595403, 0.00063664, 0.000686732, + 0.000746456, 0.000816772, 0.000898839, 0.00102528, 0.00117601, 0.00135536, 0.00156853, 0.00188017, 0.000580657, + 0.000584332, 0.000595403, 0.000614021, 0.00064044, 0.000675038, 0.000718321, 0.000770947, 0.000833738, 0.000910439, + 0.00102528, 0.00116237, 0.00132553, 0.00151933, 0.00176938, 0.00219749, 0.000667287, 0.000671158, 0.000682824, + 0.000702452, 0.000730322, 0.000766841, 0.000812556, 0.000868165, 0.000945791, 0.00105055, 0.00117601, 0.00132553, + 0.00150317, 0.00171597, 0.00210276, 0.00259291, 0.000766841, 0.000770947, 0.000783325, 0.000804154, 0.000833738, + 0.000872515, 0.000928012, 0.00100658, 0.00110227, 0.00121754, 0.00135536, 0.00151933, 0.00171597, 0.00207192, + 0.00252031, 0.00308604, 0.000881248, 0.000885629, 0.000898839, 0.000928012, 0.000969823, 0.00102528, 0.00109572, + 0.00118286, 0.00128886, 0.00141635, 0.00156853, 0.00176938, 0.00210276, 0.00252031, 0.00304399, 0.00370184, + 0.00105055, 0.00105693, 0.00107621, 0.00110885, 0.00115559, 0.00121754, 0.00129614, 0.00139327, 0.00151124, + 0.0016529, 0.00188017, 0.00219749, 0.00259291, 0.00308604, 0.00370184, 0.0044721, 0.000313334, 0.000362504, + 0.000419391, 0.000485204, 0.000567019, 0.000670359, 0.000792533, 0.000936973, 0.00104528, 0.00116095, 0.00128941, + 0.00142184, 0.0015506, 0.00169102, 0.00184416, 0.00199463, 0.000362504, 0.000385066, 0.000434073, 0.000496818, + 0.000578827, 0.000681565, 0.000803591, 0.000948188, 0.00105214, 0.00116772, 0.00129618, 0.00142744, 0.0015562, + 
0.00169666, 0.00184987, 0.00199976, 0.000419391, 0.000434073, 0.00047322, 0.000530783, 0.000613658, 0.000715011, + 0.000836789, 0.00096922, 0.00107265, 0.00118801, 0.00131649, 0.00144425, 0.00157301, 0.00171359, 0.00186702, + 0.00201515, 0.000485204, 0.000496818, 0.000530783, 0.000590527, 0.000670359, 0.000770417, 0.0008923, 0.00100397, + 0.0011067, 0.00122181, 0.00135039, 0.00147223, 0.00160105, 0.00174186, 0.00189247, 0.00204083, 0.000567019, + 0.000578827, 0.000613658, 0.000670359, 0.000748285, 0.000847868, 0.000962226, 0.00105214, 0.00115418, 0.00126912, + 0.00139381, 0.00151141, 0.00164036, 0.00178153, 0.00192816, 0.00207686, 0.000670359, 0.000681565, 0.000715011, + 0.000770417, 0.000847868, 0.000948188, 0.00102467, 0.00111349, 0.00121505, 0.00133005, 0.00144425, 0.00156181, + 0.00169102, 0.00183274, 0.00197415, 0.00212335, 0.000792533, 0.000803591, 0.000836789, 0.0008923, 0.000962226, + 0.00102467, 0.0010999, 0.00118801, 0.00128941, 0.00139942, 0.00150581, 0.0016235, 0.00175318, 0.00189247, + 0.00203055, 0.00218041, 0.000936973, 0.000948188, 0.00096922, 0.00100397, 0.00105214, 0.00111349, 0.00118801, + 0.00127588, 0.00137698, 0.00147223, 0.00157862, 0.00169666, 0.00182704, 0.00195881, 0.0020975, 0.00224824, + 0.00104528, 0.00105214, 0.00107265, 0.0011067, 0.00115418, 0.00121505, 0.00128941, 0.00137698, 0.00146104, + 0.0015562, 0.00166286, 0.00178153, 0.00190776, 0.00203569, 0.00217521, 0.00232705, 0.00116095, 0.00116772, + 0.00118801, 0.00122181, 0.00126912, 0.00133005, 0.00139942, 0.00147223, 0.0015562, 0.0016516, 0.00175885, + 0.0018772, 0.00199463, 0.00212335, 0.00226395, 0.00241714, 0.00128941, 0.00129618, 0.00131649, 0.00135039, + 0.00139381, 0.00144425, 0.00150581, 0.00157862, 0.00166286, 0.00175885, 0.00186702, 0.00197415, 0.00209234, + 0.0022221, 0.00236404, 0.00253057, 0.00142184, 0.00142744, 0.00144425, 0.00147223, 0.00151141, 0.00156181, + 0.0016235, 0.00169666, 0.00178153, 0.0018772, 0.00197415, 0.00208202, 0.00220124, 0.00233233, 0.00248006, + 
0.00266504, 0.0015506, 0.0015562, 0.00157301, 0.00160105, 0.00164036, 0.00169102, 0.00175318, 0.00182704, + 0.00190776, 0.00199463, 0.00209234, 0.00220124, 0.00232178, 0.00245495, 0.00262634, 0.00281554, 0.00169102, + 0.00169666, 0.00171359, 0.00174186, 0.00178153, 0.00183274, 0.00189247, 0.00195881, 0.00203569, 0.00212335, + 0.0022221, 0.00233233, 0.00245495, 0.00261349, 0.00278912, 0.00298309, 0.00184416, 0.00184987, 0.00186702, + 0.00189247, 0.00192816, 0.00197415, 0.00203055, 0.0020975, 0.00217521, 0.00226395, 0.00236404, 0.00248006, + 0.00262634, 0.00278912, 0.00296953, 0.00316883, 0.00199463, 0.00199976, 0.00201515, 0.00204083, 0.00207686, + 0.00212335, 0.00218041, 0.00224824, 0.00232705, 0.00241714, 0.00253057, 0.00266504, 0.00281554, 0.00298309, + 0.00316883, 0.00337407, 0.000863928, 0.00118463, 0.00162439, 0.0022274, 0.00295919, 0.00379064, 0.00485571, + 0.00622003, 0.00705056, 0.00791755, 0.00889116, 0.00992424, 0.0109744, 0.0121358, 0.01342, 0.016979, + 0.00118463, 0.00135013, 0.00175008, 0.00234449, 0.00305079, 0.00388474, 0.00495625, 0.00633046, 0.00710165, + 0.00796857, 0.00894274, 0.00996962, 0.0110205, 0.0121827, 0.0134682, 0.0171252, 0.00162439, 0.00175008, + 0.00210996, 0.00268382, 0.00332618, 0.00416997, 0.00526206, 0.00648581, 0.00725479, 0.00812174, 0.00909774, + 0.0101059, 0.0111587, 0.0123239, 0.0136133, 0.0175695, 0.0022274, 0.00234449, 0.00268382, 0.00314244, + 0.00379064, 0.00465665, 0.00578646, 0.00674328, 0.00750972, 0.00837744, 0.00935698, 0.0103335, 0.0113898, + 0.0125601, 0.0142433, 0.018329, 0.00295919, 0.00305079, 0.00332618, 0.00379064, 0.00446017, 0.00536543, + 0.00643411, 0.00710165, 0.00786655, 0.00873667, 0.0096976, 0.0106533, 0.0117149, 0.0128927, 0.0151608, + 0.019433, 0.00379064, 0.00388474, 0.00416997, 0.00465665, 0.00536543, 0.00633046, 0.0068971, 0.00756068, + 0.00832625, 0.00920129, 0.0101059, 0.0110665, 0.0121358, 0.0133237, 0.0164033, 0.020925, 0.00485571, + 0.00495625, 0.00526206, 0.00578646, 0.00643411, 0.0068971, 
0.00745875, 0.00812174, 0.00889116, 0.00974289, + 0.0106075, 0.0115753, 0.0126549, 0.0142433, 0.0180223, 0.022864, 0.00622003, 0.00633046, 0.00648581, + 0.00674328, 0.00710165, 0.00756068, 0.00812174, 0.00878813, 0.00956181, 0.0103335, 0.0112048, 0.0121827, + 0.0132756, 0.0159812, 0.0200859, 0.0253284, 0.00705056, 0.00710165, 0.00725479, 0.00750972, 0.00786655, + 0.00832625, 0.00889116, 0.00956181, 0.0102424, 0.0110205, 0.0119015, 0.0128927, 0.0146314, 0.0181752, + 0.0226824, 0.0284191, 0.00791755, 0.00796857, 0.00812174, 0.00837744, 0.00873667, 0.00920129, 0.00974289, + 0.0103335, 0.0110205, 0.0118081, 0.0127024, 0.0138628, 0.016979, 0.020925, 0.0259248, 0.0322653, + 0.00889116, 0.00894274, 0.00909774, 0.00935698, 0.0096976, 0.0101059, 0.0106075, 0.0112048, 0.0119015, + 0.0127024, 0.0136133, 0.0164033, 0.0199212, 0.0243578, 0.0299571, 0.040354, 0.00992424, 0.00996962, + 0.0101059, 0.0103335, 0.0106533, 0.0110665, 0.0115753, 0.0121827, 0.0128927, 0.0138628, 0.0164033, + 0.0195947, 0.0236019, 0.028635, 0.0360671, 0.0538439, 0.0109744, 0.0110205, 0.0111587, 0.0113898, + 0.0117149, 0.0121358, 0.0126549, 0.0132756, 0.0146314, 0.016979, 0.0199212, 0.0236019, 0.0282045, + 0.0340795, 0.0496305, 0.0731186, 0.0121358, 0.0121827, 0.0123239, 0.0125601, 0.0128927, 0.0133237, + 0.0142433, 0.0159812, 0.0181752, 0.020925, 0.0243578, 0.028635, 0.0340795, 0.0482929, 0.0693777, + 0.100892, 0.01342, 0.0134682, 0.0136133, 0.0142433, 0.0151608, 0.0164033, 0.0180223, 0.0200859, + 0.0226824, 0.0259248, 0.0299571, 0.0360671, 0.0496305, 0.0693777, 0.0983645, 0.141247, 0.016979, + 0.0171252, 0.0175695, 0.018329, 0.019433, 0.020925, 0.022864, 0.0253284, 0.0284191, 0.0322653, + 0.040354, 0.0538439, 0.0731186, 0.100892, 0.141247, 0.200352, 6.36197e-05, 7.12062e-05, 7.96975e-05, + 8.92013e-05, 9.98385e-05, 0.000111744, 0.000125069, 0.000139614, 0.000155703, 0.000173646, 0.000193657, 0.000215974, + 0.000240863, 0.000267796, 0.000296726, 0.000328782, 0.000364301, 0.000403657, 0.000447265, 
0.000490497, 0.000517569, + 0.000546135, 0.000576279, 0.000608085, 0.000641648, 0.000677063, 0.000721071, 0.000768327, 0.000818681, 0.000872335, + 0.000929504, 0.000990421, 7.12062e-05, 7.46078e-05, 8.18455e-05, 9.08471e-05, 0.000101233, 0.000112998, 0.000126241, + 0.000140701, 0.000156764, 0.000174699, 0.000194714, 0.000217046, 0.000241959, 0.000268853, 0.000297814, 0.000329907, + 0.000365469, 0.000404876, 0.00044854, 0.00049119, 0.000518264, 0.000546834, 0.000576982, 0.000608796, 0.000642366, + 0.00067779, 0.000721951, 0.000769231, 0.000819609, 0.000873289, 0.000930488, 0.000991435, 7.96975e-05, 8.18455e-05, + 8.74937e-05, 9.5499e-05, 0.000105293, 0.0001167, 0.000129698, 0.000143945, 0.000159941, 0.000177855, 0.000197886, + 0.000220265, 0.000245251, 0.000272031, 0.000301085, 0.00033329, 0.000368984, 0.000408541, 0.000452376, 0.000493271, + 0.00052035, 0.000548931, 0.000579094, 0.000610927, 0.000644522, 0.000680161, 0.000724595, 0.000771944, 0.000822397, + 0.000876157, 0.000933442, 0.000994481, 8.92013e-05, 9.08471e-05, 9.5499e-05, 0.000102605, 0.000111744, 0.00012271, + 0.000135241, 0.000149313, 0.000165221, 0.000183116, 0.000203185, 0.000225649, 0.000250764, 0.000277348, 0.00030656, + 0.000338955, 0.000374871, 0.000414682, 0.000458803, 0.000496739, 0.000523828, 0.000552427, 0.000582617, 0.000614484, + 0.000648119, 0.000684469, 0.00072901, 0.000776474, 0.000827052, 0.000880947, 0.000938375, 0.000999568, 9.98385e-05, + 0.000101233, 0.000105293, 0.000111744, 0.000120327, 0.000130815, 0.000142866, 0.000156764, 0.000172594, 0.000190491, + 0.000210631, 0.000233229, 0.000258328, 0.00028484, 0.000314279, 0.000346945, 0.000383176, 0.000423346, 0.000467872, + 0.000501596, 0.0005287, 0.000557326, 0.000587554, 0.000619469, 0.000653162, 0.000690519, 0.000735209, 0.000782835, + 0.000833588, 0.000887672, 0.000945302, 0.00100671, 0.000111744, 0.000112998, 0.0001167, 0.00012271, 0.000130815, + 0.000140701, 0.000152514, 0.000166276, 0.000182063, 0.000200004, 0.000220265, 
0.000243055, 0.000267796, 0.000294554, + 0.000324296, 0.00035732, 0.000393964, 0.000434602, 0.000479655, 0.000507843, 0.00053497, 0.000563634, 0.000593912, + 0.000625891, 0.00065966, 0.000698327, 0.00074321, 0.000791046, 0.000842026, 0.000896353, 0.000954244, 0.00101529, + 0.000125069, 0.000126241, 0.000129698, 0.000135241, 0.000142866, 0.000152514, 0.000164166, 0.000177855, 0.000193657, + 0.000211698, 0.000232143, 0.000255189, 0.000279483, 0.00030656, 0.000336685, 0.000370159, 0.000407318, 0.00044854, + 0.000489803, 0.000515484, 0.000542644, 0.000571356, 0.0006017, 0.000633759, 0.000667622, 0.000707914, 0.000753037, + 0.000801131, 0.00085239, 0.000907017, 0.000965229, 0.00102529, 0.000139614, 0.000140701, 0.000143945, 0.000149313, + 0.000156764, 0.000166276, 0.000177855, 0.000191546, 0.000207435, 0.000225649, 0.000246351, 0.000268853, 0.00029347, + 0.000320945, 0.000351542, 0.000385563, 0.000423346, 0.000465272, 0.00049882, 0.000524524, 0.000551727, 0.000580503, + 0.000610927, 0.000643085, 0.000677063, 0.000719311, 0.000764718, 0.000813121, 0.000864712, 0.000919695, 0.000978289, + 0.00103716, 0.000155703, 0.000156764, 0.000159941, 0.000165221, 0.000172594, 0.000182063, 0.000193657, 0.000207435, + 0.000223492, 0.000241959, 0.000262526, 0.00028484, 0.00030986, 0.00033782, 0.000368984, 0.000403657, 0.00044218, + 0.000484936, 0.000509231, 0.00053497, 0.000562231, 0.000591085, 0.000621608, 0.000653883, 0.000689654, 0.00073255, + 0.000778289, 0.000827052, 0.000879029, 0.000934428, 0.000993465, 0.00105093, 0.000173646, 0.000174699, 0.000177855, + 0.000183116, 0.000190491, 0.000200004, 0.000211698, 0.000225649, 0.000241959, 0.000260425, 0.000280552, 0.000303271, + 0.000328782, 0.00035732, 0.000389154, 0.00042459, 0.000463975, 0.000496739, 0.000521046, 0.000546834, 0.000574168, + 0.000603118, 0.000633759, 0.000666173, 0.000704422, 0.00074767, 0.000793791, 0.000842966, 0.000895386, 0.000951259, + 0.00101076, 0.00106664, 0.000193657, 0.000194714, 0.000197886, 0.000203185, 
0.000210631, 0.000220265, 0.000232143, + 0.000246351, 0.000262526, 0.000280552, 0.000301085, 0.000324296, 0.000350391, 0.000379608, 0.000412221, 0.00044854, + 0.00048703, 0.000509926, 0.000534273, 0.000560129, 0.000587554, 0.000616619, 0.000647399, 0.000680161, 0.000721071, + 0.000764718, 0.000811271, 0.000860912, 0.000913833, 0.000970242, 0.00102802, 0.00108432, 0.000215974, 0.000217046, + 0.000220265, 0.000225649, 0.000233229, 0.000243055, 0.000255189, 0.000268853, 0.00028484, 0.000303271, 0.000324296, + 0.000348092, 0.000374871, 0.000404876, 0.000438384, 0.000475711, 0.000501595, 0.000524524, 0.000548931, 0.000574872, + 0.000602409, 0.000631611, 0.000662552, 0.000698327, 0.00073965, 0.000783746, 0.000830784, 0.000880947, 0.000934428, + 0.000991435, 0.00104725, 0.00110401, 0.000240863, 0.000241959, 0.000245251, 0.000250764, 0.000258328, 0.000267796, + 0.000279483, 0.00029347, 0.00030986, 0.000328782, 0.000350391, 0.000374871, 0.00040244, 0.000433345, 0.000467872, + 0.000496045, 0.000517569, 0.00054055, 0.000565037, 0.000591085, 0.000618757, 0.000648119, 0.0006793, 0.000718432, + 0.000760216, 0.000804812, 0.00085239, 0.000903132, 0.000957234, 0.00101438, 0.0010685, 0.00112577, 0.000267796, + 0.000268853, 0.000272031, 0.000277348, 0.00028484, 0.000294554, 0.00030656, 0.000320945, 0.00033782, 0.00035732, + 0.000379608, 0.000404876, 0.000433345, 0.000465272, 0.000493271, 0.000513399, 0.00053497, 0.000558027, 0.000582617, + 0.000608796, 0.000636625, 0.000666173, 0.000700936, 0.000740539, 0.000782835, 0.000827984, 0.000876157, 0.000927539, + 0.000982325, 0.00103716, 0.0010918, 0.00114964, 0.000296726, 0.000297814, 0.000301085, 0.00030656, 0.000314279, + 0.000324296, 0.000336685, 0.000351542, 0.000368984, 0.000389154, 0.000412221, 0.000438384, 0.000467872, 0.000493271, + 0.00051201, 0.000532183, 0.000553826, 0.000576982, 0.0006017, 0.000628035, 0.000656048, 0.00068706, 0.000724595, + 0.000764718, 0.000807578, 0.000853335, 0.000902162, 0.000954244, 0.00100978, 
0.00106201, 0.00111723, 0.00117568, + 0.000328782, 0.000329907, 0.00033329, 0.000338955, 0.000346945, 0.00035732, 0.000370159, 0.000385563, 0.000403657, + 0.00042459, 0.00044854, 0.000475711, 0.000496045, 0.000513399, 0.000532183, 0.000552427, 0.000574168, 0.00059745, + 0.000622322, 0.000648839, 0.000677063, 0.000712289, 0.000750352, 0.000791046, 0.000834524, 0.000880947, 0.000930488, + 0.000983335, 0.00103625, 0.00108899, 0.00114484, 0.00120396, 0.000364301, 0.000365469, 0.000368984, 0.000374871, + 0.000383176, 0.000393964, 0.000407318, 0.000423346, 0.00044218, 0.000463975, 0.00048703, 0.000501595, 0.000517569, + 0.00053497, 0.000553826, 0.000574168, 0.000596035, 0.000619469, 0.000644522, 0.000671249, 0.00070355, 0.00073965, + 0.000778289, 0.000819609, 0.000863761, 0.000910909, 0.000961228, 0.00101438, 0.00106479, 0.00111818, 0.00117471, + 0.00123455, 0.000403657, 0.000404876, 0.000408541, 0.000414682, 0.000423346, 0.000434602, 0.00044854, 0.000465272, + 0.000484936, 0.000496739, 0.000509926, 0.000524524, 0.00054055, 0.000558027, 0.000576982, 0.00059745, 0.000619469, + 0.000643085, 0.000668347, 0.000698327, 0.00073255, 0.000769231, 0.000808501, 0.000850501, 0.000895386, 0.000943321, + 0.000994481, 0.0010445, 0.00109555, 0.00114964, 0.0012069, 0.00126753, 0.000447265, 0.00044854, 0.000452376, + 0.000458803, 0.000467872, 0.000479655, 0.000489803, 0.00049882, 0.000509231, 0.000521046, 0.000534273, 0.000548931, + 0.000565037, 0.000582617, 0.0006017, 0.000622322, 0.000644522, 0.000668347, 0.000696588, 0.00072901, 0.000763817, + 0.000801131, 0.000841086, 0.000883826, 0.000929504, 0.000978289, 0.00102802, 0.00107686, 0.00112862, 0.00118345, + 0.00124151, 0.00130297, 0.000490497, 0.00049119, 0.000493271, 0.000496739, 0.000501596, 0.000507843, 0.000515484, + 0.000524524, 0.00053497, 0.000546834, 0.000560129, 0.000574872, 0.000591085, 0.000608796, 0.000628035, 0.000648839, + 0.000671249, 0.000698327, 0.00072901, 0.000762016, 0.000797458, 0.00083546, 0.000876157, 
0.000919695, 0.000966231, + 0.00101529, 0.00106201, 0.00111156, 0.00116407, 0.0012197, 0.00127861, 0.00134098, 0.000517569, 0.000518264, + 0.00052035, 0.000523828, 0.0005287, 0.00053497, 0.000542644, 0.000551727, 0.000562231, 0.000574168, 0.000587554, + 0.000602409, 0.000618757, 0.000636625, 0.000656048, 0.000677063, 0.00070355, 0.00073255, 0.000763817, 0.000797458, + 0.000833588, 0.000872335, 0.000913833, 0.000958232, 0.00100569, 0.00105093, 0.00109837, 0.00114868, 0.001202, + 0.00125849, 0.00131831, 0.00138164, 0.000546135, 0.000546834, 0.000548931, 0.000552427, 0.000557326, 0.000563634, + 0.000571356, 0.000580503, 0.000591085, 0.000603118, 0.000616619, 0.000631611, 0.000648119, 0.000666173, 0.00068706, + 0.000712289, 0.00073965, 0.000769231, 0.000801131, 0.00083546, 0.000872335, 0.000911883, 0.000954244, 0.000999568, + 0.00104358, 0.00108899, 0.00113719, 0.00118832, 0.0012425, 0.00129991, 0.00136071, 0.00142507, 0.000576279, + 0.000576982, 0.000579094, 0.000582617, 0.000587554, 0.000593912, 0.0006017, 0.000610927, 0.000621608, 0.000633759, + 0.000647399, 0.000662552, 0.0006793, 0.000700936, 0.000724595, 0.000750352, 0.000778289, 0.000808501, 0.000841086, + 0.000876157, 0.000913833, 0.000954244, 0.000997532, 0.00103991, 0.00108339, 0.00112957, 0.00117859, 0.00123058, + 0.00128569, 0.00134408, 0.00140592, 0.00145992, 0.000608085, 0.000608796, 0.000610927, 0.000614484, 0.000619469, + 0.000625891, 0.000633759, 0.000643085, 0.000653883, 0.000666173, 0.000680161, 0.000698327, 0.000718432, 0.000740539, + 0.000764718, 0.000791046, 0.000819609, 0.000850501, 0.000883826, 0.000919695, 0.000958232, 0.000999568, 0.00103991, + 0.00108152, 0.00112577, 0.00117277, 0.00122266, 0.00127559, 0.00133168, 0.00139111, 0.00144821, 0.00149301, + 0.000641648, 0.000642366, 0.000644522, 0.000648119, 0.000653162, 0.00065966, 0.000667622, 0.000677063, 0.000689654, + 0.000704422, 0.000721071, 0.00073965, 0.000760216, 0.000782835, 0.000807578, 0.000834524, 0.000863761, 0.000895386, + 
0.000929504, 0.000966231, 0.00100569, 0.00104358, 0.00108339, 0.00112577, 0.00117084, 0.00121872, 0.00126954, + 0.00132345, 0.00138059, 0.00143944, 0.00148269, 0.00152783, 0.000677063, 0.00067779, 0.000680161, 0.000684469, + 0.000690519, 0.000698327, 0.000707914, 0.000719311, 0.00073255, 0.00074767, 0.000764718, 0.000783746, 0.000804812, + 0.000827984, 0.000853335, 0.000880947, 0.000910909, 0.000943321, 0.000978289, 0.00101529, 0.00105093, 0.00108899, + 0.00112957, 0.00117277, 0.00121872, 0.00126753, 0.00131934, 0.0013743, 0.00143255, 0.00147533, 0.00151892, + 0.00156441, 0.000721071, 0.000721951, 0.000724595, 0.00072901, 0.000735209, 0.00074321, 0.000753037, 0.000764718, + 0.000778289, 0.000793791, 0.000811271, 0.000830784, 0.00085239, 0.000876157, 0.000902162, 0.000930488, 0.000961228, + 0.000994481, 0.00102802, 0.00106201, 0.00109837, 0.00113719, 0.00117859, 0.00122266, 0.00126954, 0.00131934, + 0.0013722, 0.00142827, 0.00147092, 0.00151298, 0.00155692, 0.0016028, 0.000768327, 0.000769231, 0.000771944, + 0.000776474, 0.000782835, 0.000791046, 0.000801131, 0.000813121, 0.000827052, 0.000842966, 0.000860912, 0.000880947, + 0.000903132, 0.000927539, 0.000954244, 0.000983335, 0.00101438, 0.0010445, 0.00107686, 0.00111156, 0.00114868, + 0.00118832, 0.00123058, 0.00127559, 0.00132345, 0.0013743, 0.00142827, 0.00146945, 0.00151002, 0.00155244, + 0.00159676, 0.00164305, 0.000818681, 0.000819609, 0.000822397, 0.000827052, 0.000833588, 0.000842026, 0.00085239, + 0.000864712, 0.000879029, 0.000895386, 0.000913833, 0.000934428, 0.000957234, 0.000982325, 0.00100978, 0.00103625, + 0.00106479, 0.00109555, 0.00112862, 0.00116407, 0.001202, 0.0012425, 0.00128569, 0.00133168, 0.00138059, + 0.00143255, 0.00147092, 0.00151002, 0.00155094, 0.00159374, 0.00163847, 0.00168519, 0.000872335, 0.000873289, + 0.000876157, 0.000880947, 0.000887672, 0.000896353, 0.000907017, 0.000919695, 0.000934428, 0.000951259, 0.000970242, + 0.000991435, 0.00101438, 0.00103716, 0.00106201, 0.00108899, 
0.00111818, 0.00114964, 0.00118345, 0.0012197, + 0.00125849, 0.00129991, 0.00134408, 0.00139111, 0.00143944, 0.00147533, 0.00151298, 0.00155244, 0.00159374, + 0.00163695, 0.00168211, 0.00172928, 0.000929504, 0.000930488, 0.000933442, 0.000938375, 0.000945302, 0.000954244, + 0.000965229, 0.000978289, 0.000993465, 0.00101076, 0.00102802, 0.00104725, 0.0010685, 0.0010918, 0.00111723, + 0.00114484, 0.00117471, 0.0012069, 0.00124151, 0.00127861, 0.00131831, 0.00136071, 0.00140592, 0.00144821, + 0.00148269, 0.00151892, 0.00155692, 0.00159676, 0.00163847, 0.00168211, 0.00172772, 0.00177538, 0.000990421, + 0.000991435, 0.000994481, 0.000999568, 0.00100671, 0.00101529, 0.00102529, 0.00103716, 0.00105093, 0.00106664, + 0.00108432, 0.00110401, 0.00112577, 0.00114964, 0.00117568, 0.00120396, 0.00123455, 0.00126753, 0.00130297, + 0.00134098, 0.00138164, 0.00142507, 0.00145992, 0.00149301, 0.00152783, 0.00156441, 0.0016028, 0.00164305, + 0.00168519, 0.00172928, 0.00177538, 0.00182353, 0.000136878, 0.000150402, 0.000165263, 0.000181592, 0.000199534, + 0.000219249, 0.000240912, 0.000264002, 0.000289027, 0.000316423, 0.000346417, 0.000379254, 0.000415204, 0.000450291, + 0.000483257, 0.000518636, 0.000556606, 0.000597355, 0.000641088, 0.00068713, 0.000732919, 0.000781759, 0.000833854, + 0.00088942, 0.000948689, 0.00101191, 0.00107258, 0.00113651, 0.00120424, 0.00127601, 0.00135205, 0.00143263, + 0.000150402, 0.000156389, 0.00016898, 0.00018439, 0.000201862, 0.000221304, 0.000242798, 0.000265706, 0.000290661, + 0.000318014, 0.000347985, 0.000380815, 0.000416771, 0.000451514, 0.000484476, 0.000519858, 0.000557835, 0.000598597, + 0.000642346, 0.000688297, 0.000734101, 0.00078296, 0.000835077, 0.000890668, 0.000949965, 0.00101321, 0.00107378, + 0.00113772, 0.00120548, 0.00127728, 0.00135336, 0.00143397, 0.000165263, 0.00016898, 0.00017868, 0.000192254, + 0.000208611, 0.000227352, 0.000248337, 0.000270784, 0.000295543, 0.000322778, 0.000352686, 0.000385499, 0.000421474, + 0.000455183, 
0.000488134, 0.000523523, 0.000561524, 0.000602324, 0.000646125, 0.000691799, 0.000737651, 0.000786566, + 0.000838749, 0.000894414, 0.000953794, 0.00101695, 0.00107736, 0.00114138, 0.00120922, 0.00128111, 0.00135728, + 0.00143799, 0.000181592, 0.00018439, 0.000192254, 0.000204148, 0.000219249, 0.000237105, 0.000257117, 0.000279142, + 0.000303622, 0.000330686, 0.00036051, 0.000393307, 0.000429325, 0.000461293, 0.000494231, 0.000529636, 0.000567679, + 0.000608545, 0.000652433, 0.000697644, 0.000743575, 0.000792586, 0.000844879, 0.00090067, 0.000960189, 0.00102282, + 0.00108335, 0.00114749, 0.00121546, 0.00128749, 0.00136382, 0.00144469, 0.000199534, 0.000201862, 0.000208611, + 0.000219249, 0.000233247, 0.00025011, 0.000269098, 0.000290661, 0.00031483, 0.000341708, 0.000371447, 0.000404247, + 0.000439264, 0.000469839, 0.000502768, 0.000538204, 0.000576312, 0.000617274, 0.000661288, 0.000705842, 0.000751888, + 0.000801034, 0.000853484, 0.000909452, 0.000969168, 0.00103106, 0.00109175, 0.00115606, 0.00122422, 0.00129645, + 0.00137299, 0.00145411, 0.000219249, 0.000221304, 0.000227352, 0.000237105, 0.00025011, 0.000265706, 0.000284102, + 0.00030523, 0.000329107, 0.000355817, 0.000385499, 0.000418338, 0.000450291, 0.000480818, 0.000513751, 0.000549237, + 0.000587438, 0.000628532, 0.000672714, 0.000716412, 0.000762608, 0.000811933, 0.000864586, 0.000920784, 0.000980757, + 0.00104169, 0.00110257, 0.00116711, 0.00123551, 0.00130801, 0.00138483, 0.00146575, 0.000240912, 0.000242798, + 0.000248337, 0.000257117, 0.000269098, 0.000284102, 0.000302012, 0.000322778, 0.000346417, 0.000373009, 0.000402683, + 0.000435582, 0.000463735, 0.000494231, 0.000527191, 0.000562755, 0.000601081, 0.000642346, 0.000685964, 0.000729375, + 0.000775761, 0.000825309, 0.000878217, 0.000934701, 0.000994989, 0.00105472, 0.00111586, 0.00118067, 0.00124938, + 0.00132219, 0.00139936, 0.00147962, 0.000264002, 0.000265706, 0.000270784, 0.000279142, 0.000290661, 0.00030523, + 0.000322778, 0.000343278, 
0.000366761, 0.000393307, 0.000423044, 0.000451514, 0.000479598, 0.000510089, 0.000543105, + 0.000578782, 0.000617274, 0.000658755, 0.000701155, 0.000744761, 0.000791381, 0.000841199, 0.000894414, 0.000951241, + 0.00101191, 0.0010702, 0.00113164, 0.00119678, 0.00126584, 0.00133904, 0.00141662, 0.00149609, 0.000289027, + 0.000290661, 0.000295543, 0.000303622, 0.00031483, 0.000329107, 0.000346417, 0.000366761, 0.000390183, 0.000416771, + 0.000444168, 0.000469839, 0.00049789, 0.000528413, 0.000561524, 0.000597355, 0.000636059, 0.000677807, 0.000718765, + 0.000762608, 0.000809507, 0.000859646, 0.000913224, 0.000970453, 0.00102988, 0.00108814, 0.00114994, 0.00121546, + 0.00128493, 0.00135858, 0.00143665, 0.00151518, 0.000316423, 0.000318014, 0.000322778, 0.000330686, 0.000341708, + 0.000355817, 0.000373009, 0.000393307, 0.000416771, 0.000441717, 0.000464956, 0.000490573, 0.000518636, 0.000549237, + 0.000582489, 0.000618523, 0.00065749, 0.000697644, 0.000738835, 0.00078296, 0.000830189, 0.000880703, 0.000934701, + 0.000992396, 0.00104998, 0.00110861, 0.00117081, 0.00123677, 0.00130672, 0.00138088, 0.00145946, 0.00153694, + 0.000346417, 0.000347985, 0.000352686, 0.00036051, 0.000371447, 0.000385499, 0.000402683, 0.000423044, 0.000444168, + 0.000464956, 0.000488134, 0.000513751, 0.000541879, 0.00057261, 0.000606055, 0.000642346, 0.000681302, 0.000719942, + 0.000761415, 0.000805873, 0.000853484, 0.00090443, 0.000958909, 0.00101695, 0.00107258, 0.00113164, 0.0011943, + 0.00126076, 0.00133125, 0.00140599, 0.00148341, 0.00156141, 0.000379254, 0.000380815, 0.000385499, 0.000393307, + 0.000404247, 0.000418338, 0.000435582, 0.000451514, 0.000469839, 0.000490573, 0.000513751, 0.000539429, 0.000567679, + 0.000598597, 0.000632293, 0.0006689, 0.000705842, 0.000744761, 0.000786566, 0.00083141, 0.00087946, 0.000930898, + 0.000985923, 0.00104169, 0.00109776, 0.00115729, 0.00122046, 0.00128749, 0.00135858, 0.00143397, 0.00151008, + 0.00158866, 0.000415204, 0.000416771, 0.000421474, 
0.000429325, 0.000439264, 0.000450291, 0.000463735, 0.000479598, + 0.00049789, 0.000518636, 0.000541879, 0.000567679, 0.000596114, 0.000627279, 0.000661288, 0.000696474, 0.000732919, + 0.000772168, 0.00081436, 0.000859646, 0.000908195, 0.000960189, 0.00101577, 0.001069, 0.00112556, 0.00118562, + 0.00124938, 0.00131703, 0.00138879, 0.00146449, 0.00153951, 0.00161874, 0.000450291, 0.000451514, 0.000455183, + 0.000461293, 0.000469839, 0.000480818, 0.000494231, 0.000510089, 0.000528413, 0.000549237, 0.00057261, 0.000598597, + 0.000627279, 0.000658755, 0.000691799, 0.000725835, 0.000762608, 0.000802243, 0.000844879, 0.000890668, 0.000939779, + 0.000992396, 0.00104524, 0.00109896, 0.00115606, 0.00121671, 0.00128111, 0.00134945, 0.00142195, 0.00149609, + 0.00157177, 0.00165171, 0.000483257, 0.000484476, 0.000488134, 0.000494231, 0.000502768, 0.000513751, 0.000527191, + 0.000543105, 0.000561524, 0.000582489, 0.000606055, 0.000632293, 0.000661288, 0.000691799, 0.000723476, 0.000757839, + 0.000794997, 0.000835077, 0.000878217, 0.000924573, 0.000974312, 0.00102635, 0.00107736, 0.00113164, 0.00118934, + 0.00125064, 0.00131574, 0.00138483, 0.00145815, 0.00153052, 0.00160694, 0.00168766, 0.000518636, 0.000519858, + 0.000523523, 0.000529636, 0.000538204, 0.000549237, 0.000562755, 0.000578782, 0.000597355, 0.000618523, 0.000642346, + 0.0006689, 0.000696474, 0.000725835, 0.000757839, 0.000792586, 0.000830189, 0.000870774, 0.000914482, 0.00096147, + 0.00101191, 0.00106066, 0.00111223, 0.00116711, 0.00122547, 0.00128749, 0.00135336, 0.00142328, 0.00149482, + 0.00156788, 0.00164509, 0.00172666, 0.000556606, 0.000557835, 0.000561524, 0.000567679, 0.000576312, 0.000587438, + 0.000601081, 0.000617274, 0.000636059, 0.00065749, 0.000681302, 0.000705842, 0.000732919, 0.000762608, 0.000794997, + 0.000830189, 0.000868297, 0.000909452, 0.000953794, 0.00100148, 0.00104879, 0.00109776, 0.00114994, 0.00120548, + 0.00126457, 0.00132737, 0.00139407, 0.00146449, 0.00153437, 0.00160825, 0.00168632, 
0.00176881, 0.000597355, + 0.000598597, 0.000602324, 0.000608545, 0.000617274, 0.000628532, 0.000642346, 0.000658755, 0.000677807, 0.000697644, + 0.000719942, 0.000744761, 0.000772168, 0.000802243, 0.000835077, 0.000870774, 0.000909452, 0.000951241, 0.000996287, + 0.00104169, 0.00108814, 0.00113772, 0.00119057, 0.00124685, 0.00130672, 0.00137037, 0.00143799, 0.00150626, + 0.00157696, 0.00165171, 0.00173072, 0.0018142, 0.000641088, 0.000642346, 0.000646125, 0.000652433, 0.000661288, + 0.000672714, 0.000685964, 0.000701155, 0.000718765, 0.000738835, 0.000761415, 0.000786566, 0.00081436, 0.000844879, + 0.000878217, 0.000914482, 0.000953794, 0.000996287, 0.00103932, 0.00108335, 0.00113042, 0.00118067, 0.00123426, + 0.00129133, 0.00135205, 0.00141662, 0.00148341, 0.00155109, 0.00162268, 0.00169838, 0.00177839, 0.00186294, + 0.00068713, 0.000688297, 0.000691799, 0.000697644, 0.000705842, 0.000716412, 0.000729375, 0.000744761, 0.000762608, + 0.00078296, 0.000805873, 0.00083141, 0.000859646, 0.000890668, 0.000924573, 0.00096147, 0.00100148, 0.00104169, + 0.00108335, 0.00112799, 0.00117574, 0.00122673, 0.00128111, 0.00133904, 0.00140069, 0.00146575, 0.00153052, + 0.00159909, 0.00167164, 0.00174836, 0.00182945, 0.00191514, 0.000732919, 0.000734101, 0.000737651, 0.000743575, + 0.000751888, 0.000762608, 0.000775761, 0.000791381, 0.000809507, 0.000830189, 0.000853484, 0.00087946, 0.000908195, + 0.000939779, 0.000974312, 0.00101191, 0.00104879, 0.00108814, 0.00113042, 0.00117574, 0.00122422, 0.00127601, + 0.00133125, 0.00139011, 0.00145276, 0.00151518, 0.00158086, 0.00165039, 0.00172396, 0.00180177, 0.00188402, + 0.00197093, 0.000781759, 0.00078296, 0.000786566, 0.000792586, 0.000801034, 0.000811933, 0.000825309, 0.000841199, + 0.000859646, 0.000880703, 0.00090443, 0.000930898, 0.000960189, 0.000992396, 0.00102635, 0.00106066, 0.00109776, + 0.00113772, 0.00118067, 0.00122673, 0.00127601, 0.00132866, 0.00138483, 0.00144469, 0.00150498, 0.00156788, + 0.00163453, 0.0017051, 
0.00177977, 0.00185874, 0.00194222, 0.00203043, 0.000833854, 0.000835077, 0.000838749, + 0.000844879, 0.000853484, 0.000864586, 0.000878217, 0.000894414, 0.000913224, 0.000934701, 0.000958909, 0.000985923, + 0.00101577, 0.00104524, 0.00107736, 0.00111223, 0.00114994, 0.00119057, 0.00123426, 0.00128111, 0.00133125, + 0.00138483, 0.00144201, 0.0014999, 0.00156012, 0.00162399, 0.00169168, 0.00176335, 0.00183919, 0.0019194, + 0.0020042, 0.00207984, 0.00088942, 0.000890668, 0.000894414, 0.00090067, 0.000909452, 0.000920784, 0.000934701, + 0.000951241, 0.000970453, 0.000992396, 0.00101695, 0.00104169, 0.001069, 0.00109896, 0.00113164, 0.00116711, + 0.00120548, 0.00124685, 0.00129133, 0.00133904, 0.00139011, 0.00144469, 0.0014999, 0.00155754, 0.00161874, + 0.00168365, 0.00175244, 0.00182529, 0.00190238, 0.00198391, 0.00206298, 0.0021275, 0.000948689, 0.000949965, + 0.000953794, 0.000960189, 0.000969168, 0.000980757, 0.000994989, 0.00101191, 0.00102988, 0.00104998, 0.00107258, + 0.00109776, 0.00112556, 0.00115606, 0.00118934, 0.00122547, 0.00126457, 0.00130672, 0.00135205, 0.00140069, + 0.00145276, 0.00150498, 0.00156012, 0.00161874, 0.00168098, 0.001747, 0.00181697, 0.00189107, 0.00196949, + 0.00205037, 0.00211263, 0.00217764, 0.00101191, 0.00101321, 0.00101695, 0.00102282, 0.00103106, 0.00104169, + 0.00105472, 0.0010702, 0.00108814, 0.00110861, 0.00113164, 0.00115729, 0.00118562, 0.00121671, 0.00125064, + 0.00128749, 0.00132737, 0.00137037, 0.00141662, 0.00146575, 0.00151518, 0.00156788, 0.00162399, 0.00168365, + 0.001747, 0.0018142, 0.00188543, 0.00196085, 0.00204068, 0.00210204, 0.0021648, 0.00223034, 0.00107258, + 0.00107378, 0.00107736, 0.00108335, 0.00109175, 0.00110257, 0.00111586, 0.00113164, 0.00114994, 0.00117081, + 0.0011943, 0.00122046, 0.00124938, 0.00128111, 0.00131574, 0.00135336, 0.00139407, 0.00143799, 0.00148341, + 0.00153052, 0.00158086, 0.00163453, 0.00169168, 0.00175244, 0.00181697, 0.00188543, 0.00195798, 0.00203482, + 0.00209569, 0.00215626, 
0.00221955, 0.00228566, 0.00113651, 0.00113772, 0.00114138, 0.00114749, 0.00115606, + 0.00116711, 0.00118067, 0.00119678, 0.00121546, 0.00123677, 0.00126076, 0.00128749, 0.00131703, 0.00134945, + 0.00138483, 0.00142328, 0.00146449, 0.00150626, 0.00155109, 0.00159909, 0.00165039, 0.0017051, 0.00176335, + 0.00182529, 0.00189107, 0.00196085, 0.00203482, 0.00209357, 0.00215199, 0.00221309, 0.00227695, 0.00234366, + 0.00120424, 0.00120548, 0.00120922, 0.00121546, 0.00122422, 0.00123551, 0.00124938, 0.00126584, 0.00128493, + 0.00130672, 0.00133125, 0.00135858, 0.00138879, 0.00142195, 0.00145815, 0.00149482, 0.00153437, 0.00157696, + 0.00162268, 0.00167164, 0.00172396, 0.00177977, 0.00183919, 0.00190238, 0.00196949, 0.00204068, 0.00209569, + 0.00215199, 0.00221093, 0.0022726, 0.00233707, 0.00240442, 0.00127601, 0.00127728, 0.00128111, 0.00128749, + 0.00129645, 0.00130801, 0.00132219, 0.00133904, 0.00135858, 0.00138088, 0.00140599, 0.00143397, 0.00146449, + 0.00149609, 0.00153052, 0.00156788, 0.00160825, 0.00165171, 0.00169838, 0.00174836, 0.00180177, 0.00185874, + 0.0019194, 0.00198391, 0.00205037, 0.00210204, 0.00215626, 0.00221309, 0.0022726, 0.00233487, 0.00239998, + 0.002468, 0.00135205, 0.00135336, 0.00135728, 0.00136382, 0.00137299, 0.00138483, 0.00139936, 0.00141662, + 0.00143665, 0.00145946, 0.00148341, 0.00151008, 0.00153951, 0.00157177, 0.00160694, 0.00164509, 0.00168632, + 0.00173072, 0.00177839, 0.00182945, 0.00188402, 0.00194222, 0.0020042, 0.00206298, 0.00211263, 0.0021648, + 0.00221955, 0.00227695, 0.00233707, 0.00239998, 0.00246576, 0.0025345, 0.00143263, 0.00143397, 0.00143799, + 0.00144469, 0.00145411, 0.00146575, 0.00147962, 0.00149609, 0.00151518, 0.00153694, 0.00156141, 0.00158866, + 0.00161874, 0.00165171, 0.00168766, 0.00172666, 0.00176881, 0.0018142, 0.00186294, 0.00191514, 0.00197093, + 0.00203043, 0.00207984, 0.0021275, 0.00217764, 0.00223034, 0.00228566, 0.00234366, 0.00240442, 0.002468, + 0.0025345, 0.00260399, 0.000262914, 0.000328843, 
0.000411305, 0.000514445, 0.000643449, 0.000804803, 0.00100662, + 0.00121693, 0.00145344, 0.00173591, 0.00207328, 0.00247621, 0.00295745, 0.00353066, 0.00421288, 0.00502694, + 0.00599829, 0.00715734, 0.00854036, 0.00996351, 0.0106852, 0.0114591, 0.0122891, 0.0131792, 0.0141338, + 0.0151576, 0.0160076, 0.0168911, 0.0178234, 0.0188071, 0.0198452, 0.0209405, 0.000328843, 0.000360778, + 0.000433614, 0.000533468, 0.00066142, 0.000822834, 0.00102543, 0.00123239, 0.0014696, 0.00175307, 0.00209172, + 0.00249624, 0.00297938, 0.00355469, 0.00423952, 0.0050566, 0.00603147, 0.0071946, 0.00858234, 0.00998185, + 0.0107039, 0.0114782, 0.0123087, 0.0131993, 0.0141544, 0.0151788, 0.0160241, 0.0169079, 0.0178405, + 0.0188245, 0.0198629, 0.0209586, 0.000411305, 0.000433614, 0.00049507, 0.000589093, 0.000715145, 0.00087724, + 0.00107937, 0.001279, 0.0015184, 0.00180493, 0.00214749, 0.0025568, 0.00304568, 0.00362736, 0.00432001, + 0.00514622, 0.0061317, 0.00730714, 0.00870913, 0.0100369, 0.01076, 0.0115355, 0.0123673, 0.0132595, + 0.0142163, 0.0152355, 0.0160738, 0.0169584, 0.0178918, 0.0188769, 0.0199163, 0.0210131, 0.000514445, + 0.000533468, 0.000589093, 0.000679349, 0.000804803, 0.000969255, 0.00115547, 0.00135757, 0.00160087, 0.00189268, + 0.00224191, 0.00265935, 0.00315794, 0.00375033, 0.00445619, 0.00529779, 0.00630115, 0.00749731, 0.0089233, + 0.0101289, 0.0108537, 0.0116312, 0.0124653, 0.01336, 0.0143196, 0.0153172, 0.0161566, 0.0170426, + 0.0179775, 0.0189641, 0.0200053, 0.021104, 0.000643449, 0.00066142, 0.000715145, 0.000804803, 0.000932223, + 0.00109453, 0.00126342, 0.0014696, 0.00171881, 0.00201836, 0.00237724, 0.00280635, 0.00331844, 0.00392647, + 0.00465117, 0.00551468, 0.00654347, 0.00776911, 0.00922921, 0.010258, 0.0109853, 0.0117657, 0.012603, + 0.0135013, 0.0144648, 0.0154317, 0.0162728, 0.0171607, 0.0180977, 0.0190866, 0.0201302, 0.0212316, + 0.000804803, 0.000822834, 0.00087724, 0.000969255, 0.00109453, 0.00123239, 0.00140527, 0.00161754, 0.00187499, + 0.00218504, 
0.0025568, 0.00300139, 0.00353066, 0.00415992, 0.00490939, 0.00580172, 0.00686392, 0.00812824, + 0.00963311, 0.0104246, 0.0111551, 0.0119393, 0.0127808, 0.0136837, 0.0146524, 0.0155792, 0.0164226, + 0.0173129, 0.0182526, 0.0192445, 0.0202913, 0.0213961, 0.00100662, 0.00102543, 0.00107937, 0.00115547, + 0.00126342, 0.00140527, 0.00158426, 0.00180493, 0.00207328, 0.00239688, 0.0027851, 0.00324932, 0.00380018, + 0.00445619, 0.00523683, 0.00616535, 0.00726949, 0.00858234, 0.00994518, 0.0106292, 0.0113639, 0.0121527, + 0.0129994, 0.0139081, 0.014883, 0.0157601, 0.0166062, 0.0174995, 0.0184426, 0.019438, 0.0204888, + 0.0215978, 0.00121693, 0.00123239, 0.001279, 0.00135757, 0.0014696, 0.00161754, 0.00180493, 0.0020366, + 0.00231878, 0.00265935, 0.00306796, 0.00355469, 0.00413359, 0.00482236, 0.00564111, 0.00661381, 0.0077691, + 0.00914108, 0.0101841, 0.0108724, 0.0116121, 0.0124065, 0.0132595, 0.014175, 0.0151576, 0.0159745, + 0.0168239, 0.0177209, 0.0186679, 0.0196678, 0.0207232, 0.0218372, 0.00145344, 0.0014696, 0.0015184, + 0.00160087, 0.00171881, 0.00187499, 0.00207328, 0.00231878, 0.00261809, 0.00297938, 0.00341185, 0.00392647, + 0.00453913, 0.00526726, 0.0061317, 0.00715734, 0.00837384, 0.00981651, 0.0104617, 0.0111551, 0.0119007, + 0.0127017, 0.013562, 0.0144856, 0.0154153, 0.016223, 0.0170763, 0.0179775, 0.0189292, 0.0199341, + 0.0209949, 0.0221148, 0.00173591, 0.00175307, 0.00180493, 0.00189268, 0.00201836, 0.00218504, 0.00239688, + 0.00265935, 0.00297938, 0.00336496, 0.00382525, 0.00437418, 0.00502694, 0.00580172, 0.00672025, 0.00780846, + 0.00909724, 0.0101289, 0.0107787, 0.0114782, 0.0122306, 0.0130393, 0.0139081, 0.014841, 0.0156942, + 0.016506, 0.0173637, 0.0182698, 0.0192269, 0.0202376, 0.0213046, 0.0224312, 0.00207328, 0.00209172, + 0.00214749, 0.00224191, 0.00237724, 0.0025568, 0.0027851, 0.00306796, 0.00341185, 0.00382525, 0.00432001, + 0.00490939, 0.00560933, 0.00643889, 0.00742085, 0.00858234, 0.00987194, 0.0104803, 0.0111362, 0.0118428, + 0.012603, 
0.0134205, 0.0142989, 0.0152355, 0.0160076, 0.0168239, 0.0176868, 0.0185985, 0.0195616, + 0.0205788, 0.021653, 0.022787, 0.00247621, 0.00249624, 0.0025568, 0.00265935, 0.00280635, 0.00300139, + 0.00324932, 0.00355469, 0.00392647, 0.00437418, 0.00490939, 0.00554612, 0.00630115, 0.0071946, 0.00825042, + 0.00949714, 0.010258, 0.0108724, 0.0115355, 0.0122501, 0.0130193, 0.0138467, 0.0147361, 0.0155792, + 0.016356, 0.0171775, 0.0180461, 0.0189641, 0.0199341, 0.0209586, 0.0220406, 0.0231831, 0.00295745, + 0.00297938, 0.00304568, 0.00315794, 0.00331844, 0.00353066, 0.00380018, 0.00413359, 0.00453913, 0.00502694, + 0.00560933, 0.00630115, 0.00712022, 0.00808779, 0.00922921, 0.0101105, 0.0106852, 0.0113068, 0.011978, + 0.0127017, 0.0134811, 0.0143196, 0.0152192, 0.015958, 0.0167401, 0.0175675, 0.0184426, 0.0193676, + 0.0203451, 0.0213778, 0.0224685, 0.0236204, 0.00353066, 0.00355469, 0.00362736, 0.00375033, 0.00392647, + 0.00415992, 0.00445619, 0.00482236, 0.00526726, 0.00580172, 0.00643889, 0.0071946, 0.00808779, 0.00914108, + 0.0100369, 0.0105733, 0.0111551, 0.0117849, 0.0124653, 0.0131993, 0.01399, 0.014841, 0.0156285, + 0.0163726, 0.0171607, 0.0179947, 0.0188769, 0.0198096, 0.0207955, 0.0218372, 0.0229376, 0.0240997, + 0.00421288, 0.00423952, 0.00432001, 0.00445619, 0.00465117, 0.00490939, 0.00523683, 0.00564111, 0.0061317, + 0.00672025, 0.00742085, 0.00825042, 0.00922921, 0.0100369, 0.010536, 0.0110795, 0.0116696, 0.0123087, + 0.0129994, 0.0137448, 0.0145481, 0.0153662, 0.0160738, 0.0168239, 0.0176186, 0.0184599, 0.01935, + 0.0202913, 0.0212864, 0.0223379, 0.0234488, 0.0246222, 0.00502694, 0.0050566, 0.00514622, 0.00529779, + 0.00551468, 0.00580172, 0.00616535, 0.00661381, 0.00715734, 0.00780846, 0.00858234, 0.00949714, 0.0101105, + 0.0105733, 0.0110795, 0.0116312, 0.0122306, 0.01288, 0.0135823, 0.0143403, 0.0151576, 0.0158424, + 0.016556, 0.0173129, 0.0181149, 0.0189641, 0.0198629, 0.0208136, 0.0218188, 0.0228811, 0.0240035, + 0.025189, 0.00599829, 0.00603147, 
0.0061317, 0.00630115, 0.00654347, 0.00686392, 0.00726949, 0.0077691, + 0.00837384, 0.00909724, 0.00987194, 0.010258, 0.0106852, 0.0111551, 0.0116696, 0.0122306, 0.0128403, + 0.0135013, 0.0142163, 0.0149884, 0.0156778, 0.016356, 0.0170763, 0.0178405, 0.0186506, 0.0195086, + 0.0204169, 0.0213778, 0.0223939, 0.0234679, 0.0246027, 0.0258015, 0.00715734, 0.0071946, 0.00730714, + 0.00749731, 0.00776911, 0.00812824, 0.00858234, 0.00914108, 0.00981651, 0.0101289, 0.0104803, 0.0108724, + 0.0113068, 0.0117849, 0.0123087, 0.01288, 0.0135013, 0.014175, 0.0149041, 0.0155792, 0.016223, + 0.0169079, 0.0176356, 0.018408, 0.0192269, 0.0200945, 0.0210131, 0.0219851, 0.023013, 0.0240997, + 0.025248, 0.026461, 0.00854036, 0.00858234, 0.00870913, 0.0089233, 0.00922921, 0.00963311, 0.00994518, + 0.0101841, 0.0104617, 0.0107787, 0.0111362, 0.0115355, 0.011978, 0.0124653, 0.0129994, 0.0135823, + 0.0142163, 0.0149041, 0.0155464, 0.0161566, 0.0168071, 0.0174995, 0.0182354, 0.0190166, 0.0198452, + 0.0207232, 0.021653, 0.0226369, 0.0236777, 0.0247779, 0.0259407, 0.0271691, 0.00996351, 0.00998185, + 0.0100369, 0.0101289, 0.010258, 0.0104246, 0.0106292, 0.0108724, 0.0111551, 0.0114782, 0.0118428, + 0.0122501, 0.0127017, 0.0131993, 0.0137448, 0.0143403, 0.0149884, 0.0155792, 0.0161566, 0.0167736, + 0.0174315, 0.0181321, 0.0188769, 0.0196678, 0.0205068, 0.0213961, 0.0223379, 0.0233348, 0.0243893, + 0.0255042, 0.0266826, 0.0279274, 0.0106852, 0.0107039, 0.01076, 0.0108537, 0.0109853, 0.0111551, + 0.0113639, 0.0116121, 0.0119007, 0.0122306, 0.012603, 0.0130193, 0.0134811, 0.01399, 0.0145481, + 0.0151576, 0.0156778, 0.016223, 0.0168071, 0.0174315, 0.0180977, 0.0188071, 0.0195616, 0.020363, + 0.0212134, 0.0221148, 0.0230697, 0.0240804, 0.0251497, 0.0262803, 0.0274753, 0.0287378, 0.0114591, + 0.0114782, 0.0115355, 0.0116312, 0.0117657, 0.0119393, 0.0121527, 0.0124065, 0.0127017, 0.0130393, + 0.0134205, 0.0138467, 0.0143196, 0.014841, 0.0153662, 0.0158424, 0.016356, 0.0169079, 0.0174995, + 0.0181321, 
0.0188071, 0.0195263, 0.0202913, 0.021104, 0.0219666, 0.0228811, 0.0238499, 0.0248755, + 0.0259606, 0.027108, 0.0283208, 0.0296021, 0.0122891, 0.0123087, 0.0123673, 0.0124653, 0.012603, + 0.0127808, 0.0129994, 0.0132595, 0.013562, 0.0139081, 0.0142989, 0.0147361, 0.0152192, 0.0156285, + 0.0160738, 0.016556, 0.0170763, 0.0176356, 0.0182354, 0.0188769, 0.0195616, 0.0202913, 0.0210677, + 0.0218926, 0.0227683, 0.0236968, 0.0246805, 0.025722, 0.026824, 0.0279894, 0.0292211, 0.0303655, + 0.0131792, 0.0131993, 0.0132595, 0.01336, 0.0135013, 0.0136837, 0.0139081, 0.014175, 0.0144856, + 0.014841, 0.0152355, 0.0155792, 0.015958, 0.0163726, 0.0168239, 0.0173129, 0.0178405, 0.018408, + 0.0190166, 0.0196678, 0.020363, 0.021104, 0.0218926, 0.0227307, 0.0236204, 0.0245639, 0.0255636, + 0.026622, 0.027742, 0.0289264, 0.0300982, 0.0311219, 0.0141338, 0.0141544, 0.0142163, 0.0143196, + 0.0144648, 0.0146524, 0.014883, 0.0151576, 0.0154153, 0.0156942, 0.0160076, 0.016356, 0.0167401, + 0.0171607, 0.0176186, 0.0181149, 0.0186506, 0.0192269, 0.0198452, 0.0205068, 0.0212134, 0.0219666, + 0.0227683, 0.0236204, 0.024525, 0.0254845, 0.0265012, 0.0275778, 0.0287169, 0.0298984, 0.0308859, + 0.0319196, 0.0151576, 0.0151788, 0.0152355, 0.0153172, 0.0154317, 0.0155792, 0.0157601, 0.0159745, + 0.016223, 0.016506, 0.0168239, 0.0171775, 0.0175675, 0.0179947, 0.0184599, 0.0189641, 0.0195086, + 0.0200945, 0.0207232, 0.0213961, 0.0221148, 0.0228811, 0.0236968, 0.0245639, 0.0254845, 0.026461, + 0.0274958, 0.0285915, 0.0297509, 0.0307177, 0.0317152, 0.0327595, 0.0160076, 0.0160241, 0.0160738, + 0.0161566, 0.0162728, 0.0164226, 0.0166062, 0.0168239, 0.0170763, 0.0173637, 0.0176868, 0.0180461, + 0.0184426, 0.0188769, 0.01935, 0.0198629, 0.0204169, 0.0210131, 0.021653, 0.0223379, 0.0230697, + 0.0238499, 0.0246805, 0.0255636, 0.0265012, 0.0274958, 0.0285498, 0.0296658, 0.0306169, 0.0315792, + 0.0325874, 0.033643, 0.0168911, 0.0169079, 0.0169584, 0.0170426, 0.0171607, 0.0173129, 0.0174995, + 0.0177209, 
0.0179775, 0.0182698, 0.0185985, 0.0189641, 0.0193676, 0.0198096, 0.0202913, 0.0208136, + 0.0213778, 0.0219851, 0.0226369, 0.0233348, 0.0240804, 0.0248755, 0.025722, 0.026622, 0.0275778, + 0.0285915, 0.0296658, 0.0305833, 0.0315113, 0.0324843, 0.0335037, 0.0345713, 0.0178234, 0.0178405, + 0.0178918, 0.0179775, 0.0180977, 0.0182526, 0.0184426, 0.0186679, 0.0189292, 0.0192269, 0.0195616, + 0.0199341, 0.0203451, 0.0207955, 0.0212864, 0.0218188, 0.0223939, 0.023013, 0.0236777, 0.0243893, + 0.0251497, 0.0259606, 0.026824, 0.027742, 0.0287169, 0.0297509, 0.0306169, 0.0315113, 0.0324499, + 0.0334342, 0.0344656, 0.0355458, 0.0188071, 0.0188245, 0.0188769, 0.0189641, 0.0190866, 0.0192445, + 0.019438, 0.0196678, 0.0199341, 0.0202376, 0.0205788, 0.0209586, 0.0213778, 0.0218372, 0.0223379, + 0.0228811, 0.0234679, 0.0240997, 0.0247779, 0.0255042, 0.0262803, 0.027108, 0.0279894, 0.0289264, + 0.0298984, 0.0307177, 0.0315792, 0.0324843, 0.0334342, 0.0344305, 0.0354745, 0.036568, 0.0198452, + 0.0198629, 0.0199163, 0.0200053, 0.0201302, 0.0202913, 0.0204888, 0.0207232, 0.0209949, 0.0213046, + 0.021653, 0.0220406, 0.0224685, 0.0229376, 0.0234488, 0.0240035, 0.0246027, 0.025248, 0.0259407, + 0.0266826, 0.0274753, 0.0283208, 0.0292211, 0.0300982, 0.0308859, 0.0317152, 0.0325874, 0.0335037, + 0.0344656, 0.0354745, 0.0365319, 0.0376393, 0.0209405, 0.0209586, 0.0210131, 0.021104, 0.0212316, + 0.0213961, 0.0215978, 0.0218372, 0.0221148, 0.0224312, 0.022787, 0.0231831, 0.0236204, 0.0240997, + 0.0246222, 0.025189, 0.0258015, 0.026461, 0.0271691, 0.0279274, 0.0287378, 0.0296021, 0.0303655, + 0.0311219, 0.0319196, 0.0327595, 0.033643, 0.0345713, 0.0355458, 0.036568, 0.0376393, 0.0387616, + 0.000138107, 0.000160471, 0.000186456, 0.000216649, 0.000251732, 0.000292495, 0.00033986}; + +static float inv_dequant_stable[] = { + 0.000000, 3150.000000, 3139.258545, 2648.630371, 2234.681152, 1885.427490, 1590.758057, 1342.141724, + 3150.000000, 3150.000000, 3015.809570, 2576.583984, 2188.415039, 
1853.965576, 1568.540649, 1326.029297, + 3139.258545, 3015.809570, 2726.995361, 2389.616455, 2062.382568, 1765.966431, 1505.393555, 1279.748535, + 2648.630371, 2576.583984, 2389.616455, 2144.407471, 1885.427490, 1637.121094, 1410.374878, 1208.789673, + 2234.681152, 2188.415039, 2062.382568, 1885.427490, 1686.281982, 1485.426636, 1294.845093, 1060.593384, + 1885.427490, 1853.965576, 1765.966431, 1637.121094, 1485.426636, 1326.029297, 1169.492065, 785.962952, + 1590.758057, 1568.540649, 1505.393555, 1410.374878, 1294.845093, 1169.492065, 838.701721, 558.037292, + 1342.141724, 1326.029297, 1279.748535, 1208.789673, 1060.593384, 785.962952, 558.037292, 382.654694, + 0.000000, 560.000000, 558.510437, 489.194183, 428.480621, 375.302246, 328.723816, 287.926147, + 560.000000, 560.000000, 541.309387, 478.786804, 421.547455, 370.409943, 325.138336, 285.227325, + 558.510437, 541.309387, 500.443756, 451.472992, 402.494324, 356.627594, 314.885712, 277.434692, + 489.194183, 478.786804, 451.472992, 414.922729, 375.302246, 336.170715, 299.277435, 265.364777, + 428.480621, 421.547455, 402.494324, 375.302246, 344.016449, 311.624298, 279.983337, 250.119843, + 375.302246, 370.409943, 356.627594, 336.170715, 311.624298, 285.227325, 258.613495, 232.845169, + 328.723816, 325.138336, 314.885712, 299.277435, 279.983337, 258.613495, 236.484726, 214.558777, + 287.926147, 285.227325, 277.434692, 265.364777, 250.119843, 232.845169, 214.558777, 196.071777, + 0.000000, 293.959503, 169.469955, 119.412476, 85.333336, 85.333336, 83.550827, 58.871857, + 293.959503, 233.598114, 156.027161, 112.817505, 85.333336, 85.333336, 81.164711, 57.425171, + 169.469955, 156.027161, 126.804932, 96.600616, 85.333336, 85.333336, 74.576889, 53.372673, + 119.412476, 112.817505, 96.600616, 85.333336, 85.333336, 85.333336, 65.203850, 47.455181, + 85.333336, 85.333336, 85.333336, 85.333336, 85.333336, 72.553520, 54.677811, 39.419506, + 85.333336, 85.333336, 85.333336, 85.333336, 72.553520, 57.425171, 44.331757, 
29.212204, + 83.550827, 81.164711, 74.576889, 65.203850, 54.677811, 44.331757, 31.172369, 20.740799, + 58.871857, 57.425171, 53.372673, 47.455181, 39.419506, 29.212204, 20.740799, 14.222282, + 0.000000, 3160.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, + 3160.000000, 3160.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, + 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, + 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, + 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, + 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, + 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, + 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, 280.000000, + 0.000000, 864.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, + 864.000000, 864.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, + 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, + 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, + 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, + 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, + 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, + 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, 60.000000, + 0.000000, 200.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, + 200.000000, 200.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, + 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, + 18.000000, 18.000000, 
18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, + 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, + 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, + 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, + 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, 18.000000, + 0.000000, 3840.000000, 1280.000000, 1280.000000, 480.000000, 480.000000, 480.000000, 480.000000, + 3840.000000, 2560.000000, 1280.000000, 1280.000000, 480.000000, 480.000000, 480.000000, 480.000000, + 1280.000000, 1280.000000, 640.000000, 640.000000, 480.000000, 480.000000, 480.000000, 480.000000, + 1280.000000, 1280.000000, 640.000000, 640.000000, 480.000000, 480.000000, 480.000000, 480.000000, + 480.000000, 480.000000, 480.000000, 480.000000, 300.000000, 300.000000, 300.000000, 300.000000, + 480.000000, 480.000000, 480.000000, 480.000000, 300.000000, 300.000000, 300.000000, 300.000000, + 480.000000, 480.000000, 480.000000, 480.000000, 300.000000, 300.000000, 300.000000, 300.000000, + 480.000000, 480.000000, 480.000000, 480.000000, 300.000000, 300.000000, 300.000000, 300.000000, + 0.000000, 960.000000, 320.000000, 320.000000, 140.000000, 140.000000, 140.000000, 140.000000, + 960.000000, 640.000000, 320.000000, 320.000000, 140.000000, 140.000000, 140.000000, 140.000000, + 320.000000, 320.000000, 180.000000, 180.000000, 140.000000, 140.000000, 140.000000, 140.000000, + 320.000000, 320.000000, 180.000000, 180.000000, 140.000000, 140.000000, 140.000000, 140.000000, + 140.000000, 140.000000, 140.000000, 140.000000, 120.000000, 120.000000, 120.000000, 120.000000, + 140.000000, 140.000000, 140.000000, 140.000000, 120.000000, 120.000000, 120.000000, 120.000000, + 140.000000, 140.000000, 140.000000, 140.000000, 120.000000, 120.000000, 120.000000, 120.000000, + 140.000000, 140.000000, 140.000000, 140.000000, 120.000000, 120.000000, 120.000000, 
120.000000, + 0.000000, 640.000000, 128.000000, 128.000000, 32.000000, 32.000000, 32.000000, 32.000000, + 640.000000, 320.000000, 128.000000, 128.000000, 32.000000, 32.000000, 32.000000, 32.000000, + 128.000000, 128.000000, 64.000000, 64.000000, 32.000000, 32.000000, 32.000000, 32.000000, + 128.000000, 128.000000, 64.000000, 64.000000, 32.000000, 32.000000, 32.000000, 32.000000, + 32.000000, 32.000000, 32.000000, 32.000000, 16.000000, 16.000000, 16.000000, 16.000000, + 32.000000, 32.000000, 32.000000, 32.000000, 16.000000, 16.000000, 16.000000, 16.000000, + 32.000000, 32.000000, 32.000000, 32.000000, 16.000000, 16.000000, 16.000000, 16.000000, + 32.000000, 32.000000, 32.000000, 32.000000, 16.000000, 16.000000, 16.000000, 16.000000, + 0.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, + 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, + 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, + 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, + 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, + 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, + 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, + 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, 2200.000000, + 0.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, + 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, + 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, + 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, + 392.000000, 
392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, + 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, + 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, + 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, 392.000000, + 0.000000, 112.000000, 95.651627, 95.651627, 81.689583, 81.689583, 68.239342, 68.239342, + 112.000000, 112.000000, 95.651627, 95.651627, 81.689583, 81.689583, 68.239342, 68.239342, + 95.651627, 95.651627, 89.600014, 89.600014, 78.702759, 78.702759, 65.137154, 65.137154, + 95.651627, 95.651627, 89.600014, 89.600014, 78.702759, 78.702759, 65.137154, 65.137154, + 81.689583, 81.689583, 78.702759, 78.702759, 71.680023, 71.680023, 57.363346, 57.363346, + 81.689583, 81.689583, 78.702759, 78.702759, 71.680023, 71.680023, 57.363346, 57.363346, + 68.239342, 68.239342, 65.137154, 65.137154, 57.363346, 57.363346, 47.786716, 47.786716, + 68.239342, 68.239342, 65.137154, 65.137154, 57.363346, 57.363346, 47.786716, 47.786716, + 0.000000, 0.000000, 5616.416016, 4437.547852, 3710.523682, 3312.083740, 2956.428467, 2638.963867, + 2378.979736, 2146.230957, 1936.253296, 1722.186157, 1498.605713, 1304.051636, 1134.754883, 951.882080, + 0.000000, 0.000000, 5312.582520, 4271.097168, 3658.995850, 3275.037109, 2928.764160, 2617.745361, + 2363.779541, 2134.027100, 1926.335571, 1711.357300, 1489.962646, 1297.105591, 1129.140381, 946.136963, + 5616.416016, 5312.582520, 4620.592773, 3880.564697, 3516.761230, 3170.294189, 2849.415527, 2562.006348, + 2319.431641, 2098.261719, 1897.172852, 1679.534424, 1464.505249, 1276.608887, 1112.546509, 929.184143, + 4437.547852, 4271.097168, 3880.564697, 3609.647705, 3312.083740, 3013.749512, 2727.902588, 2474.977295, + 2249.396484, 2041.305786, 1850.436279, 1628.609985, 1423.584961, 1243.542969, 1077.572754, 901.837036, + 3710.523682, 3658.995850, 3516.761230, 3312.083740, 3073.944824, 
2824.097412, 2580.273682, 2363.779541, + 2158.580811, 1966.619507, 1778.076416, 1561.425903, 1369.259766, 1199.417236, 1031.115479, 865.357239, + 3312.083740, 3275.037109, 3170.294189, 3013.749512, 2824.097412, 2617.745117, 2425.913330, 2235.929932, + 2052.443848, 1878.206177, 1679.534424, 1481.398804, 1304.051636, 1146.111572, 975.344788, 821.329834, + 2956.428467, 2928.764160, 2849.415527, 2727.902588, 2580.273682, 2425.913330, 2263.037598, 2098.261719, + 1936.253296, 1766.659668, 1570.745850, 1392.135254, 1230.684570, 1077.572754, 912.642517, 771.521240, + 2638.963867, 2617.745361, 2562.006348, 2474.977295, 2363.779541, 2235.929932, 2098.261719, 1956.393188, + 1813.078247, 1628.609985, 1456.172852, 1297.105591, 1151.854614, 993.464294, 845.405334, 717.737732, + 2378.979736, 2363.779541, 2319.431641, 2249.396484, 2158.580811, 2052.443848, 1936.253296, 1813.078247, + 1648.672119, 1489.962646, 1339.664185, 1199.417480, 1057.315552, 907.217957, 775.878479, 661.709290, + 2146.230957, 2134.027100, 2098.261719, 2041.305786, 1966.619507, 1878.206177, 1766.659668, 1628.609985, + 1489.962646, 1354.335571, 1224.331787, 1098.371216, 951.882080, 821.329834, 706.041565, 604.996033, + 1936.253296, 1926.335571, 1897.172852, 1850.436279, 1778.076416, 1679.534424, 1570.745850, 1456.172852, + 1339.664185, 1224.331787, 1112.546143, 975.344482, 850.334167, 737.812134, 637.541504, 531.866638, + 1722.186157, 1711.357300, 1679.534424, 1628.609985, 1561.425903, 1481.398804, 1392.135254, 1297.105591, + 1199.417480, 1098.371216, 975.344482, 860.309998, 754.414917, 658.183533, 565.168762, 455.065186, + 1498.605713, 1489.962646, 1464.505249, 1423.584961, 1369.259766, 1304.051636, 1230.684570, 1151.854614, + 1057.315552, 951.882080, 850.334167, 754.414917, 665.260315, 582.761047, 475.564758, 385.666412, + 1304.051636, 1297.105591, 1276.608887, 1243.542969, 1199.417236, 1146.111572, 1077.572754, 993.464294, + 907.217957, 821.329834, 737.812134, 658.183533, 582.761047, 482.643036, 396.775940, 
324.039429, + 1134.754883, 1129.140381, 1112.546509, 1077.572754, 1031.115479, 975.344788, 912.642517, 845.405334, + 775.878479, 706.041565, 637.541504, 565.168762, 475.564758, 396.775940, 328.516357, 270.136078, + 951.882080, 946.136963, 929.184143, 901.837036, 865.357239, 821.329834, 771.521240, 717.737732, + 661.709290, 604.996033, 531.866638, 455.065186, 385.666412, 324.039429, 270.136078, 223.608490, + 0.000000, 0.000000, 2384.412598, 2060.989746, 1763.609009, 1491.737793, 1261.776978, 1067.266357, + 956.677612, 861.364075, 775.546631, 703.312927, 644.910889, 591.358521, 542.252991, 501.345215, + 0.000000, 0.000000, 2303.758789, 2012.809937, 1727.632080, 1467.211548, 1244.414307, 1054.643066, + 950.447205, 856.371826, 771.497620, 700.552734, 642.589600, 589.392944, 540.578857, 500.060272, + 2384.412598, 2303.758789, 2113.184082, 1884.007446, 1629.571411, 1398.579590, 1195.044922, 1031.757080, + 932.273987, 841.744202, 759.593811, 692.403137, 635.722961, 583.569458, 535.612549, 496.242188, + 2060.989746, 2012.809937, 1884.007446, 1693.401611, 1491.737793, 1297.998291, 1120.699707, 996.043396, + 903.588257, 818.460022, 740.524109, 679.239563, 624.590454, 574.100037, 528.409058, 489.997620, + 1763.609009, 1727.632080, 1629.571411, 1491.737793, 1336.388306, 1179.428345, 1039.256348, 950.447205, + 866.416626, 787.946472, 717.456177, 661.633423, 609.623047, 561.314270, 518.629089, 481.495361, + 1491.737793, 1467.211548, 1398.579590, 1297.998291, 1179.428345, 1054.642944, 975.919922, 898.074402, + 823.012451, 751.853821, 692.403137, 640.284668, 591.358521, 545.629761, 506.546997, 470.954254, + 1261.776978, 1244.414307, 1195.044922, 1120.699707, 1039.256348, 975.919922, 909.174133, 841.744202, + 775.546631, 714.580872, 664.092590, 615.952393, 570.392151, 528.409058, 492.477875, 458.628601, + 1067.266357, 1054.643066, 1031.757080, 996.043396, 950.447205, 898.074402, 841.744202, 783.770203, + 726.228333, 679.239563, 633.465698, 589.392944, 547.332581, 510.515045, 
476.757660, 444.792908, + 956.677612, 950.447205, 932.273987, 903.588257, 866.416626, 823.012451, 775.546631, 726.228333, + 684.443726, 642.589600, 601.375000, 561.314392, 524.175049, 491.234863, 459.724792, 429.728699, + 861.364075, 856.371826, 841.744202, 818.460022, 787.946472, 751.853821, 714.580872, 679.239563, + 642.589600, 605.472290, 568.554810, 532.708740, 501.345215, 470.954254, 441.705719, 413.711823, + 775.546631, 771.497620, 759.593811, 740.524109, 717.456177, 692.403137, 664.092590, 633.465698, + 601.375000, 568.554810, 535.612427, 506.546936, 477.933990, 450.024689, 423.003998, 395.167694, + 703.312927, 700.552734, 692.403137, 679.239563, 661.633423, 640.284668, 615.952393, 589.392944, + 561.314392, 532.708740, 506.546936, 480.302856, 454.290039, 428.756592, 403.216187, 375.228302, + 644.910889, 642.589600, 635.722961, 624.590454, 609.623047, 591.358521, 570.392151, 547.332581, + 524.175049, 501.345215, 477.933990, 454.290039, 430.704803, 407.340515, 380.757690, 355.171173, + 591.358521, 589.392944, 583.569458, 574.100037, 561.314270, 545.629761, 528.409058, 510.515045, + 491.234863, 470.954254, 450.024689, 428.756592, 407.340515, 382.629913, 358.535706, 335.223267, + 542.252991, 540.578857, 535.612549, 528.409058, 518.629089, 506.546997, 492.477875, 476.757660, + 459.724792, 441.705719, 423.003998, 403.216187, 380.757690, 358.535706, 336.753815, 315.574097, + 501.345215, 500.060272, 496.242188, 489.997620, 481.495361, 470.954254, 458.628601, 444.792908, + 429.728699, 413.711823, 395.167694, 375.228302, 355.171173, 335.223267, 315.574097, 296.378265, + 0.000000, 0.000000, 615.613831, 448.953400, 337.930267, 263.807556, 205.943130, 160.770889, + 141.832733, 126.301643, 112.471252, 100.763390, 91.120811, 82.400993, 74.515610, 58.896236, + 0.000000, 0.000000, 571.402039, 426.532227, 327.784393, 257.417816, 201.765564, 157.966431, + 140.812332, 125.492966, 111.822540, 100.304680, 90.740356, 82.083275, 74.248734, 58.393326, + 615.613831, 571.402039, 
473.941895, 372.602753, 300.644775, 239.809601, 190.039810, 154.182663, + 137.840027, 123.126366, 109.917458, 98.951996, 89.616219, 81.142967, 73.457825, 56.916744, + 448.953400, 426.532227, 372.602753, 318.224457, 263.807556, 214.746811, 172.817261, 148.295853, + 133.160797, 119.368141, 106.872108, 96.772522, 87.797852, 79.617172, 70.208313, 54.558437, + 337.930267, 327.784393, 300.644775, 263.807556, 224.206940, 186.378311, 155.421555, 140.812332, + 127.120590, 114.460091, 103.118340, 93.868050, 85.361305, 77.563431, 65.959373, 51.458752, + 263.807556, 257.417816, 239.809601, 214.746811, 186.378311, 157.966400, 144.988541, 132.263153, + 120.102051, 108.680435, 98.951996, 90.362801, 82.400993, 75.054314, 60.963200, 47.789742, + 205.943130, 201.765564, 190.039810, 172.817261, 155.421555, 144.988541, 134.070770, 123.126366, + 112.471252, 102.638969, 94.273003, 86.390495, 79.020828, 70.208313, 55.486752, 43.736801, + 160.770889, 157.966431, 154.182663, 148.295853, 140.812332, 132.263153, 123.126366, 113.789886, + 104.582710, 96.772522, 89.247108, 82.083275, 75.326157, 62.573708, 49.786186, 39.481380, + 141.832733, 140.812332, 137.840027, 133.160797, 127.120590, 120.102051, 112.471252, 104.582710, + 97.633369, 90.740356, 84.022667, 77.563446, 68.346024, 55.020145, 44.087116, 35.187599, + 126.301643, 125.492966, 123.126366, 119.368141, 114.460091, 108.680435, 102.638969, 96.772522, + 90.740356, 84.687279, 78.725555, 72.135590, 58.896236, 47.789742, 38.573082, 30.993063, + 112.471252, 111.822540, 109.917458, 106.872108, 103.118340, 98.951996, 94.273003, 89.247108, + 84.022667, 78.725555, 73.457809, 60.963173, 50.197853, 41.054691, 33.381031, 24.780676, + 100.763390, 100.304680, 98.951996, 96.772522, 93.868050, 90.362801, 86.390495, 82.083275, + 77.563446, 72.135590, 60.963173, 51.034103, 42.369473, 34.922314, 27.726070, 18.572216, + 91.120811, 90.740356, 89.616219, 87.797852, 85.361305, 82.400993, 79.020828, 75.326157, + 68.346024, 58.896236, 50.197853, 42.369473, 
35.455399, 29.343132, 20.148905, 13.676408, + 82.400993, 82.083275, 81.142967, 79.617172, 77.563431, 75.054314, 70.208313, 62.573708, + 55.020145, 47.789742, 41.054691, 34.922314, 29.343132, 20.706997, 14.413850, 9.911549, + 74.515610, 74.248734, 73.457825, 70.208313, 65.959373, 60.963200, 55.486752, 49.786186, + 44.087116, 38.573082, 33.381031, 27.726070, 20.148905, 14.413850, 10.166267, 7.079802, + 58.896236, 58.393326, 56.916744, 54.558437, 51.458752, 47.789742, 43.736801, 39.481380, + 35.187599, 30.993063, 24.780676, 18.572216, 13.676408, 9.911549, 7.079802, 4.991220, + 0.000000, 0.000000, 0.000000, 0.000000, 10016.177734, 8949.019531, 7995.559082, 7162.601074, + 6422.475586, 5758.828613, 5163.758301, 4630.176758, 4151.732422, 3734.188232, 3370.109863, 3041.528564, + 2744.983643, 2477.351074, 2235.813232, 2038.749634, 1932.109741, 1831.047485, 1735.271729, 1644.505737, + 1558.487183, 1476.968018, 1386.826660, 1301.528687, 1221.477173, 1146.349243, 1075.842163, 1009.671509, + 0.000000, 0.000000, 0.000000, 0.000000, 9878.224609, 8849.744141, 7921.355469, 7107.295410, + 6379.011230, 5724.145508, 5135.744141, 4607.326172, 4132.939453, 3719.505127, 3357.800781, 3031.157227, + 2736.206543, 2469.894287, 2229.455811, 2035.871338, 1929.518066, 1828.708130, 1733.155273, 1642.587036, + 1556.744507, 1475.382324, 1385.135010, 1300.000122, 1220.093750, 1145.095825, 1074.704956, 1008.638672, + 0.000000, 0.000000, 0.000000, 0.000000, 9497.340820, 8569.009766, 7710.195312, 6947.082520, + 6252.300781, 5622.568359, 5053.416504, 4539.993652, 4077.450684, 3676.055664, 3321.326660, 3000.390625, + 2710.143555, 2447.733643, 2210.550537, 2027.284180, 1921.783081, 1821.723755, 1726.834595, 1636.855469, + 1551.537354, 1470.240967, 1380.081177, 1295.431274, 1215.958252, 1141.347534, 1071.303833, 1005.549255, + 0.000000, 0.000000, 0.000000, 0.000000, 8949.019531, 8149.289551, 7394.224121, 6697.344727, + 6052.488281, 5461.019531, 4921.634277, 4431.667969, 3987.818604, 3605.573242, 
3262.003174, 2950.239990, + 2667.581299, 2411.486328, 2179.584961, 2013.130249, 1909.023438, 1810.194092, 1716.393799, 1627.382446, + 1542.927124, 1460.985596, 1371.723022, 1287.873291, 1209.114380, 1135.142578, 1065.671997, 1000.432007, + 10016.177734, 9878.224609, 9497.340820, 8949.019531, 8310.704102, 7644.405273, 6999.567383, 6379.011230, + 5793.938965, 5249.588867, 4747.628418, 4287.628418, 3871.052002, 3510.746094, 3181.889648, 2882.297852, + 2609.764404, 2362.132812, 2137.337891, 1993.638184, 1891.430786, 1794.280640, 1701.970703, 1614.285400, + 1531.013916, 1448.186157, 1360.157349, 1277.407837, 1199.633057, 1126.542847, 1057.862915, 993.333557, + 8949.019531, 8849.744141, 8569.009766, 8149.289551, 7644.405273, 7107.295410, 6556.779785, 6014.109863, + 5492.590332, 4999.912598, 4539.993652, 4114.296875, 3734.188232, 3394.959229, 3083.605469, 2798.613281, + 2538.306152, 2300.954346, 2084.833496, 1969.113525, 1869.262207, 1774.201294, 1683.749756, 1597.722046, + 1515.933594, 1431.994873, 1345.514648, 1264.148682, 1187.612061, 1115.632202, 1047.949463, 984.940796, + 7995.559082, 7921.355469, 7710.195312, 7394.224121, 6999.567383, 6556.779785, 6091.377441, 5622.568359, + 5163.758301, 4723.701172, 4307.688965, 3918.657715, 3578.032715, 3262.003174, 2970.133789, 2701.544189, + 2455.086426, 2229.455811, 2041.636230, 1939.925049, 1842.829834, 1750.221191, 1661.957764, 1577.887085, + 1497.853394, 1412.599976, 1327.956665, 1248.234619, 1173.171875, 1102.515625, 1036.023804, 975.335022, + 7162.601074, 7107.295410, 6947.082520, 6697.344727, 6379.011230, 6014.109863, 5622.568359, 5220.674805, + 4820.776367, 4431.667969, 4059.243408, 3719.505127, 3407.500000, 3115.799805, 2844.609619, 2593.611328, + 2362.133057, 2149.279541, 2004.730957, 1906.490967, 1812.489014, 1722.644409, 1636.855469, 1555.005493, + 1476.968018, 1390.219116, 1307.671753, 1229.829224, 1156.454346, 1087.316772, 1022.192383, 964.170288, + 6422.475586, 6379.011230, 6252.300781, 6052.488281, 5793.938965, 
5492.590332, 5163.758301, 4820.776367, + 4474.431152, 4132.939453, 3809.151123, 3510.745605, 3227.259766, 2960.159424, 2710.143555, 2477.351562, + 2261.523438, 2062.126221, 1963.743530, 1869.262207, 1778.627319, 1691.803589, 1608.729980, 1529.325195, + 1450.003418, 1365.094971, 1284.869629, 1209.114380, 1137.618530, 1070.173828, 1006.577637, 951.533936, + 5758.828613, 5724.145508, 5622.568359, 5461.019531, 5249.588867, 4999.912598, 4723.701172, 4431.667969, + 4132.939453, 3839.881836, 3564.398193, 3297.380859, 3041.528564, 2798.613281, 2569.677490, 2355.211426, + 2155.288330, 2013.130249, 1919.218018, 1828.708130, 1741.649902, 1658.050537, 1577.887085, 1501.112305, + 1419.603271, 1337.488037, 1259.777100, 1186.287842, 1116.836182, 1051.238037, 989.356018, 937.521851, + 5163.758301, 5135.744141, 5053.416504, 4921.634277, 4747.628418, 4539.993652, 4307.688965, 4059.243408, + 3809.151123, 3564.398193, 3321.326660, 3083.605469, 2853.956543, 2634.296387, 2425.883789, 2229.455811, + 2053.263184, 1961.068726, 1871.700928, 1785.304199, 1701.970703, 1621.745972, 1544.642456, 1470.240967, + 1386.826660, 1307.671509, 1232.633179, 1161.558716, 1094.291504, 1030.670776, 972.740723, 922.236572, + 4630.176758, 4607.326172, 4539.993652, 4431.667969, 4287.628418, 4114.296875, 3918.657715, 3719.505127, + 3510.745605, 3297.380859, 3083.605469, 2872.799805, 2667.581299, 2469.894531, 2281.107178, 2102.114502, + 1993.638428, 1906.490967, 1821.723877, 1739.518799, 1660.001831, 1583.252930, 1509.314819, 1431.994873, + 1351.991333, 1275.923828, 1203.682495, 1135.142578, 1070.173828, 1008.638672, 954.878296, 905.786682, + 4151.732422, 4132.939453, 4077.450684, 3987.818604, 3871.052002, 3734.188232, 3578.032715, 3407.500000, + 3227.259766, 3041.528564, 2853.956543, 2667.581299, 2484.843994, 2307.630371, 2137.337891, 2015.945557, + 1932.109741, 1849.967896, 1769.796143, 1691.803589, 1616.144531, 1542.927124, 1472.104004, 1391.920288, + 1315.414917, 1242.525757, 1173.171875, 1107.257935, 
1044.676514, 985.821167, 935.894348, 888.283264, + 3734.188232, 3719.505127, 3676.055664, 3605.573242, 3510.746094, 3394.959229, 3262.003174, 3115.799805, + 2960.159424, 2798.613281, 2634.296387, 2469.894531, 2307.630371, 2149.279541, 2027.284424, 1947.802002, + 1869.262207, 1792.028931, 1716.393799, 1642.587036, 1570.783203, 1501.112305, 1426.662964, 1350.367676, + 1277.407837, 1207.752563, 1141.347534, 1078.122192, 1017.992920, 964.170288, 915.916138, 869.840393, + 3370.109863, 3357.800781, 3321.326660, 3262.003174, 3181.889648, 3083.605469, 2970.133789, 2844.609619, + 2710.143555, 2569.677490, 2425.883789, 2281.107178, 2137.337891, 2027.284424, 1953.087891, 1879.053223, + 1805.620605, 1733.155273, 1661.957764, 1592.268433, 1524.278809, 1455.477539, 1380.081177, 1307.671509, + 1238.270752, 1171.872925, 1108.448364, 1047.949463, 990.314453, 941.609558, 895.070007, 850.572571, + 3041.528564, 3031.157227, 3000.390625, 2950.239990, 2882.297852, 2798.613281, 2701.544189, 2593.611328, + 2477.351562, 2355.211426, 2229.455811, 2102.114502, 2015.945557, 1947.802002, 1879.053223, 1810.194092, + 1741.649902, 1673.779907, 1606.886108, 1541.215088, 1476.968018, 1403.923828, 1332.708740, 1264.148682, + 1198.288086, 1135.142578, 1074.704956, 1016.947144, 965.022034, 918.278442, 873.481934, 830.592407, + 2744.983643, 2736.206543, 2710.143555, 2667.581299, 2609.764404, 2538.306152, 2455.086426, 2362.133057, + 2261.523438, 2155.288330, 2053.263184, 1993.638428, 1932.109741, 1869.262207, 1805.620605, 1741.649902, + 1677.755005, 1614.285400, 1551.537354, 1489.759766, 1421.362915, 1351.991333, 1284.869263, 1220.093750, + 1157.727417, 1097.804565, 1040.336426, 985.821167, 939.153625, 894.312012, 851.274536, 810.011841, + 2477.351074, 2469.894287, 2447.733643, 2411.486328, 2362.132812, 2300.954346, 2229.455811, 2149.279541, + 2062.126221, 2013.130249, 1961.068726, 1906.490967, 1849.967896, 1792.028931, 1733.155273, 1673.779907, + 1614.285400, 1555.005493, 1496.228516, 1431.994873, 
1365.094971, 1300.000122, 1236.857544, 1175.777100, + 1116.836182, 1060.084961, 1005.549255, 957.398621, 912.780823, 869.840393, 828.566345, 788.938354, + 2235.813232, 2229.455811, 2210.550537, 2179.584961, 2137.337891, 2084.833496, 2041.636230, 2004.730957, + 1963.743530, 1919.218018, 1871.700928, 1821.723877, 1769.796143, 1716.393799, 1661.957764, 1606.886108, + 1551.537354, 1496.228516, 1435.567749, 1371.723022, 1309.214355, 1248.234619, 1188.938477, 1131.444702, + 1075.842163, 1022.192383, 972.740723, 928.625183, 886.038757, 844.988281, 805.471558, 767.476257, + 2038.749634, 2035.871338, 2027.284180, 2013.130249, 1993.638184, 1969.113525, 1939.925049, 1906.490967, + 1869.262207, 1828.708130, 1785.304199, 1739.518799, 1691.803589, 1642.587036, 1592.268433, 1541.215088, + 1489.759766, 1431.994873, 1371.723022, 1312.308716, 1253.984985, 1196.945190, 1141.347534, 1087.316772, + 1034.949585, 984.940796, 941.609558, 899.638794, 859.054749, 819.872620, 782.097107, 745.724487, + 1932.109741, 1929.518066, 1921.783081, 1909.023438, 1891.430786, 1869.262207, 1842.829834, 1812.489014, + 1778.627319, 1741.649902, 1701.970703, 1660.001831, 1616.144531, 1570.783203, 1524.278809, 1476.968018, + 1421.362915, 1365.094971, 1309.214355, 1253.984985, 1199.633057, 1146.349243, 1094.291504, 1043.588867, + 994.342957, 951.534058, 910.440247, 870.566956, 831.947388, 794.602966, 758.545227, 723.776733, + 1831.047485, 1828.708130, 1821.723755, 1810.194092, 1794.280640, 1774.201294, 1750.221191, 1722.644409, + 1691.803589, 1658.050537, 1621.745972, 1583.252930, 1542.927124, 1501.112305, 1455.477539, 1403.923828, + 1351.991333, 1300.000122, 1248.234619, 1196.945190, 1146.349243, 1096.631470, 1047.949463, 1000.432007, + 958.241089, 918.278259, 879.356750, 841.526489, 804.825806, 769.281250, 734.910400, 701.720947, + 1735.271729, 1733.155273, 1726.834595, 1716.393799, 1701.970703, 1683.749756, 1661.957764, 1636.855469, + 1608.729980, 1577.887085, 1544.642456, 1509.314819, 1472.104004, 
1426.662964, 1380.081177, 1332.708740, + 1284.869263, 1236.857544, 1188.938477, 1141.347534, 1094.291504, 1047.949463, 1002.474182, 961.622131, + 923.031555, 885.292297, 848.471924, 812.623657, 777.789856, 744.001404, 711.280090, 684.970581, + 1644.505737, 1642.587036, 1636.855469, 1627.382446, 1614.285400, 1597.722046, 1577.887085, 1555.005493, + 1529.325195, 1501.112305, 1470.240967, 1431.994873, 1391.920288, 1350.367676, 1307.671509, 1264.148682, + 1220.093750, 1175.777100, 1131.444702, 1087.316772, 1043.588867, 1000.432007, 961.622131, 924.624451, + 888.283264, 852.680969, 817.885742, 783.954041, 750.929993, 718.848206, 690.509705, 669.787170, + 1558.487183, 1556.744507, 1551.537354, 1542.927124, 1531.013916, 1515.933594, 1497.853394, 1476.968018, + 1450.003418, 1419.603271, 1386.826660, 1351.991333, 1315.414917, 1277.407837, 1238.270752, 1198.288086, + 1157.727417, 1116.836182, 1075.842163, 1034.949585, 994.342957, 958.241089, 923.031555, 888.283264, + 854.091187, 820.536316, 787.687988, 755.602661, 724.327698, 694.713867, 674.449768, 654.522705, + 1476.968018, 1475.382324, 1470.240967, 1460.985596, 1448.186157, 1431.994873, 1412.599976, 1390.219116, + 1365.094971, 1337.488037, 1307.671509, 1275.923828, 1242.525757, 1207.752563, 1171.872925, 1135.142578, + 1097.804565, 1060.084961, 1022.192383, 984.940796, 951.534058, 918.278259, 885.292297, 852.680969, + 820.536316, 788.938354, 757.955322, 727.644897, 698.054810, 677.813538, 658.364380, 639.217041, + 1386.826660, 1385.135010, 1380.081177, 1371.723022, 1360.157349, 1345.514648, 1327.956665, 1307.671753, + 1284.869629, 1259.777100, 1232.633179, 1203.682495, 1173.171875, 1141.347534, 1108.448364, 1074.704956, + 1040.336426, 1005.549255, 972.740723, 941.609558, 910.440247, 879.356750, 848.471924, 817.885742, + 787.687988, 757.955322, 728.755737, 700.146423, 679.845642, 660.946289, 642.291992, 623.906799, + 1301.528687, 1300.000122, 1295.431274, 1287.873291, 1277.407837, 1264.148682, 1248.234619, 1229.829224, + 
1209.114380, 1186.287842, 1161.558716, 1135.142578, 1107.257935, 1078.122192, 1047.949463, 1016.947144, + 985.821167, 957.398621, 928.625183, 899.638794, 870.566956, 841.526489, 812.623657, 783.954041, + 755.602661, 727.644897, 700.146423, 680.525146, 662.243774, 644.148804, 626.268066, 608.625916, + 1221.477173, 1220.093750, 1215.958252, 1209.114380, 1199.633057, 1187.612061, 1173.171875, 1156.454346, + 1137.618530, 1116.836182, 1094.291504, 1070.173828, 1044.676514, 1017.992920, 990.314453, 965.022034, + 939.153625, 912.780823, 886.038757, 859.054749, 831.947388, 804.825806, 777.789856, 750.929993, + 724.327698, 698.054810, 679.845642, 662.243774, 644.769775, 627.454285, 610.324890, 593.406128, + 1146.349243, 1145.095825, 1141.347534, 1135.142578, 1126.542847, 1115.632202, 1102.515625, 1087.316772, + 1070.173828, 1051.238037, 1030.670776, 1008.638672, 985.821167, 964.170288, 941.609558, 918.278442, + 894.312012, 869.840393, 844.988281, 819.872620, 794.602966, 769.281250, 744.001404, 718.848206, + 694.713867, 677.813538, 660.946289, 644.148804, 627.454285, 610.892944, 594.491943, 578.275757, + 1075.842163, 1074.704956, 1071.303833, 1065.671997, 1057.862915, 1047.949463, 1036.023804, 1022.192383, + 1006.577637, 989.356018, 972.740723, 954.878296, 935.894348, 915.916138, 895.070007, 873.481934, + 851.274536, 828.566345, 805.471558, 782.097107, 758.545227, 734.910400, 711.280090, 690.509705, + 674.449768, 658.364380, 642.291992, 626.268066, 610.324890, 594.491943, 578.796021, 563.260986, + 1009.671509, 1008.638672, 1005.549255, 1000.432007, 993.333557, 984.940796, 975.335022, 964.170288, + 951.533936, 937.521851, 922.236572, 905.786682, 888.283264, 869.840393, 850.572571, 830.592407, + 810.011841, 788.938354, 767.476257, 745.724487, 723.776733, 701.720947, 684.970581, 669.787170, + 654.522705, 639.217041, 623.906799, 608.625916, 593.406128, 578.275757, 563.260986, 548.385559, + 0.000000, 0.000000, 0.000000, 0.000000, 5011.678711, 4561.026367, 4150.897949, 
3787.853271, + 3459.890381, 3160.322998, 2886.693115, 2636.754883, 2408.457275, 2220.788330, 2069.294189, 1928.134521, + 1796.604248, 1674.046265, 1559.848999, 1455.328247, 1364.407104, 1279.166016, 1199.250488, 1124.327759, + 1054.085815, 988.231934, 932.328857, 879.889832, 830.400330, 783.694336, 739.615356, 698.015564, + 0.000000, 0.000000, 0.000000, 0.000000, 4953.881836, 4518.670410, 4118.654297, 3763.552734, + 3440.437256, 3144.510986, 2873.683594, 2625.945068, 2399.401855, 2214.770264, 2064.085693, 1923.603760, + 1792.645508, 1670.574097, 1556.793091, 1452.861328, 1362.209839, 1277.203857, 1197.494507, 1122.752808, + 1052.670532, 986.958130, 931.291748, 878.947388, 829.542664, 782.912842, 738.902405, 697.364319, + 0.000000, 0.000000, 0.000000, 0.000000, 4793.614746, 4398.468262, 4026.789795, 3692.973877, + 3383.596924, 3098.108398, 2835.382080, 2594.041016, 2372.622803, 2196.918701, 2048.617188, 1910.134766, + 1780.867554, 1660.236206, 1547.688721, 1445.505981, 1355.655151, 1271.348633, 1192.252319, 1118.050049, + 1048.443970, 983.335632, 928.192505, 876.130005, 826.978088, 780.575439, 736.769653, 695.416138, + 0.000000, 0.000000, 0.000000, 0.000000, 4561.026367, 4217.543457, 3889.284912, 3582.401611, + 3293.564941, 3024.014893, 2773.850098, 2542.543945, 2329.235352, 2167.820068, 2023.344727, 1888.087524, + 1761.557373, 1643.263794, 1532.723755, 1433.395996, 1344.854126, 1261.692749, 1183.601440, 1110.284790, + 1041.461182, 977.688965, 923.064758, 871.467163, 822.732300, 776.704590, 733.236694, 692.188110, + 5011.678711, 4953.881836, 4793.614746, 4561.026367, 4287.298828, 3998.239258, 3716.125000, 3440.437256, + 3176.312988, 2926.477539, 2692.172852, 2473.735840, 2276.533936, 2128.389404, 1988.987183, 1858.031494, + 1735.171753, 1620.026123, 1512.199829, 1416.746704, 1329.985962, 1248.385864, 1171.668457, 1099.563843, + 1031.812866, 969.874817, 915.964600, 865.006897, 816.846924, 771.336731, 728.335144, 687.707825, + 4561.026367, 4518.670410, 4398.468262, 
4217.543457, 3998.239258, 3763.552734, 3519.859619, 3276.214600, + 3038.523682, 2810.433105, 2594.041016, 2390.411621, 2220.788330, 2079.791016, 1946.466553, 1820.706665, + 1702.307983, 1591.010010, 1486.515869, 1395.845215, 1311.290039, 1231.629150, 1156.622437, 1086.030518, + 1019.620911, 959.981201, 906.967834, 856.815063, 809.379150, 764.521179, 722.108276, 682.245117, + 4150.897949, 4118.654297, 4026.789795, 3889.284912, 3716.125000, 3519.859619, 3311.126465, 3098.108398, + 2886.693115, 2680.902100, 2483.341797, 2295.779541, 2156.401855, 2023.344727, 1896.847168, 1776.973755, + 1663.669312, 1556.793091, 1457.802856, 1371.036987, 1289.056152, 1211.667358, 1138.670532, 1069.861084, + 1005.035828, 948.117004, 896.168518, 846.973267, 800.399719, 756.319885, 714.610474, 675.848572, + 3787.853271, 3763.552734, 3692.973877, 3582.401611, 3440.437256, 3276.214600, 3098.108398, 2913.088135, + 2726.568848, 2542.543945, 2363.822754, 2214.770264, 2085.079346, 1960.440552, 1841.264404, 1727.767578, + 1620.026245, 1518.013916, 1426.217773, 1342.712280, 1263.613770, 1188.779175, 1118.050049, 1051.258545, + 988.231995, 934.408508, 883.676331, 835.576843, 789.991882, 746.805908, 705.905090, 668.410767, + 3459.890381, 3440.437256, 3383.596924, 3293.564941, 3176.312988, 3038.523682, 2886.693115, 2726.568848, + 2562.898193, 2399.401855, 2251.398438, 2128.389404, 2008.477295, 1892.457642, 1780.867554, 1674.046387, + 1572.181152, 1475.345703, 1391.275391, 1311.290039, 1235.319580, 1163.268921, 1095.021729, 1030.446289, + 970.984558, 918.996521, 869.613525, 822.732300, 778.249512, 736.060913, 696.064697, 659.988525, + 3160.322998, 3144.510986, 3098.108398, 3024.014893, 2926.477539, 2810.433105, 2680.902100, 2542.543945, + 2399.401855, 2263.892822, 2150.739014, 2038.433105, 1928.134521, 1820.706665, 1716.770630, 1616.755371, + 1520.935669, 1433.395996, 1353.482666, 1277.203857, 1204.545532, 1135.456787, 1069.861084, 1007.662292, + 952.402710, 902.032532, 854.112671, 808.556152, 
765.273621, 724.174500, 685.184509, 650.644348, + 2886.693115, 2873.683594, 2835.382080, 2773.850098, 2692.172852, 2594.041016, 2483.341797, 2363.822754, + 2251.398438, 2150.739014, 2048.617188, 1946.466553, 1845.429077, 1746.389160, 1650.014648, 1556.793091, + 1467.777588, 1389.000244, 1313.344360, 1240.890259, 1171.668457, 1105.668823, 1042.851685, 983.335632, + 932.328857, 883.676147, 837.313965, 793.171143, 751.173340, 711.242554, 674.120605, 640.444885, + 2636.754883, 2625.945068, 2594.041016, 2542.543945, 2473.735840, 2390.411621, 2295.779541, 2214.770264, + 2128.389404, 2038.433105, 1946.466553, 1853.812378, 1761.557373, 1670.574341, 1581.545288, 1494.991333, + 1416.746948, 1342.712280, 1271.348755, 1202.776001, 1137.061523, 1074.231323, 1014.277527, 959.981201, + 910.948364, 864.090393, 819.361023, 776.704590, 736.060913, 697.364319, 662.218079, 629.460938, + 2408.457275, 2399.401855, 2372.622803, 2329.235352, 2276.533936, 2220.788330, 2156.401855, 2085.079346, + 2008.477295, 1928.134521, 1845.429077, 1761.557373, 1677.531860, 1594.187866, 1512.199829, 1435.803467, + 1364.407104, 1295.054199, 1227.958252, 1163.268921, 1101.084717, 1041.461182, 984.471924, 935.451172, + 888.446838, 843.439941, 800.399719, 759.286133, 720.051270, 682.831299, 649.558655, 617.765076, + 2220.788330, 2214.770264, 2196.918701, 2167.820068, 2128.389404, 2079.791016, 2023.344727, 1960.440552, + 1892.457642, 1820.706665, 1746.389160, 1670.574341, 1594.187866, 1518.013916, 1445.506348, 1377.724487, + 1311.290039, 1246.504883, 1183.601440, 1122.752808, 1064.079834, 1007.662292, 956.720947, 909.950623, + 865.006897, 821.887329, 780.575439, 741.044800, 703.259888, 668.410767, 636.225525, 605.431763, + 2069.294189, 2064.085693, 2048.617188, 2023.344727, 1988.987183, 1946.466553, 1896.847168, 1841.264404, + 1780.867554, 1716.770630, 1650.014648, 1581.545288, 1512.199829, 1445.506348, 1382.215332, 1319.541870, + 1257.865967, 1197.494507, 1138.670532, 1081.580688, 1026.364746, 974.326904, 
928.192505, 883.676147, + 840.805603, 799.591431, 760.030457, 722.108276, 685.801392, 653.370850, 622.301086, 592.535889, + 1928.134521, 1923.603760, 1910.134766, 1888.087524, 1858.031494, 1820.706665, 1776.973755, 1727.767578, + 1674.046387, 1616.755371, 1556.793091, 1494.991333, 1435.803467, 1377.724487, 1319.541870, 1261.692749, + 1204.545532, 1148.403564, 1093.514893, 1040.073730, 988.231995, 942.804993, 899.092651, 856.815063, + 816.011780, 776.704590, 738.902405, 702.601074, 668.978333, 637.802612, 607.867737, 579.151306, + 1796.604248, 1792.645508, 1780.867554, 1761.557373, 1735.171753, 1702.307983, 1663.669312, 1620.026245, + 1572.181152, 1520.935669, 1467.777588, 1416.746948, 1364.407104, 1311.290039, 1257.865967, 1204.545532, + 1151.679443, 1099.563843, 1048.443970, 998.518372, 953.479187, 910.948364, 869.613342, 829.542664, + 790.784973, 753.372253, 717.322815, 682.831299, 651.732788, 621.794556, 593.005920, 565.351440, + 1674.046265, 1670.574097, 1660.236206, 1643.263794, 1620.026123, 1591.010010, 1556.793091, 1518.013916, + 1475.345703, 1433.395996, 1389.000244, 1342.712280, 1295.054199, 1246.504883, 1197.494507, 1148.403564, + 1099.563843, 1051.258545, 1003.726868, 959.981201, 918.996521, 878.947388, 839.930481, 802.020447, + 765.273621, 729.730225, 695.416138, 663.897949, 634.132019, 605.431763, 577.793396, 551.206848, + 1559.848999, 1556.793091, 1547.688721, 1532.723755, 1512.199829, 1486.515869, 1457.802856, 1426.217773, + 1391.275391, 1353.482666, 1313.344360, 1271.348755, 1227.958252, 1183.601440, 1138.670532, 1093.514893, + 1048.443970, 1003.726868, 962.165222, 923.064758, 884.626892, 846.973267, 810.203491, 774.396851, + 739.615356, 705.905090, 674.120605, 644.708618, 616.264648, 588.796265, 562.305176, 536.786438, + 1455.328247, 1452.861328, 1445.505981, 1433.395996, 1416.746704, 1395.845215, 1371.036987, 1342.712280, + 1311.290039, 1277.203857, 1240.890259, 1202.776001, 1163.268921, 1122.752808, 1081.580688, 1040.073730, + 998.518372, 
959.981201, 923.064758, 886.533447, 850.530762, 815.177795, 780.575439, 746.805908, + 713.934814, 682.245117, 653.370850, 625.353882, 598.214355, 571.965027, 546.611938, 522.155396, + 1364.407104, 1362.209839, 1355.655151, 1344.854126, 1329.985962, 1311.290039, 1289.056152, 1263.613770, + 1235.319580, 1204.545532, 1171.668457, 1137.061523, 1101.084717, 1064.079834, 1026.364746, 988.231995, + 953.479187, 918.996521, 884.626892, 850.530762, 816.846924, 783.694336, 751.173340, 719.367554, + 688.345093, 659.988647, 632.568970, 605.917847, 580.059387, 555.010437, 530.781067, 507.375702, + 1279.166016, 1277.203857, 1271.348633, 1261.692749, 1248.385864, 1231.629150, 1211.667358, 1188.779175, + 1163.268921, 1135.456787, 1105.668823, 1074.231323, 1041.461182, 1007.662292, 974.326904, 942.804993, + 910.948364, 878.947388, 846.973267, 815.177795, 783.694336, 752.638062, 722.108276, 692.188110, + 664.459473, 637.802490, 611.796875, 586.477539, 561.871887, 537.999817, 514.875244, 492.505737, + 1199.250488, 1197.494507, 1192.252319, 1183.601440, 1171.668457, 1156.622437, 1138.670532, 1118.050049, + 1095.021729, 1069.861084, 1042.851685, 1014.277527, 984.471924, 956.720947, 928.192505, 899.092651, + 869.613342, 839.930481, 810.203491, 780.575439, 751.173340, 722.108276, 693.476562, 666.712769, + 640.975464, 615.765564, 591.129272, 567.103455, 543.718079, 520.995728, 498.952637, 480.805573, + 1124.327759, 1122.752808, 1118.050049, 1110.284790, 1099.563843, 1086.030518, 1069.861084, 1051.258545, + 1030.446289, 1007.662292, 983.335632, 959.981201, 935.451172, 909.950623, 883.676147, 856.815063, + 829.542664, 802.020447, 774.396851, 746.805908, 719.367554, 692.188110, 666.712769, 642.038696, + 617.765076, 593.947571, 570.632690, 547.859314, 525.658264, 504.054474, 484.734985, 470.036285, + 1054.085815, 1052.670532, 1048.443970, 1041.461182, 1031.812866, 1019.620911, 1005.035828, 988.231995, + 970.984558, 952.402710, 932.328857, 910.948364, 888.446838, 865.006897, 840.805603, 
816.011780, + 790.784973, 765.273621, 739.615356, 713.934814, 688.345093, 664.459473, 640.975464, 617.765076, + 594.891724, 572.410034, 550.367126, 528.801819, 507.746918, 487.717651, 473.343079, 459.212067, + 988.231934, 986.958130, 983.335632, 977.688965, 969.874817, 959.981201, 948.117004, 934.408508, + 918.996521, 902.032532, 883.676147, 864.090393, 843.439941, 821.887329, 799.591431, 776.704590, + 753.372253, 729.730225, 705.905090, 682.245117, 659.988647, 637.802490, 615.765564, 593.947571, + 572.410034, 551.206848, 530.384338, 509.981781, 490.032288, 475.728912, 461.936005, 448.361359, + 932.328857, 931.291748, 928.192505, 923.064758, 915.964600, 906.967834, 896.168518, 883.676331, + 869.613525, 854.112671, 837.313965, 819.361023, 800.399719, 780.575439, 760.030457, 738.902405, + 717.322815, 695.416138, 674.120605, 653.370850, 632.568970, 611.796875, 591.129272, 570.632690, + 550.367126, 530.384338, 510.730133, 491.443481, 477.170258, 463.766785, 450.541077, 437.510101, + 879.889832, 878.947388, 876.130005, 871.467163, 865.006897, 856.815063, 846.973267, 835.576843, + 822.732300, 808.556152, 793.171143, 776.704590, 759.286133, 741.044800, 722.108276, 702.601074, + 682.831299, 663.897949, 644.708618, 625.353882, 605.917847, 586.477539, 567.103455, 547.859314, + 528.801819, 509.981781, 491.443481, 477.652222, 464.686829, 451.857361, 439.183533, 426.682556, + 830.400330, 829.542664, 826.978088, 822.732300, 816.846924, 809.379150, 800.399719, 789.991882, + 778.249512, 765.273621, 751.173340, 736.060913, 720.051270, 703.259888, 685.801392, 668.978333, + 651.732788, 634.132019, 616.264648, 598.214355, 580.059387, 561.871887, 543.718079, 525.658264, + 507.746918, 490.032288, 477.170258, 464.686829, 452.297577, 440.024200, 427.886261, 415.901093, + 783.694336, 782.912842, 780.575439, 776.704590, 771.336731, 764.521179, 756.319885, 746.805908, + 736.060913, 724.174500, 711.242554, 697.364319, 682.831299, 668.410767, 653.370850, 637.802612, + 621.794556, 605.431763, 
588.796265, 571.965027, 555.010437, 537.999817, 520.995728, 504.054474, + 487.717651, 475.728912, 463.766785, 451.857361, 440.024200, 428.288727, 416.670166, 405.185883, + 739.615356, 738.902405, 736.769653, 733.236694, 728.335144, 722.108276, 714.610474, 705.905090, + 696.064697, 685.184509, 674.120605, 662.218079, 649.558655, 636.225525, 622.301086, 607.867737, + 593.005920, 577.793396, 562.305176, 546.611938, 530.781067, 514.875244, 498.952637, 484.734985, + 473.343079, 461.936005, 450.541077, 439.183533, 427.886261, 416.670166, 405.554260, 394.555481, + 698.015564, 697.364319, 695.416138, 692.188110, 687.707825, 682.245117, 675.848572, 668.410767, + 659.988525, 650.644348, 640.444885, 629.460938, 617.765076, 605.431763, 592.535889, 579.151306, + 565.351440, 551.206848, 536.786438, 522.155396, 507.375702, 492.505737, 480.805573, 470.036285, + 459.212067, 448.361359, 437.510101, 426.682556, 415.901093, 405.185883, 394.555481, 384.026642, + 0.000000, 0.000000, 0.000000, 0.000000, 1554.123779, 1242.539551, 993.424500, 821.738708, + 688.023743, 576.067200, 482.328461, 403.842957, 338.128937, 283.233490, 237.367096, 198.928207, + 166.714081, 139.716614, 117.091141, 100.366226, 93.587563, 87.266724, 81.372780, 75.876930, + 70.752258, 65.973686, 62.470375, 59.202759, 56.106068, 53.171352, 50.390141, 47.754402, + 0.000000, 0.000000, 0.000000, 0.000000, 1511.898926, 1215.312500, 975.197021, 811.432129, + 680.458130, 570.428589, 478.074890, 400.602692, 335.640289, 281.318329, 235.876022, 197.761490, + 165.797119, 138.993164, 116.518372, 100.181839, 93.424210, 87.121628, 81.243637, 75.761719, + 70.649292, 65.881516, 62.405884, 59.143909, 56.052280, 53.122139, 50.345058, 47.713055, + 0.000000, 0.000000, 0.000000, 0.000000, 1398.316895, 1139.939331, 926.469055, 781.859680, + 658.586914, 554.038086, 465.659058, 391.113556, 328.333618, 275.682922, 231.480927, 194.317368, + 163.086807, 136.852478, 114.822044, 99.632172, 92.937019, 86.688774, 80.858185, 75.417809, + 70.341858, 
65.636070, 62.213120, 58.967945, 55.891445, 52.974934, 50.210182, 47.589340, + 0.000000, 0.000000, 0.000000, 0.000000, 1242.539551, 1031.720703, 865.446106, 736.612244, + 624.660889, 528.352295, 446.048340, 376.032318, 316.662598, 266.643341, 224.406723, 188.757797, + 158.701111, 133.381165, 112.066162, 98.727715, 92.134651, 85.975319, 80.222427, 74.850182, + 69.834183, 65.286209, 61.894089, 58.676640, 55.625088, 52.731091, 49.986694, 47.384289, + 1554.123779, 1511.898926, 1398.316895, 1242.539551, 1072.704712, 913.631165, 791.500732, 680.458130, + 581.796936, 495.450836, 420.656097, 356.335327, 301.346527, 254.681503, 214.999893, 181.334152, + 152.824005, 128.714935, 108.351624, 97.485291, 91.031036, 84.992851, 79.346092, 74.067062, + 69.133171, 64.801811, 61.452133, 58.272842, 55.255707, 52.392796, 49.676514, 47.099586, + 1242.539551, 1215.312500, 1139.939331, 1031.720703, 913.631165, 811.432129, 711.604919, 618.224060, + 533.334961, 457.657806, 391.113556, 333.178528, 283.233490, 240.389191, 203.691101, 172.362762, + 145.689255, 123.027794, 103.808617, 95.927284, 89.644768, 83.756927, 78.242203, 73.079414, + 68.248146, 64.188103, 60.891762, 57.760517, 54.786736, 51.963017, 49.282234, 46.737526, + 993.424500, 975.197021, 926.469055, 865.446106, 791.500732, 711.604919, 631.209534, 554.038086, + 482.328461, 417.209503, 359.053955, 307.756866, 263.145569, 224.406723, 190.955078, 162.196686, + 137.561203, 116.518372, 100.551231, 94.080605, 87.998329, 82.286446, 76.926704, 71.900749, + 67.190567, 63.451557, 60.218594, 57.144524, 54.222420, 51.445496, 48.807163, 46.300991, + 821.738708, 811.432129, 781.859680, 736.612244, 680.458130, 618.224060, 554.038086, 491.015137, + 431.260406, 376.032318, 325.949646, 281.318329, 241.920471, 207.367233, 177.270203, 151.198700, + 128.714981, 109.396255, 98.191902, 91.975601, 86.117218, 80.602898, 75.417809, 70.546577, + 65.973694, 62.599686, 59.439167, 56.430584, 53.567768, 50.844631, 48.255127, 45.793381, + 688.023743, 680.458130, 
658.586914, 624.660889, 581.796936, 533.334961, 482.328461, 431.260406, + 381.958527, 335.640289, 293.096069, 254.681427, 220.306442, 189.852081, 163.086807, 139.716660, + 119.419540, 101.869186, 95.586914, 89.644768, 84.028999, 78.729691, 73.735466, 69.033928, + 64.870621, 61.640888, 58.560795, 55.625088, 52.828423, 50.165352, 47.630527, 45.218559, + 576.067200, 570.428589, 554.038086, 528.352295, 495.450836, 457.657806, 417.209503, 376.032318, + 335.640289, 297.179993, 261.421112, 228.614380, 198.928207, 172.362762, 148.803925, 128.066162, + 109.923485, 98.727715, 92.775597, 87.121628, 81.762306, 76.691391, 71.900749, 67.380913, + 63.717697, 60.584194, 57.591423, 54.735031, 52.010479, 49.413094, 46.938114, 44.580780, + 482.328461, 478.074890, 465.659058, 446.048340, 420.656097, 391.113556, 359.053955, 325.949646, + 293.096069, 261.421112, 231.480927, 203.691101, 178.274475, 155.306305, 134.755493, 116.518372, + 101.297218, 95.417488, 89.797020, 84.439827, 79.346092, 74.512932, 69.935249, 65.636070, + 62.470375, 59.439159, 56.539452, 53.767811, 51.120529, 48.593651, 46.183067, 43.884586, + 403.842957, 400.602692, 391.113556, 376.032318, 356.335327, 333.178528, 307.756866, 281.318329, + 254.681427, 228.614380, 203.691101, 180.306320, 158.701111, 138.993195, 121.205971, 105.294853, + 97.485306, 91.975601, 86.688774, 81.632111, 76.808891, 72.219170, 67.860542, 64.188103, + 61.139740, 58.215542, 55.413525, 52.731091, 50.165352, 47.713055, 45.370728, 43.134796, + 338.128937, 335.640289, 328.333618, 316.662598, 301.346527, 283.233490, 263.145569, 241.920471, + 220.306442, 198.928207, 178.274475, 158.701111, 140.445145, 123.643127, 108.351624, 98.907478, + 93.587563, 88.442261, 83.486351, 78.729691, 74.178123, 69.834183, 65.706459, 62.664513, + 59.736908, 56.923252, 54.222420, 51.632717, 49.151936, 46.777527, 44.506676, 42.336353, + 283.233490, 281.318329, 275.682922, 266.643341, 254.681503, 240.389191, 224.406723, 207.367233, + 189.852081, 172.362762, 155.306305, 
138.993195, 123.643127, 109.396255, 99.632195, 94.578140, + 89.644768, 84.854034, 80.222427, 75.761719, 71.479691, 67.380913, 63.985767, 61.077591, + 58.272842, 55.572067, 52.974934, 50.480511, 48.087288, 45.793381, 43.596565, 41.494331, + 237.367096, 235.876022, 231.480927, 224.406723, 214.999893, 203.691101, 190.955078, 177.270203, + 163.086807, 148.803925, 134.755493, 121.205971, 108.351624, 99.632195, 94.912354, 90.256424, + 85.692696, 81.243637, 76.926704, 72.754906, 68.737579, 65.077835, 62.213120, 59.439159, + 56.758232, 54.171604, 51.679691, 49.282234, 46.978401, 44.766880, 42.646019, 40.613834, + 198.928207, 197.761490, 194.317368, 188.757797, 181.334152, 172.362762, 162.196686, 151.198700, + 139.716660, 128.066162, 116.518372, 105.294853, 98.907478, 94.578140, 90.256424, 85.975319, + 81.762306, 77.639725, 73.625481, 69.733337, 65.973694, 63.121567, 60.400925, 57.760517, + 55.203278, 52.731091, 50.345058, 48.045479, 45.832119, 43.704224, 41.660648, 39.699886, + 166.714081, 165.797119, 163.086807, 158.701111, 152.824005, 145.689255, 137.561203, 128.714981, + 119.419540, 109.923485, 101.297218, 97.485306, 93.587563, 89.644768, 85.692696, 81.762306, + 77.879822, 74.067062, 70.341858, 66.718384, 63.784531, 61.139740, 58.560787, 56.052280, + 53.617672, 51.259396, 48.979061, 46.777527, 44.655071, 42.611439, 40.645927, 38.757504, + 139.716614, 138.993164, 136.852478, 133.381165, 128.714935, 123.027794, 116.518372, 109.396255, + 101.869186, 98.727715, 95.417488, 91.975601, 88.442261, 84.854034, 81.243637, 77.639725, + 74.067062, 70.546577, 67.095711, 64.188103, 61.640888, 59.143909, 56.703403, 54.324310, + 52.010479, 49.764812, 47.589340, 45.485382, 43.453663, 41.494331, 39.607159, 37.791485, + 117.091141, 116.518372, 114.822044, 112.066162, 108.351624, 103.808617, 100.551231, 98.191902, + 95.586914, 92.775597, 89.797020, 86.688774, 83.486351, 80.222427, 76.926704, 73.625481, + 70.341858, 67.095711, 64.323616, 61.894089, 59.498505, 57.144524, 54.838520, 52.585674, 
+ 50.390141, 48.255127, 46.183067, 44.175625, 42.233917, 40.358486, 38.549461, 36.806534, + 100.366226, 100.181839, 99.632172, 98.727715, 97.485291, 95.927284, 94.080605, 91.975601, + 89.644768, 87.121628, 84.439827, 81.632111, 78.729691, 75.761719, 72.754906, 69.733337, + 66.718384, 64.188103, 61.894089, 59.617504, 57.367245, 55.150913, 52.974934, 50.844631, + 48.764336, 46.737526, 44.766880, 42.854427, 41.001553, 39.209156, 37.477650, 35.807087, + 93.587563, 93.424210, 92.937019, 92.134651, 91.031036, 89.644768, 87.998329, 86.117218, + 84.028999, 81.762306, 79.346092, 76.808891, 74.178123, 71.479691, 68.737579, 65.973694, + 63.784531, 61.640888, 59.498505, 57.367245, 55.255707, 53.171352, 51.120529, 49.108620, + 47.140091, 45.218567, 43.346962, 41.527519, 39.761902, 38.051262, 36.396320, 34.797382, + 87.266724, 87.121628, 86.688774, 85.975319, 84.992851, 83.756927, 82.286446, 80.602898, + 78.729691, 76.691391, 74.512932, 72.219170, 69.834183, 67.380913, 65.077835, 63.121567, + 61.139740, 59.143909, 57.144524, 55.150913, 53.171352, 51.213028, 49.282234, 47.384289, + 45.523705, 43.704216, 41.928902, 40.200157, 38.519875, 36.889412, 35.309746, 33.781395, + 81.372780, 81.243637, 80.858185, 80.222427, 79.346092, 78.242203, 76.926704, 75.417809, + 73.735466, 71.900749, 69.935249, 67.860542, 65.706459, 63.985767, 62.213120, 60.400925, + 58.560787, 56.703403, 54.838520, 52.974934, 51.120529, 49.282234, 47.466137, 45.677494, + 43.920807, 42.199848, 40.517788, 38.877151, 37.279995, 35.727867, 34.221893, 32.932083, + 75.876930, 75.761719, 75.417809, 74.850182, 74.067062, 73.079414, 71.900749, 70.546577, + 69.033928, 67.380913, 65.636070, 64.188103, 62.664513, 61.077591, 59.439159, 57.760517, + 56.052280, 54.324310, 52.585674, 50.844631, 49.108620, 47.384289, 45.677494, 43.993374, + 42.336353, 40.710224, 39.118168, 37.562847, 36.046383, 34.570469, 33.224525, 32.131664, + 70.752258, 70.649292, 70.341858, 69.834183, 69.133171, 68.248146, 67.190567, 65.973694, + 64.870621, 
63.717697, 62.470375, 61.139740, 59.736908, 58.272842, 56.758232, 55.203278, + 53.617672, 52.010479, 50.390141, 48.764336, 47.140091, 45.523705, 43.920807, 42.336353, + 40.774689, 39.239544, 37.734135, 36.261116, 34.822742, 33.446636, 32.377274, 31.328754, + 65.973686, 65.881516, 65.636070, 65.286209, 64.801811, 64.188103, 63.451557, 62.599686, + 61.640888, 60.584194, 59.439159, 58.215542, 56.923252, 55.572067, 54.171604, 52.731091, + 51.259396, 49.764812, 48.255127, 46.737526, 45.218567, 43.704216, 42.199848, 40.710224, + 39.239544, 37.791485, 36.369217, 34.975430, 33.612385, 32.554573, 31.530655, 30.525511, + 62.470375, 62.405884, 62.213120, 61.894089, 61.452133, 60.891762, 60.218594, 59.439167, + 58.560795, 57.591423, 56.539452, 55.413525, 54.222420, 52.974934, 51.679691, 50.345058, + 48.979061, 47.589340, 46.183067, 44.766880, 43.346962, 41.928902, 40.517788, 39.118168, + 37.734135, 36.369217, 35.026558, 33.708813, 32.661720, 31.666414, 30.686733, 29.723904, + 59.202759, 59.143909, 58.967945, 58.676640, 58.272842, 57.760517, 57.144524, 56.430584, + 55.625088, 54.735031, 53.767811, 52.731091, 51.632717, 50.480511, 49.282234, 48.045479, + 46.777527, 45.485382, 44.175625, 42.854427, 41.527519, 40.200157, 38.877151, 37.562847, + 36.261116, 34.975430, 33.708813, 32.697552, 31.734655, 30.784130, 29.847412, 28.925755, + 56.106068, 56.052280, 55.891445, 55.625088, 55.255707, 54.786736, 54.222420, 53.567768, + 52.828423, 52.010479, 51.120529, 50.165352, 49.151936, 48.087288, 46.978401, 45.832119, + 44.655071, 43.453663, 42.233917, 41.001553, 39.761902, 38.519875, 37.279995, 36.046383, + 34.822742, 33.612385, 32.661720, 31.734655, 30.816704, 29.909475, 29.014397, 28.132734, + 53.171352, 53.122139, 52.974934, 52.731091, 52.392796, 51.963017, 51.445496, 50.844631, + 50.165352, 49.413094, 48.593651, 47.713055, 46.777527, 45.793381, 44.766880, 43.704224, + 42.611439, 41.494331, 40.358486, 39.209156, 38.051262, 36.889412, 35.727867, 34.570469, + 33.446636, 32.554573, 
31.666414, 30.784130, 29.909475, 29.044043, 28.189245, 27.346340, + 50.390141, 50.345058, 50.210182, 49.986694, 49.676514, 49.282234, 48.807163, 48.255127, + 47.630527, 46.938114, 46.183067, 45.370728, 44.506676, 43.596565, 42.646019, 41.660648, + 40.645927, 39.607159, 38.549461, 37.477650, 36.396320, 35.309746, 34.221893, 33.224525, + 32.377274, 31.530655, 30.686733, 29.847412, 29.014397, 28.189245, 27.373348, 26.567940, + 47.754402, 47.713055, 47.589340, 47.384289, 47.099586, 46.737526, 46.300991, 45.793381, + 45.218559, 44.580780, 43.884586, 43.134796, 42.336353, 41.494331, 40.613834, 39.699886, + 38.757504, 37.791485, 36.806534, 35.807087, 34.797382, 33.781395, 32.932083, 32.131664, + 31.328754, 30.525511, 29.723904, 28.925755, 28.132734, 27.346340, 26.567940, 25.798756}; + +class lossy_acc : public VPP_ACC { + // port bindings: each kernel argument has one ZERO_COPY(...) declaration and one SYS_PORT(..., HBM[n]) mapping (macros presumably come from the Vitis system-compiler VPP_ACC header -- confirm) + ZERO_COPY(config); // mm14, input + ZERO_COPY(config_fl); // mm15, input + ZERO_COPY(hls_opsin_1); // mm0, input + ZERO_COPY(hls_opsin_2); // mm1, input + ZERO_COPY(hls_opsin_3); // mm2, input + ZERO_COPY(quant_field_row); // mm3, input + ZERO_COPY(masking_field_row); // mm4, input + ZERO_COPY(aq_map_f); // mm5, input + ZERO_COPY(cmap_axi); // mm6, output + ZERO_COPY(ac_coef_axiout); // mm7, output + ZERO_COPY(strategy_all); // mm8, output + ZERO_COPY(raw_quant_field_i); // mm9, output + ZERO_COPY(hls_order); // mm10, output + ZERO_COPY(hls_dc8x8); // mm11, output + ZERO_COPY(hls_dc16x16); // mm12, output + ZERO_COPY(hls_dc32x32); // mm13, output + + SYS_PORT(config, HBM[14]); // HBM-14 + SYS_PORT(config_fl, HBM[15]); // HBM-15 + SYS_PORT(hls_opsin_1, HBM[0]); // HBM-0 + SYS_PORT(hls_opsin_2, HBM[1]); // HBM-1 + SYS_PORT(hls_opsin_3, HBM[2]); // HBM-2 + SYS_PORT(quant_field_row, HBM[3]); // HBM-3 + SYS_PORT(masking_field_row, HBM[4]); // HBM-4 + SYS_PORT(aq_map_f, HBM[5]); // HBM-5 + SYS_PORT(cmap_axi, HBM[6]); // HBM-6 + SYS_PORT(ac_coef_axiout, HBM[7]); // HBM-7 + SYS_PORT(strategy_all, HBM[8]); // HBM-8 + SYS_PORT(raw_quant_field_i,
HBM[9]); // HBM-9 + SYS_PORT(hls_order, HBM[10]); // HBM-10 + SYS_PORT(hls_dc8x8, HBM[11]); // HBM-11 + SYS_PORT(hls_dc16x16, HBM[12]); // HBM-12 + SYS_PORT(hls_dc32x32, HBM[13]); // HBM-13 + + public: + // compute() is only declared here; it takes the same sixteen buffers, in the same order, as lossyEncComp() + static void compute(int* config, + float* config_fl, + float* hls_opsin_1, + float* hls_opsin_2, + float* hls_opsin_3, + float* quant_field_row, + float* masking_field_row, + float* aq_map_f, + int8_t* cmap_axi, + int* ac_coef_axiout, + unsigned char* strategy_all, + int* raw_quant_field_i, + uint32_t* hls_order, + float* hls_dc8x8, + float* hls_dc16x16, + float* hls_dc32x32); + + // ------------------------------------------------------------ + /** + * @brief Level 2 : kernel implementation for JXL lossy frame encode computation + * + * @param config the int config signals, such as image size, field stride, etc. + * @param config_fl the floating-point config signals, such as cost, inv_global_scale, etc. + * @param hls_opsin_1 the input RGB image data for channel-1. + * @param hls_opsin_2 the input RGB image data for channel-2. + * @param hls_opsin_3 the input RGB image data for channel-3. + * @param quant_field_row the initial quant_field data. + * @param masking_field_row the initial masking_field data. + * @param aq_map_f the initial adjust quant map data. + * @param cmap_axi the output of color correlation map. + * @param ac_coef_axiout the output of quantized AC coefficients.
+ * @param strategy_all the output of strategy for each block in image + * @param raw_quant_field_i the output of computed raw_quant_field + * @param hls_order the output of orders for each block in image + * @param hls_dc8x8 the DC coefficients output for 8x8 blocks + * @param hls_dc16x16 the DC coefficients output for 16x16 blocks + * @param hls_dc32x32 the DC coefficients output for 32x32 blocks + */ + // ------------------------------------------------------------ + + static void lossyEncComp(int config[MAX_NUM_CONFIG], + float config_fl[MAX_NUM_CONFIG], + float* hls_opsin_1, + float* hls_opsin_2, + float* hls_opsin_3, + float* quant_field_row, + float* masking_field_row, + float* aq_map_f, + int8_t* cmap_axi, + int* ac_coef_axiout, + unsigned char* strategy_all, + int* raw_quant_field_i, + uint32_t* hls_order, + float* hls_dc8x8, + float* hls_dc16x16, + float* hls_dc32x32); +}; + +#endif diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/postSysLink.tcl b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/postSysLink.tcl new file mode 100644 index 0000000000..2dc2f67034 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/postSysLink.tcl @@ -0,0 +1 @@ +set_property -dict [list CONFIG.ECC_EN {false} CONFIG.ECC_SCRUB_EN {false}] [get_bd_cells hmss_0] diff --git a/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/utils.mk b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/utils.mk new file mode 100644 index 0000000000..1937b53d2b --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_lossy_enc_compute_sc/utils.mk @@ -0,0 +1,239 @@ +# +# Copyright 2019-2021 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# sc makefile-generator v1.0.0 +# +#+------------------------------------------------------------------------------- +# The following parameters are assigned with default values. These parameters can +# be overridden through the make command line +#+------------------------------------------------------------------------------- + +REPORT := no +PROFILE := no +DEBUG := no + +#'estimate' for estimate report generation +#'system' for system report generation +ifneq ($(REPORT), no) +VPP_LDFLAGS += --report estimate +VPP_LDFLAGS += --report system +endif + +#Generates profile summary report +ifeq ($(PROFILE), yes) +VPP_LDFLAGS += --profile_kernel data:all:all:all +endif + +#Generates debug summary report +ifeq ($(DEBUG), yes) +VPP_LDFLAGS += --dk protocol:all:all:all +endif + +#Check environment setup +ifndef XILINX_VITIS + XILINX_VITIS = /opt/xilinx/Vitis/$(TOOL_VERSION) + export XILINX_VITIS +endif +ifndef XILINX_XRT + XILINX_XRT = /opt/xilinx/xrt + export XILINX_XRT +endif + +# check_device: warn when PLATFORM_NAME matches no PLATFORM_ALLOWLIST entry and +# fail when it matches a PLATFORM_BLOCKLIST entry (entries are grep patterns). +# The recipe is written in POSIX sh: make runs recipes with /bin/sh unless SHELL +# is overridden, and the bash-only [[ ]] / == tests fail there, which would let +# a blocklisted platform pass silently. +.PHONY: check_device +check_device: + @set -eu; \ + inallowlist=False; \ + inblocklist=False; \ + for dev in $(PLATFORM_ALLOWLIST); \ + do if echo "$(PLATFORM_NAME)" | grep -q -e "$$dev"; \ + then inallowlist=True; fi; \ + done ;\ + for dev in $(PLATFORM_BLOCKLIST); \ + do if echo "$(PLATFORM_NAME)" | grep -q -e "$$dev"; \ + then inblocklist=True; fi; \ + done ;\ + if [ "$$inallowlist" = False ]; \ + then echo "[Warning]: The device $(PLATFORM_NAME) not in allowlist."; \ + fi; \ + if [ "$$inblocklist" = True ]; \ + then echo "[ERROR]: The device $(PLATFORM_NAME) in blocklist."; exit 1;\ + fi; +
+#get HOST_ARCH by PLATFORM: read the 'CPU Type' field from platforminfo (ai_engine entries dropped) and map it to x86 / aarch32 / aarch64 +# HOST_ARCH_temp is simply expanded (:=) so platforminfo is run once instead of once per comparison below +ifneq (,$(PLATFORM)) +HOST_ARCH_temp := $(shell platforminfo -p $(PLATFORM) | grep 'CPU Type' | sed 's/.*://' | sed '/ai_engine/d' | sed 's/^[[:space:]]*//') +ifeq ($(HOST_ARCH_temp), x86) +HOST_ARCH := x86 +else ifeq ($(HOST_ARCH_temp), cortex-a9) +HOST_ARCH := aarch32 +else ifneq (,$(findstring cortex-a, $(HOST_ARCH_temp))) +HOST_ARCH := aarch64 +endif +endif + +#Checks for Device Family +ifeq ($(HOST_ARCH), aarch32) + DEV_FAM = 7Series +else ifeq ($(HOST_ARCH), aarch64) + DEV_FAM = Ultrascale +endif + +#Checks for Correct architecture +ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) +$(error HOST_ARCH variable not set, please set correctly and rerun) +endif + +# check_version: print the latest commit of XFLIB_DIR when git is on PATH and XFLIB_DIR contains a .git entry +.PHONY: check_version +check_version: +ifneq (, $(shell which git)) +ifneq (,$(wildcard $(XFLIB_DIR)/.git)) + @cd $(XFLIB_DIR) && git log --graph --pretty=format:'%Cred%h%Creset -%C(yellow)%d%Creset %s %Cgreen(%cr) %C(bold blue)<%an>%Creset' --abbrev-commit -n 1 && cd - +endif +endif + +#Checks for SYSROOT: non-x86 hosts must provide SYSROOT; the error fires only when this target's recipe is expanded +.PHONY: check_sysroot +check_sysroot: +ifneq ($(HOST_ARCH), x86) +ifndef SYSROOT + $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif + +#Checks for g++: compare the major version of the g++ found on PATH with the major version of GCC_INTOOL +CXX := g++ +CXX_REQ := $(shell echo $(GCC_INTOOL) | cut -f 1 -d ".") +ifeq ($(HOST_ARCH), x86) +ifneq ($(shell expr $(shell echo "__GNUG__" | g++ -E -x c++ - | tail -1) \>= $(CXX_REQ)), 1) +ifndef XILINX_VIVADO +$(error [ERROR]: g++ version too old. Please use $(CXX_REQ) or above) +else +CXX := $(XILINX_VIVADO)/tps/lnx64/gcc-$(GCC_INTOOL)/bin/g++ +ifeq ($(LD_LIBRARY_PATH),) +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(GCC_INTOOL)/lib64 +else +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(GCC_INTOOL)/lib64:$(LD_LIBRARY_PATH) +endif +$(warning [WARNING]: g++ version too old.
Using g++ provided by the tool: $(CXX)) +endif +endif +else ifeq ($(HOST_ARCH), aarch64) +CXX := $(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++ +else ifeq ($(HOST_ARCH), aarch32) +CXX := $(XILINX_VITIS)/gnu/aarch32/lin/gcc-arm-linux-gnueabi/bin/arm-linux-gnueabihf-g++ +endif + +#check binutils: when the system ld is older than BINUTILS_INTOOL, put the tool-provided binutils first on PATH +BINUTILS := $(shell ld -v | cut -f 4 -d " " | cut -f 1 -d "-") +BINUTILS_REQ := $(BINUTILS_INTOOL) +ifneq ($(shell expr $(BINUTILS) \>= $(BINUTILS_REQ)), 1) +export PATH := $(XILINX_VIVADO)/tps/lnx64/binutils-$(BINUTILS_INTOOL)/bin:$(PATH) +endif + +#Setting VPP +VPP := v++ + +#Checks for aiecompiler +AIECXX := aiecompiler +AIESIMULATOR := aiesimulator +X86SIMULATOR := x86simulator + +# check_vivado / check_vpp / check_xrt: fail with a hint when the expected file is missing from the corresponding install directory +.PHONY: check_vivado +check_vivado: +ifeq (,$(wildcard $(XILINX_VIVADO)/bin/vivado)) + @echo "Cannot locate Vivado installation. Please set XILINX_VIVADO variable." && false +endif + +.PHONY: check_vpp +check_vpp: +ifeq (,$(wildcard $(XILINX_VITIS)/bin/v++)) + @echo "Cannot locate Vitis installation. Please set XILINX_VITIS variable." && false +endif + +.PHONY: check_xrt +check_xrt: +ifeq (,$(wildcard $(XILINX_XRT)/lib/libxilinxopencl.so)) + @echo "Cannot locate XRT installation. Please set XILINX_XRT variable." && false +endif + +export PATH := $(XILINX_VITIS)/bin:$(XILINX_XRT)/bin:$(PATH) +# LD_LIBRARY_PATH is exported explicitly: a variable that does not come from the +# environment is not passed to recipe commands unless exported, so without this the +# XRT lib directory would be dropped exactly when LD_LIBRARY_PATH was previously unset. +ifeq ($(HOST_ARCH), x86) +ifeq (,$(LD_LIBRARY_PATH)) +export LD_LIBRARY_PATH := $(XILINX_XRT)/lib +else +export LD_LIBRARY_PATH := $(XILINX_XRT)/lib:$(LD_LIBRARY_PATH) +endif +endif + +ifneq (,$(wildcard $(PLATFORM))) +# Use PLATFORM as a file path +XPLATFORM := $(PLATFORM) +else +# Use PLATFORM as a file name pattern +# 1.
search paths specified by variable +ifneq (,$(PLATFORM_REPO_PATHS)) +# 1.1 as exact name: <repo path>/$(PLATFORM)/$(PLATFORM).xpfm for each colon-separated entry of PLATFORM_REPO_PATHS +XPLATFORM := $(strip $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/$(PLATFORM)/$(PLATFORM).xpfm))) +# 1.2 as a pattern: keep every .xpfm path found under the repo paths that awk matches against the regex /$(PLATFORM)/ +# NOTE(review): several paths can match, leaving more than one word in XPLATFORM -- confirm the users of XPLATFORM tolerate that +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/*/*.xpfm)) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 1.2 +endif # 1 +# 2. search Vitis installation (only when step 1 found nothing) +ifeq (,$(XPLATFORM)) +# 2.1 as exact name +XPLATFORM := $(strip $(wildcard $(XILINX_VITIS)/platforms/$(PLATFORM)/$(PLATFORM).xpfm)) +# 2.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(wildcard $(XILINX_VITIS)/platforms/*/*.xpfm) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 2.2 +endif # 2 +# 3. search default locations (only when steps 1 and 2 found nothing) +ifeq (,$(XPLATFORM)) +# 3.1 as exact name +XPLATFORM := $(strip $(wildcard /opt/xilinx/platforms/$(PLATFORM)/$(PLATFORM).xpfm)) +# 3.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(wildcard /opt/xilinx/platforms/*/*.xpfm) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 3.2 +endif # 3 +endif + +# MSG_PLATFORM: multi-line message echoed by check_platform when XPLATFORM is empty; XPLATFORMS holds the list from the last pattern search that ran +define MSG_PLATFORM +No platform matched pattern '$(PLATFORM)'. +Available platforms are: $(XPLATFORMS) +To add more platform directories, set the PLATFORM_REPO_PATHS variable or point PLATFORM variable to the full path of platform .xpfm file.
+endef +export MSG_PLATFORM + + +# check_platform: fail with MSG_PLATFORM (read from the exported environment variable) when no platform file was resolved +.PHONY: check_platform +check_platform: +ifeq (,$(XPLATFORM)) + @echo "$${MSG_PLATFORM}" && false +endif +#Check ends + +# PLATFORM_NAME - filesystem friendly platform name: the last path component of +# $(PLATFORM) with a trailing .xpfm removed. Built from make functions only, so no +# shell is forked per expansion and an empty PLATFORM yields an empty name without +# a "basename: missing operand" message. Kept recursive (=) so it follows PLATFORM. +PLATFORM_NAME = $(strip $(patsubst %.xpfm,%,$(notdir $(patsubst %/,%,$(PLATFORM))))) + + +# Cleaning stuff +RM = rm -f +RMDIR = rm -rf + +MV = mv -f +CP = cp -rf +ECHO:= @echo diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/Makefile b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/Makefile new file mode 100644 index 0000000000..3dfa176276 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/Makefile @@ -0,0 +1,331 @@ +# Copyright 2019-2022 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# vitis makefile-generator v2.0.6 + +############################## Help Section ############################## +.PHONY: help + +help:: + $(ECHO) "Makefile Usage:" + $(ECHO) " make all TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to generate the design for specified Target and Shell." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " make run TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to run application in emulation." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH required for SoC shells" + $(ECHO) "" + $(ECHO) " make xclbin TARGET= PLATFORM= HOST_ARCH=" + $(ECHO) " Command to build xclbin application." + $(ECHO) " By default, HOST_ARCH=x86.
HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " make host TARGET=" + $(ECHO) " Command to build host application." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH is required for SoC shells" + $(ECHO) "" + $(ECHO) " NOTE: For embedded devices, e.g. zcu102/zcu104/vck190, env variable SYSROOT and EDGE_COMMON_SW need to be set first, and HOST_ARCH is either aarch32 or aarch64. For example," + $(ECHO) " export SYSROOT=< path-to-platform-sysroot >" + $(ECHO) " export EDGE_COMMON_SW=< path-to-rootfs-and-Image-files >" + $(ECHO) "" + $(ECHO) " make clean " + $(ECHO) " Command to remove the generated non-hardware files." + $(ECHO) "" + $(ECHO) " make cleanall" + $(ECHO) " Command to remove all the generated files." + $(ECHO) "" + +############################## Setting up Project Variables ############################## + +# MK_PATH is the absolute path of this Makefile; CUR_DIR is its directory without the trailing slash +MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +# XF_PROJ_ROOT defaults to MK_PATH with its shortest trailing /L2/* part removed (bash suffix removal) +XF_PROJ_ROOT ?= $(shell bash -c 'export MK_PATH=$(MK_PATH); echo $${MK_PATH%/L2/*}') +CUR_DIR := $(patsubst %/,%,$(dir $(MK_PATH))) +XFLIB_DIR = $(XF_PROJ_ROOT) + +# setting default values (?= keeps a value already given on the command line or in the environment) +TARGET ?= sw_emu +HOST_ARCH ?= x86 + +#setting PLATFORM: fall back to DEVICE, then to the default U50 platform +ifeq ($(PLATFORM),) +PLATFORM := $(DEVICE) +endif +ifeq ($(PLATFORM),) +PLATFORM := xilinx_u50_gen3x16_xdma_5_202210_1 +endif + +# #################### Checking if PLATFORM in allowlist ############################ +PLATFORM_ALLOWLIST += u50 +PLATFORM_BLOCKLIST += zc + +# NOTE(review): utils.mk is included through a ./ relative path, so make is presumably expected to run from this directory (or with -C) -- confirm +include ./utils.mk +# Output locations are derived from TARGET and the PLATFORM_NAME provided by utils.mk; RESULT_DIR, when set, replaces BUILD_DIR +TEMP_DIR := _x_temp.$(TARGET).$(PLATFORM_NAME) +TEMP_REPORT_DIR := $(CUR_DIR)/reports/_x.$(TARGET).$(PLATFORM_NAME) +BUILD_DIR := build_dir.$(TARGET).$(PLATFORM_NAME) +ifneq ($(RESULT_DIR),) +BUILD_DIR = $(RESULT_DIR) +endif +BUILD_REPORT_DIR := $(CUR_DIR)/reports/_build.$(TARGET).$(PLATFORM_NAME) +EMCONFIG := $(BUILD_DIR)/emconfig.json +XCLBIN_DIR := $(CUR_DIR)/$(BUILD_DIR) +export XCL_BINDIR = $(XCLBIN_DIR) + +EXE_FILE_DEPS := +BINARY_CONTAINERS_DEPS := +RUN_DEPS := + +# get global setting: compiler, linker and v++ flags selected by HOST_ARCH +ifeq ($(HOST_ARCH), x86) +CXXFLAGS += -fmessage-length=0
-I$(CUR_DIR)/src/ -I$(XILINX_XRT)/include -I$(XILINX_HLS)/include -std=c++14 -O3 -Wall -Wno-unknown-pragmas -Wno-unused-label +LDFLAGS += -pthread -L$(XILINX_XRT)/lib -L$(XILINX_HLS)/lnx64/tools/fpo_v7_0 -Wl,--as-needed -lOpenCL -lxrt_coreutil -lgmp -lmpfr -lIp_floating_point_v7_0_bitacc_cmodel +VPP_FLAGS += -t $(TARGET) --platform $(XPLATFORM) --save-temps +VPP_LDFLAGS += --optimize 2 -R 2 +else ifeq ($(HOST_ARCH), aarch64) +CXXFLAGS += -I$(CUR_DIR)/src/ -fmessage-length=0 --sysroot=$(SYSROOT) -I$(SYSROOT)/usr/include/xrt -I$(XILINX_HLS)/include -std=c++14 -O3 -Wall -Wno-unknown-pragmas -Wno-unused-label +LDFLAGS += -pthread -L$(SYSROOT)/usr/lib -L$(XILINX_VITIS_AIETOOLS)/lib/aarch64.o -Wl,--as-needed -lxilinxopencl -lxrt_coreutil +VPP_FLAGS += -t $(TARGET) --platform $(XPLATFORM) --save-temps +VPP_LDFLAGS += --optimize 2 -R 2 +endif +CXXFLAGS += $(EXTRA_CXXFLAGS) +VPP_FLAGS += $(EXTRA_VPP_FLAGS) + +########################## Setting up Host Variables ########################## +# Define SW_EMU_TEST or HW_EMU_TEST for the host build when TARGET is sw_emu or hw_emu +ifeq ($(TARGET),sw_emu) +CXXFLAGS += -D SW_EMU_TEST +endif +ifeq ($(TARGET),hw_emu) +CXXFLAGS += -D HW_EMU_TEST +endif + +# Append XRT_CXXFLAGS only when CXXFLAGS does not already contain the string opencv +ifeq (,$(findstring opencv,$(CXXFLAGS))) +CXXFLAGS += $(XRT_CXXFLAGS) +endif + +#Include Required Host Source Files +HOST_SRCS += $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cjxl.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cjxl_main.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cmdline.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/codec_config.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/speed_stats.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cpu/cpu.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/cpu/os_specific.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/tools/box/box.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/time.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/memory.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references_hq.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/brotli_bit_stream.c 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/block_splitter.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/metablock.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment_two_pass.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encoder_dict.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/utf8_util.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/decode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/static_dict.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/literal_cost.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/entropy_encode.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/bit_cost.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/cluster.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/dictionary_hash.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/histogram.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/bit_reader.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/huffman.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/state.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/transform.c $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmslut.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsnamed.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspack.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscnvrt.cpp 
$(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio1.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgmt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsopt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsalpha.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmstypes.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsintrp.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgamma.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscam02.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscgats.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmshalf.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsmtrx.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsps2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssamp.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssm.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsxform.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio0.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsplugin.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmserr.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspcs.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmswtpnt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsvirt.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lodepng/lodepng.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/aligned_allocator.cc $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/targets.cc $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_cluster_histogram.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_ac_strategy.cpp 
$(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_adaptive_quantization.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_cache.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_chroma_from_luma.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_cluster.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/acc_enc_group.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_host.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase1.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase3.cpp $(XFLIB_DIR)/ext/xcl2/xcl2.cpp $(XFLIB_DIR)/L2/demos/jxlEnc/acc_tokInit_histogram/host/host_tokinit_histogram.cpp +CXXFLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/ext/xcl2 -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/ -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/lib/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/build/lib/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lcms/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/highway -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/brotli/c/include -I $(XFLIB_DIR)/L2/demos/jxlEnc/third_partys/third_party/lodepng -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_tokInit_histogram/kernel -I $(XFLIB_DIR)/L2/demos/jxlEnc/acc_tokInit_histogram/host -I $(XFLIB_DIR)/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram -I $(XFLIB_DIR)/L2/demos/jxlEnc/others/include +CXXFLAGS += -O3 + +EXE_NAME := host.exe +EXE_FILE := $(BUILD_DIR)/$(EXE_NAME) +EXE_FILE_DEPS := $(HOST_SRCS) $(EXE_FILE_DEPS) + +HOST_ARGS := --xclbin $(BUILD_DIR)/jxlEnc.xclbin $(XFLIB_DIR)/L2/demos/jxlEnc/images/t0.png t0.jxl +ifneq ($(HOST_ARCH), x86) +PKG_HOST_ARGS = $(foreach args,$(HOST_ARGS),$(subst $(dir $(patsubst %/,%,$(args))),,$(args))) +endif + +########################## Kernel compiler global settings 
########################## +ifneq (,$(shell echo $(XPLATFORM) | awk '/u50/')) +VPP_FLAGS += --config $(CUR_DIR)/conn_u50.cfg +VPP_FLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/L2/include/hw/jxlEnc + +else +VPP_FLAGS += -I $(XFLIB_DIR)/../utils/L1/include/ -I $(XFLIB_DIR)/L2/include/hw/jxlEnc + +endif + +######################### binary container global settings ########################## +VPP_FLAGS_JxlEnc_ans_initHistogram += -D KERNEL_NAME=JxlEnc_ans_initHistogram +VPP_FLAGS_JxlEnc_ans_initHistogram += --hls.clock 300000000:JxlEnc_ans_initHistogram +ifneq ($(HOST_ARCH), x86) +VPP_LDFLAGS_jxlEnc += --clock.defaultFreqHz 300000000 +else +VPP_LDFLAGS_jxlEnc += --kernel_frequency 300 +endif + +ifeq ($(HOST_ARCH), x86) +BINARY_CONTAINERS += $(BUILD_DIR)/jxlEnc.xclbin +else +BINARY_CONTAINERS += $(BUILD_DIR)/jxlEnc_pkg.$(LINK_TARGET_FMT) +BINARY_CONTAINERS_PKG += $(BUILD_DIR)/jxlEnc.xclbin +endif + +# ################ Setting Rules for Binary Containers (Building Kernels) ################ +$(TEMP_DIR)/JxlEnc_ans_initHistogram.xo: $(XFLIB_DIR)/L2/demos/jxlEnc/acc_tokInit_histogram/kernel/hls_init_histogram.cpp + $(ECHO) "Compiling Kernel: JxlEnc_ans_initHistogram" + mkdir -p $(TEMP_DIR) + $(VPP) -c $(VPP_FLAGS_JxlEnc_ans_initHistogram) $(VPP_FLAGS) -k JxlEnc_ans_initHistogram -I'$(> $(RUN_SCRIPT) +ifneq ($(filter sw_emu hw_emu, $(TARGET)),) + @echo 'export XCL_EMULATION_MODE=$(TARGET)' >> $(RUN_SCRIPT) +endif + @echo 'export XILINX_VITIS=/mnt' >> $(RUN_SCRIPT) + @echo 'export XILINX_XRT=/usr' >> $(RUN_SCRIPT) + @echo 'if [ -f platform_desc.txt ]; then' >> $(RUN_SCRIPT) + @echo ' cp platform_desc.txt /etc/xocl.txt' >> $(RUN_SCRIPT) + @echo 'fi' >> $(RUN_SCRIPT) + @echo './$(EXE_NAME) $(PKG_HOST_ARGS)' >> $(RUN_SCRIPT) + @echo 'return_code=$$?' 
>> $(RUN_SCRIPT) + @echo 'if [ $$return_code -ne 0 ]; then' >> $(RUN_SCRIPT) + @echo ' echo "ERROR: Embedded host run failed, RC=$$return_code"' >> $(RUN_SCRIPT) + @echo 'else' >> $(RUN_SCRIPT) + @echo ' echo "INFO: TEST PASSED, RC=0"' >> $(RUN_SCRIPT) + @echo 'fi' >> $(RUN_SCRIPT) + @echo 'echo "INFO: Embedded host run completed."' >> $(RUN_SCRIPT) + @echo 'exit $$return_code' >> $(RUN_SCRIPT) +DATA_FILE := +DATA_DIR := +SD_FILES += $(RUN_SCRIPT) +SD_FILES += $(EXE_FILE) +SD_FILES += $(EMCONFIG) +SD_FILES += xrt.ini +SD_FILES += $(DATA_FILE)# where define DATAFILE in json +SD_FILES_WITH_PREFIX = $(foreach sd_file,$(SD_FILES), $(if $(filter $(sd_file),$(wildcard $(sd_file))), --package.sd_file $(sd_file))) +SD_DIRS_WITH_PREFIX = $(foreach sd_dir,$(DATA_DIR),--package.sd_dir $(sd_dir)) +PACKAGE_FILES := $(BINARY_CONTAINERS) +PACKAGE_FILES += $(AIE_CONTAINER) +SD_CARD := $(CUR_DIR)/package_$(TARGET) +vck190_dfx_hw := false +$(SD_CARD): $(EXE_FILE) $(BINARY_CONTAINERS) $(RUN_SCRIPT) $(EMCONFIG) + @echo "Generating sd_card folder...." + mkdir -p $(SD_CARD) + chmod a+rx $(BUILD_DIR)/run_script.sh +ifneq (,$(findstring vck190_base_dfx, $(PLATFORM_NAME))) +ifeq ($(TARGET),hw) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p $(PACKAGE_FILES) -o $(BINARY_CONTAINERS_PKG) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -p --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) --package.sd_file $(BINARY_CONTAINERS_PKG) + @echo "### ***** sd_card generation done! ***** ###" +vck190_dfx_hw := true +endif +endif +ifeq ($(vck190_dfx_hw), false) + $(VPP) -t $(TARGET) --platform $(XPLATFORM) -o $(BINARY_CONTAINERS_PKG) -p $(PACKAGE_FILES) $(VPP_PACKAGE) --package.out_dir $(SD_CARD) --package.rootfs $(SYSROOT)/../../rootfs.ext4 --package.kernel_image $(K_IMAGE) $(SD_FILES_WITH_PREFIX) $(SD_DIRS_WITH_PREFIX) + @echo "### ***** sd_card generation done! 
***** ###" +endif + +.PHONY: sd_card +sd_card: $(SD_CARD) +endif +############################## Setting Essential Checks and Building Rules ############################## +RUN_DEPS += $(EXE_FILE) $(BINARY_CONTAINERS) $(EMCONFIG) +RUN_DEPS += $(SD_CARD) + +.PHONY: mkflag all run +mkflag: + mkdir -p $(BUILD_DIR) + rm -rf $(BUILD_DIR)/makefile_args.txt + @for var in $(MAKEFLAGS); do echo $$var >> $(BUILD_DIR)/makefile_args.txt; done +all: check_device check_vpp check_platform mkflag $(RUN_DEPS) +run: all +#hw_emu +ifneq (,$(filter hw_emu, $(TARGET))) +ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + XCL_EMULATION_MODE=$(TARGET) $(EXE_FILE) $(HOST_ARGS) + ./check.sh +else + @echo $(RUN_DEPS) + $(SD_CARD)/launch_$(TARGET).sh -no-reboot -run-app $(notdir $(RUN_SCRIPT)) + grep "TEST PASSED, RC=0" $(SD_CARD)/qemu_output.log || exit 1 + ./check.sh +endif +endif +#sw_emu +ifneq (,$(filter sw_emu, $(TARGET))) +ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + XCL_EMULATION_MODE=$(TARGET) $(EXE_FILE) $(HOST_ARGS) + ./check.sh +else + @echo $(RUN_DEPS) + $(SD_CARD)/launch_$(TARGET).sh -no-reboot -run-app $(notdir $(RUN_SCRIPT)) + grep "TEST PASSED, RC=0" $(SD_CARD)/qemu_output.log || exit 1 + ./check.sh +endif +endif +#hw +ifeq ($(TARGET), hw) +ifneq (,$(findstring aws-vu9p-f1, $(PLATFORM_NAME))) +ifneq ($(JENKINS_INTERNAL_BUILD), 1) + $(ECHO) "This makefile does not directly support converting .xclbin to .awsxclbin, please refer https://github.com/aws/aws-fpga/blob/master/Vitis/README.md for next operations" +else + $(ECHO) "Running inside Xilinx regression without converting to .awsxclbin" + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(HOST_ARGS) + ./check.sh +endif +else ifeq ($(HOST_ARCH), x86) + LD_LIBRARY_PATH=$(LIBRARY_PATH):$$LD_LIBRARY_PATH \ + $(EXE_FILE) $(HOST_ARGS) + ./check.sh +else + $(ECHO) "Please copy the content of sd_card folder and data to an SD Card and run on the board" 
+endif +endif + +############################## Setting Targets ############################## + +.PHONY: clean cleanall cleanh cleank emconfig +emconfig: $(EMCONFIG) + +.PHONY: host +ifeq ($(HOST_ARCH), x86) +host: check_xrt $(EXE_FILE) +else +host: check_sysroot $(EXE_FILE) +endif + +.PHONY: xclbin +ifeq ($(HOST_ARCH), x86) +xclbin: check_vpp check_xrt $(BINARY_CONTAINERS) +else +xclbin: check_vpp check_sysroot $(BINARY_CONTAINERS) +endif + +############################## Cleaning Rules ############################## +cleanh: + -$(RMDIR) $(EXE_FILE) vitis_* TempConfig system_estimate.xtxt *.rpt .run/ + -$(RMDIR) src/*.ll _xocc_* .Xil dltmp* xmltmp* *.log *.jou *.wcfg *.wdb sample_link.ini sample_compile.ini obj* bin* *.csv *.jpg *.jpeg *.png + +cleank: + -$(RMDIR) $(BUILD_DIR)/*.xclbin _vimage *xclbin.run_summary qemu-memory-_* emulation/ _vimage/ pl* start_simulation.sh *.xclbin + -$(RMDIR) _x_temp.* + +cleanall: cleanh cleank + -$(RMDIR) $(BUILD_DIR) emconfig.json *.html $(TEMP_DIR) $(CUR_DIR)/reports *.csv *.run_summary $(CUR_DIR)/*.raw package_* $(BUILD_DIR)/run_script.sh .ipcache *.str + -$(RMDIR) $(CUR_DIR)/Work $(CUR_DIR)/*.xpe $(CUR_DIR)/hw.o $(CUR_DIR)/*.xsa $(CUR_DIR)/xnwOut + -$(RMDIR) + +clean: cleanh \ No newline at end of file diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/check.sh b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/check.sh new file mode 100755 index 0000000000..780685e603 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/check.sh @@ -0,0 +1 @@ +echo "7d5224e22f239d3b9322d507b8ca3fb9 t0.jxl" | md5sum -c - diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/conn_u50.cfg b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/conn_u50.cfg new file mode 100644 index 0000000000..c1d1936089 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/conn_u50.cfg @@ -0,0 +1,41 @@ +[hls] +#pre_tcl=JxlEnc_ans_pre.tcl + +[connectivity] +sp=JxlEnc_ans_initHistogram_1.ac_coeff_ordered_ddr:HBM[2] 
+sp=JxlEnc_ans_initHistogram_1.strategy_ddr:HBM[3] +sp=JxlEnc_ans_initHistogram_1.qf_ddr:HBM[4] +sp=JxlEnc_ans_initHistogram_1.qdc_ddr:HBM[5] +sp=JxlEnc_ans_initHistogram_1.ctx_map:HBM[6] +sp=JxlEnc_ans_initHistogram_1.qf_thresholds:HBM[6] +sp=JxlEnc_ans_initHistogram_1.config:HBM[7] +sp=JxlEnc_ans_initHistogram_1.ac_tokens_ddr:HBM[8] +sp=JxlEnc_ans_initHistogram_1.tokens0_ptr:HBM[9] +sp=JxlEnc_ans_initHistogram_1.tokens1_ptr:HBM[10] +sp=JxlEnc_ans_initHistogram_1.tokens2_ptr:HBM[11] +sp=JxlEnc_ans_initHistogram_1.tokens3_ptr:HBM[12] +# nonempty +sp=JxlEnc_ans_initHistogram_1.nonempty0_ptr:HBM[9] +sp=JxlEnc_ans_initHistogram_1.nonempty1_ptr:HBM[9] +sp=JxlEnc_ans_initHistogram_1.nonempty2_ptr:HBM[9] +sp=JxlEnc_ans_initHistogram_1.nonempty3_ptr:HBM[9] +sp=JxlEnc_ans_initHistogram_1.nonempty4_ptr:HBM[9] +# histograms_ptr +sp=JxlEnc_ans_initHistogram_1.histograms0_ptr:HBM[10] +sp=JxlEnc_ans_initHistogram_1.histograms1_ptr:HBM[10] +sp=JxlEnc_ans_initHistogram_1.histograms2_ptr:HBM[10] +sp=JxlEnc_ans_initHistogram_1.histograms3_ptr:HBM[10] +sp=JxlEnc_ans_initHistogram_1.histograms4_ptr:HBM[10] +# histograms_size +sp=JxlEnc_ans_initHistogram_1.histograms_size0_ptr:HBM[11] +sp=JxlEnc_ans_initHistogram_1.histograms_size1_ptr:HBM[11] +sp=JxlEnc_ans_initHistogram_1.histograms_size2_ptr:HBM[11] +sp=JxlEnc_ans_initHistogram_1.histograms_size3_ptr:HBM[11] +sp=JxlEnc_ans_initHistogram_1.histograms_size4_ptr:HBM[11] +# total_count +sp=JxlEnc_ans_initHistogram_1.total_count0_ptr:HBM[12] +sp=JxlEnc_ans_initHistogram_1.total_count1_ptr:HBM[12] +sp=JxlEnc_ans_initHistogram_1.total_count2_ptr:HBM[12] +sp=JxlEnc_ans_initHistogram_1.total_count3_ptr:HBM[12] +sp=JxlEnc_ans_initHistogram_1.total_count4_ptr:HBM[12] + diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/description.json b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/description.json new file mode 100644 index 0000000000..34d11bd3a8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/description.json @@ -0,0 
+1,329 @@ +{ + "gui": false, + "name": "JXL ACC_TOKINIT Demo", + "description": "This example is based on Google's PIK, which was chosen as the base framework for JPEG XL. The pikEnc is based on the 'fast mode' of PIK which can provide better encoding efficiency than most other still image encoding methods. The pikEnc is based on Xilinx HLS design methodology and optimized for FPGA architecture. It can provide higher throughput and lower latency compared to software-based solutions", + "flow": "vitis", + "platform_allowlist": [ + "u50" + ], + "platform_blocklist": [ + "zc" + ], + "platform_properties": { + "u50": { + "v++": { + "compiler": { + "clflags": [ + "--config PROJECT/conn_u50.cfg" + ] + } + } + } + }, + "data": [ + "./data" + ], + "launch": [ + { + "cmd_args": " --xclbin BUILD/jxlEnc.xclbin LIB_DIR/L2/demos/jxlEnc/images/t0.png t0.jxl", + "name": "generic launch for all flows" + } + ], + "post_launch": [ + { + "launch_cmd": [ + "./check.sh" + ] + } + ], + "host": { + "host_exe": "host.exe", + "compiler": { + "sources": [ + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cjxl.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cjxl_main.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cmdline.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/codec_config.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/speed_stats.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cpu/cpu.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/cpu/os_specific.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/tools/box/box.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/time.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/memory.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references_hq.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/brotli_bit_stream.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/block_splitter.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/metablock.c", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/compress_fragment_two_pass.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/backward_references.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/encoder_dict.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/utf8_util.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/decode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/static_dict.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/literal_cost.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/entropy_encode.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/bit_cost.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/cluster.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/dictionary_hash.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/enc/histogram.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/bit_reader.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/huffman.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/dec/state.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/transform.c", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmslut.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsnamed.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspack.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscnvrt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio1.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgmt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsopt.cpp", + 
"LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsalpha.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmstypes.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsintrp.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsgamma.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscam02.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmscgats.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmshalf.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsmtrx.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsps2.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssamp.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmssm.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsxform.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsio0.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsplugin.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmserr.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmspcs.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmswtpnt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/src/cmsvirt.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lodepng/lodepng.cpp", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/aligned_allocator.cc", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway/hwy/targets.cc", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_cluster_histogram.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_ac_strategy.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_adaptive_quantization.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_cache.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_chroma_from_luma.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_cluster.cpp", + 
"LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/acc_enc_group.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_host.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase1.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase2.cpp", + "LIB_DIR/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase3.cpp", + "LIB_DIR/ext/xcl2/xcl2.cpp", + "LIB_DIR/L2/demos/jxlEnc/acc_tokInit_histogram/host/host_tokinit_histogram.cpp" + ], + "includepaths": [ + "LIB_DIR/../utils/L1/include/", + "LIB_DIR/ext/xcl2", + "LIB_DIR/L2/demos/jxlEnc/third_partys/", + "LIB_DIR/L2/demos/jxlEnc/third_partys/lib/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/build/lib/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lcms/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/highway", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/brotli/c/include", + "LIB_DIR/L2/demos/jxlEnc/third_partys/third_party/lodepng", + "LIB_DIR/L2/demos/jxlEnc/acc_tokInit_histogram/kernel", + "LIB_DIR/L2/demos/jxlEnc/acc_tokInit_histogram/host", + "LIB_DIR/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram", + "LIB_DIR/L2/demos/jxlEnc/others/include" + ], + "options": "-O3 " + } + }, + "v++": { + "compiler": { + "includepaths": [ + "LIB_DIR/../utils/L1/include/", + "LIB_DIR/L2/include/hw/jxlEnc" + ] + } + }, + "containers": [ + { + "name": "jxlEnc", + "accelerators": [ + { + "location": "LIB_DIR/L2/demos/jxlEnc/acc_tokInit_histogram/kernel/hls_init_histogram.cpp", + "frequency": 300.0, + "clflags": " -D KERNEL_NAME=JxlEnc_ans_initHistogram", + "name": "JxlEnc_ans_initHistogram", + "num_compute_units": 1, + "compute_units": [ + { + "name": "JxlEnc_ans_initHistogram", + "arguments": [ + { + "name": "gmem0_0", + "memory": "DDR[0]" + }, + { + "name": "gmem0_1", + "memory": "DDR[0]" + }, + { + "name": "gmem1_0", + "memory": "DDR[1]" + }, + { + "name": 
"gmem1_1", + "memory": "DDR[1]" + }, + { + "name": "gmem1_2", + "memory": "DDR[1]" + } + ] + } + ] + } + ], + "frequency": 300 + } + ], + "testinfo": { + "disable": false, + "jobs": [ + { + "index": 0, + "dependency": [], + "env": "", + "cmd": "", + "max_memory_MB": { + "vitis_hw_build": 81920, + "vitis_hw_emu": 40960, + "vitis_sw_emu": 10240, + "vitis_hw_run": 10240 + }, + "max_time_min": { + "vitis_hw_build": 3200, + "vitis_hw_emu": 1600, + "vitis_sw_emu": 120, + "vitis_hw_run": 10 + } + } + ], + "targets": [ + "vitis_sw_emu", + "vitis_hw_emu", + "vitis_hw" + ], + "category": "canary" + } +} diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/host/host_tokinit_histogram.cpp b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/host/host_tokinit_histogram.cpp new file mode 100644 index 0000000000..f9072acc8f --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/host/host_tokinit_histogram.cpp @@ -0,0 +1,638 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HOST_TOKINIT_HISTOGRAM_CPP +#define HOST_TOKINIT_HISTOGRAM_CPP + +#include +#include +#include "ap_int.h" + +#ifndef HLS_TEST +#include "xf_utils_sw/logger.hpp" +#include "xcl2.hpp" +const int PIXEL_W = 2048; +const int PIXEL_H = 2048; +const int FRAME_DIM = 3; +const int ALL_PIXEL = PIXEL_W * PIXEL_H * FRAME_DIM; +const int MAX_NUM_BLK88_W = PIXEL_W / 8; +const int MAX_NUM_BLK88_H = PIXEL_H / 8; +const int MAX_NUM_BLK88 = MAX_NUM_BLK88_W * MAX_NUM_BLK88_H; +const int MAX_ORDERS_SIZE = (3 * 64 + 3 * 64 + 3 * 256 + 3 * 1024); +const int MAX_QF_THRESH_SIZE = 256; +const int MAX_CTX_MAP_SIZE = 256; +const int MAX_AC_TOKEN_SIZE = ALL_PIXEL; +#else +#include "hls_init_histogram.hpp" +#endif + +#define MAX_NUM_CONFIG 32 + +unsigned long diff(const struct timeval* newTime, const struct timeval* oldTime) { + return (newTime->tv_sec - oldTime->tv_sec) * 1000000 + (newTime->tv_usec - oldTime->tv_usec); +} + +template +T* aligned_alloc(std::size_t num) { + void* ptr = NULL; + if (posix_memalign(&ptr, 4096, num * sizeof(T))) throw std::bad_alloc(); + return reinterpret_cast(ptr); +} + +void hls_ANSinitHistogram_wrapper(std::string xclbinPath, + int config[32], + //==================== + int32_t* ac_coeff_ordered_ddr, + int32_t* strategy_ddr, + int32_t* qf_ddr, + uint8_t* qdc_ddr, + uint8_t* ctx_map, + uint32_t* qf_thresholds, + uint64_t* ac_tokens_ddr, + //==================== + uint64_t* tokens0_ptr, + uint64_t* tokens1_ptr, + uint64_t* tokens2_ptr, + uint64_t* tokens3_ptr, + //==================== + int32_t* histograms0_ptr, + uint32_t* histograms_size0_ptr, + uint32_t* total_count0_ptr, + uint32_t* nonempty0_ptr, + //====================== + int32_t* histograms1_ptr, + uint32_t* histograms_size1_ptr, + uint32_t* total_count1_ptr, + uint32_t* nonempty1_ptr, + //====================== + int32_t* histograms2_ptr, + uint32_t* histograms_size2_ptr, + uint32_t* total_count2_ptr, + uint32_t* nonempty2_ptr, + //====================== + int32_t* histograms3_ptr, + 
uint32_t* histograms_size3_ptr, + uint32_t* total_count3_ptr, + uint32_t* nonempty3_ptr, + //====================== + int32_t* histograms4_ptr, + uint32_t* histograms_size4_ptr, + uint32_t* total_count4_ptr, + uint32_t* nonempty4_ptr) { +#ifndef HLS_TEST + + xf::common::utils_sw::Logger logger(std::cout, std::cerr); + cl_int fail; + + struct timeval start_time; // End to end time clock start + gettimeofday(&start_time, 0); + + // platform related operations + std::vector devices = xcl::get_xil_devices(); + cl::Device device = devices[0]; + + // Creating Context and Command Queue for selected Device + cl::Context context(device, NULL, NULL, NULL, &fail); + logger.logCreateContext(fail); + cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &fail); + logger.logCreateCommandQueue(fail); + std::string devName = device.getInfo(); + printf("INFO: Found Device=%s\n", devName.c_str()); + cl::Program::Binaries xclBins = xcl::import_binary_file(xclbinPath); + + devices.resize(1); + cl::Program program(context, devices, xclBins, NULL, &fail); + logger.logCreateProgram(fail); + + int repInt = 1; + // create kernels + // std::vector cluster_kernel(repInt); + std::vector initHist_kernel(repInt); + for (int i = 0; i < repInt; i++) { + initHist_kernel[i] = cl::Kernel(program, "JxlEnc_ans_initHistogram", &fail); + logger.logCreateKernel(fail); + } + std::cout << "INFO: kernel has been created" << std::endl; + + // declare map of host Buffers + std::cout << "kernel config size:" << 26 << std::endl; + std::cout << "group_dim: " << config[4] << std::endl; + std::cout << "do_once: " << config[12] << "," << config[13] << "," << config[14] << "," << config[15] << "," + << config[16] << std::endl; + + // add code for hls_ANSinitTop + // 1. 
create all I/O Buffer + uint32_t* hb_config = aligned_alloc(MAX_NUM_CONFIG); + + int32_t* hb_ac_coeff_ordered_ddr = aligned_alloc(ALL_PIXEL); + int32_t* hb_strategy_ddr = aligned_alloc(MAX_NUM_BLK88); + int32_t* hb_qf_ddr = aligned_alloc(MAX_NUM_BLK88); + uint8_t* hb_qdc_ddr = aligned_alloc(MAX_NUM_BLK88); + uint8_t* hb_ctx_map = aligned_alloc(MAX_CTX_MAP_SIZE); + uint32_t* hb_qf_thresholds = aligned_alloc(MAX_QF_THRESH_SIZE); + uint64_t* hb_ac_tokens_ddr = aligned_alloc(MAX_AC_TOKEN_SIZE); + + ap_uint<64>* hb_token0_ptr = aligned_alloc >(MAX_AC_TOKEN_SIZE); + ap_uint<64>* hb_token1_ptr = aligned_alloc >(MAX_AC_TOKEN_SIZE); + ap_uint<64>* hb_token2_ptr = aligned_alloc >(MAX_AC_TOKEN_SIZE); + ap_uint<64>* hb_token3_ptr = aligned_alloc >(MAX_AC_TOKEN_SIZE); + + int32_t* hb_histograms0_ptr = aligned_alloc(163840); + int32_t* hb_histograms1_ptr = aligned_alloc(163840); + int32_t* hb_histograms2_ptr = aligned_alloc(163840); + int32_t* hb_histograms3_ptr = aligned_alloc(163840); + int32_t* hb_histograms4_ptr = aligned_alloc(163840); + + uint32_t* hb_histograms_size0_ptr = aligned_alloc(4096); + uint32_t* hb_histograms_size1_ptr = aligned_alloc(4096); + uint32_t* hb_histograms_size2_ptr = aligned_alloc(4096); + uint32_t* hb_histograms_size3_ptr = aligned_alloc(4096); + uint32_t* hb_histograms_size4_ptr = aligned_alloc(4096); + + uint32_t* hb_total_count0_ptr = aligned_alloc(4096); + uint32_t* hb_total_count1_ptr = aligned_alloc(4096); + uint32_t* hb_total_count2_ptr = aligned_alloc(4096); + uint32_t* hb_total_count3_ptr = aligned_alloc(4096); + uint32_t* hb_total_count4_ptr = aligned_alloc(4096); + + uint32_t* hb_nonempty0_ptr = aligned_alloc(4096); + uint32_t* hb_nonempty1_ptr = aligned_alloc(4096); + uint32_t* hb_nonempty2_ptr = aligned_alloc(4096); + uint32_t* hb_nonempty3_ptr = aligned_alloc(4096); + uint32_t* hb_nonempty4_ptr = aligned_alloc(4096); + + //================================================== + // 2. 
init all the host Buffers + //================================================== + for (int j = 0; j < MAX_NUM_CONFIG; j++) { + hb_config[j] = config[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) { + hb_ac_coeff_ordered_ddr[j] = ac_coeff_ordered_ddr[j]; + } + + for (int j = 0; j < MAX_NUM_BLK88; j++) { + hb_strategy_ddr[j] = strategy_ddr[j]; + } + + for (int j = 0; j < MAX_NUM_BLK88; j++) { + hb_qdc_ddr[j] = qdc_ddr[j]; + } + + for (int j = 0; j < MAX_NUM_BLK88; j++) { + hb_qf_ddr[j] = qf_ddr[j]; + } + + for (int j = 0; j < MAX_CTX_MAP_SIZE; j++) { + hb_ctx_map[j] = ctx_map[j]; + } + + for (int j = 0; j < MAX_QF_THRESH_SIZE; j++) { + hb_qf_thresholds[j] = qf_thresholds[j]; + } + + for (int j = 0; j < MAX_AC_TOKEN_SIZE; j++) { + hb_ac_tokens_ddr[j] = ac_tokens_ddr[j]; + } + + for (int j = 0; j < MAX_AC_TOKEN_SIZE; j++) { + hb_token0_ptr[j] = (ap_uint<64>)tokens0_ptr[j]; + hb_token1_ptr[j] = (ap_uint<64>)tokens1_ptr[j]; + hb_token2_ptr[j] = (ap_uint<64>)tokens2_ptr[j]; + hb_token3_ptr[j] = (ap_uint<64>)tokens3_ptr[j]; + } + + for (int j = 0; j < 163840; j++) { + hb_histograms0_ptr[j] = 0; + hb_histograms1_ptr[j] = 0; + hb_histograms2_ptr[j] = 0; + hb_histograms3_ptr[j] = 0; + hb_histograms4_ptr[j] = 0; + } + + for (int j = 0; j < 4096; j++) { + hb_histograms_size0_ptr[j] = 0; + hb_histograms_size1_ptr[j] = 0; + hb_histograms_size2_ptr[j] = 0; + hb_histograms_size3_ptr[j] = 0; + hb_histograms_size4_ptr[j] = 0; + } + + for (int j = 0; j < 4096; j++) { + hb_total_count0_ptr[j] = 0; + hb_total_count1_ptr[j] = 0; + hb_total_count2_ptr[j] = 0; + hb_total_count3_ptr[j] = 0; + hb_total_count4_ptr[j] = 0; + } + + for (int j = 0; j < 4096; j++) { + hb_nonempty0_ptr[j] = 0; + hb_nonempty1_ptr[j] = 0; + hb_nonempty2_ptr[j] = 0; + hb_nonempty3_ptr[j] = 0; + hb_nonempty4_ptr[j] = 0; + } + + // mapping to HBM banks + std::vector mext_o(33); + mext_o[0] = {(((unsigned int)(7)) | XCL_MEM_TOPOLOGY), hb_config, 0}; + + mext_o[1] = {(((unsigned int)(2)) | XCL_MEM_TOPOLOGY), 
hb_ac_coeff_ordered_ddr, 0}; + mext_o[2] = {(((unsigned int)(3)) | XCL_MEM_TOPOLOGY), hb_strategy_ddr, 0}; + mext_o[3] = {(((unsigned int)(4)) | XCL_MEM_TOPOLOGY), hb_qf_ddr, 0}; + mext_o[4] = {(((unsigned int)(5)) | XCL_MEM_TOPOLOGY), hb_qdc_ddr, 0}; + mext_o[5] = {(((unsigned int)(6)) | XCL_MEM_TOPOLOGY), hb_ctx_map, 0}; + mext_o[6] = {(((unsigned int)(6)) | XCL_MEM_TOPOLOGY), hb_qf_thresholds, 0}; + mext_o[7] = {(((unsigned int)(8)) | XCL_MEM_TOPOLOGY), hb_ac_tokens_ddr, 0}; + + mext_o[8] = {(((unsigned int)(9)) | XCL_MEM_TOPOLOGY), hb_token0_ptr, 0}; + mext_o[9] = {(((unsigned int)(10)) | XCL_MEM_TOPOLOGY), hb_token1_ptr, 0}; + mext_o[10] = {(((unsigned int)(11)) | XCL_MEM_TOPOLOGY), hb_token2_ptr, 0}; + mext_o[11] = {(((unsigned int)(12)) | XCL_MEM_TOPOLOGY), hb_token3_ptr, 0}; + + mext_o[12] = {(((unsigned int)(9)) | XCL_MEM_TOPOLOGY), hb_nonempty0_ptr, 0}; + mext_o[13] = {(((unsigned int)(9)) | XCL_MEM_TOPOLOGY), hb_nonempty1_ptr, 0}; + mext_o[14] = {(((unsigned int)(9)) | XCL_MEM_TOPOLOGY), hb_nonempty2_ptr, 0}; + mext_o[15] = {(((unsigned int)(9)) | XCL_MEM_TOPOLOGY), hb_nonempty3_ptr, 0}; + mext_o[16] = {(((unsigned int)(9)) | XCL_MEM_TOPOLOGY), hb_nonempty4_ptr, 0}; + + mext_o[17] = {(((unsigned int)(10)) | XCL_MEM_TOPOLOGY), hb_histograms0_ptr, 0}; + mext_o[18] = {(((unsigned int)(10)) | XCL_MEM_TOPOLOGY), hb_histograms1_ptr, 0}; + mext_o[19] = {(((unsigned int)(10)) | XCL_MEM_TOPOLOGY), hb_histograms2_ptr, 0}; + mext_o[20] = {(((unsigned int)(10)) | XCL_MEM_TOPOLOGY), hb_histograms3_ptr, 0}; + mext_o[21] = {(((unsigned int)(10)) | XCL_MEM_TOPOLOGY), hb_histograms4_ptr, 0}; + + mext_o[22] = {(((unsigned int)(11)) | XCL_MEM_TOPOLOGY), hb_histograms_size0_ptr, 0}; + mext_o[23] = {(((unsigned int)(11)) | XCL_MEM_TOPOLOGY), hb_histograms_size1_ptr, 0}; + mext_o[24] = {(((unsigned int)(11)) | XCL_MEM_TOPOLOGY), hb_histograms_size2_ptr, 0}; + mext_o[25] = {(((unsigned int)(11)) | XCL_MEM_TOPOLOGY), hb_histograms_size3_ptr, 0}; + mext_o[26] = {(((unsigned 
int)(11)) | XCL_MEM_TOPOLOGY), hb_histograms_size4_ptr, 0}; + + mext_o[27] = {(((unsigned int)(12)) | XCL_MEM_TOPOLOGY), hb_total_count0_ptr, 0}; + mext_o[28] = {(((unsigned int)(12)) | XCL_MEM_TOPOLOGY), hb_total_count1_ptr, 0}; + mext_o[29] = {(((unsigned int)(12)) | XCL_MEM_TOPOLOGY), hb_total_count2_ptr, 0}; + mext_o[30] = {(((unsigned int)(12)) | XCL_MEM_TOPOLOGY), hb_total_count3_ptr, 0}; + mext_o[31] = {(((unsigned int)(12)) | XCL_MEM_TOPOLOGY), hb_total_count4_ptr, 0}; + + //=================================================== + // 3. create device Buffer and map dev buf to host buf, + //=================================================== + cl::Buffer db_config; + + cl::Buffer db_ac_coef_ordered_ddr; + cl::Buffer db_strategy_ddr; + cl::Buffer db_qf_ddr; + cl::Buffer db_qdc_ddr; + cl::Buffer db_ctx_map; + cl::Buffer db_qf_thresholds; + cl::Buffer db_ac_tokens_ddr; + + cl::Buffer db_token0_ptr; + cl::Buffer db_token1_ptr; + cl::Buffer db_token2_ptr; + cl::Buffer db_token3_ptr; + + cl::Buffer db_histograms0_ptr; + cl::Buffer db_histograms1_ptr; + cl::Buffer db_histograms2_ptr; + cl::Buffer db_histograms3_ptr; + cl::Buffer db_histograms4_ptr; + + cl::Buffer db_histograms_size0_ptr; + cl::Buffer db_histograms_size1_ptr; + cl::Buffer db_histograms_size2_ptr; + cl::Buffer db_histograms_size3_ptr; + cl::Buffer db_histograms_size4_ptr; + + cl::Buffer db_total_count0_ptr; + cl::Buffer db_total_count1_ptr; + cl::Buffer db_total_count2_ptr; + cl::Buffer db_total_count3_ptr; + cl::Buffer db_total_count4_ptr; + + cl::Buffer db_nonempty0_ptr; + cl::Buffer db_nonempty1_ptr; + cl::Buffer db_nonempty2_ptr; + cl::Buffer db_nonempty3_ptr; + cl::Buffer db_nonempty4_ptr; + + // init cl Buffer + db_config = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * MAX_NUM_CONFIG, &mext_o[0]); + + db_ac_coef_ordered_ddr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * ALL_PIXEL, 
&mext_o[1]); + + db_strategy_ddr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * MAX_NUM_BLK88, &mext_o[2]); + + db_qf_ddr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * MAX_NUM_BLK88, &mext_o[3]); + + db_qdc_ddr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint8_t) * MAX_NUM_BLK88, &mext_o[4]); // byte size must match the uint8_t host array hb_qdc_ddr + + db_ctx_map = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint8_t) * MAX_CTX_MAP_SIZE, &mext_o[5]); // byte size must match the uint8_t host array hb_ctx_map + + db_qf_thresholds = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * MAX_QF_THRESH_SIZE, &mext_o[6]); + db_ac_tokens_ddr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint64_t) * MAX_AC_TOKEN_SIZE, &mext_o[7]); // 64-bit tokens: cover the whole uint64_t host array hb_ac_tokens_ddr + //================================= + db_token0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(ap_uint<64>) * MAX_AC_TOKEN_SIZE, &mext_o[8]); + + db_token1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(ap_uint<64>) * MAX_AC_TOKEN_SIZE, &mext_o[9]); + + db_token2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(ap_uint<64>) * MAX_AC_TOKEN_SIZE, &mext_o[10]); + + db_token3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(ap_uint<64>) * MAX_AC_TOKEN_SIZE, &mext_o[11]); + //=================================== + db_nonempty0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[12]); + db_nonempty1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[13]); + db_nonempty2_ptr = cl::Buffer(context,
CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[14]); + db_nonempty3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[15]); + db_nonempty4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[16]); + //================================= + db_histograms0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[17]); + db_histograms1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[18]); + db_histograms2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[19]); + db_histograms3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[20]); + db_histograms4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(int32_t) * 163840, &mext_o[21]); + //================================= + db_histograms_size0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[22]); + + db_histograms_size1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[23]); + + db_histograms_size2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[24]); + + db_histograms_size3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[25]); + + db_histograms_size4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, 
&mext_o[26]); + + //================================== + db_total_count0_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[27]); + + db_total_count1_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[28]); + + db_total_count2_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[29]); + + db_total_count3_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[30]); + + db_total_count4_ptr = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(uint32_t) * 4096, &mext_o[31]); + + //================================== + // add Buffers to migrate + std::vector ob_in; + std::vector ob_out; + + ob_in.push_back(db_config); + ob_in.push_back(db_ac_coef_ordered_ddr); + ob_in.push_back(db_strategy_ddr); + ob_in.push_back(db_qf_ddr); + ob_in.push_back(db_qdc_ddr); + ob_in.push_back(db_ctx_map); + ob_in.push_back(db_qf_thresholds); + ob_in.push_back(db_token0_ptr); + ob_in.push_back(db_token1_ptr); + ob_in.push_back(db_token2_ptr); + ob_in.push_back(db_token3_ptr); + + ob_out.push_back(db_config); + ob_out.push_back(db_ac_tokens_ddr); + ob_out.push_back(db_histograms0_ptr); + ob_out.push_back(db_histograms1_ptr); + ob_out.push_back(db_histograms2_ptr); + ob_out.push_back(db_histograms3_ptr); + ob_out.push_back(db_histograms4_ptr); + ob_out.push_back(db_histograms_size0_ptr); + ob_out.push_back(db_histograms_size1_ptr); + ob_out.push_back(db_histograms_size2_ptr); + ob_out.push_back(db_histograms_size3_ptr); + ob_out.push_back(db_histograms_size4_ptr); + ob_out.push_back(db_total_count0_ptr); + ob_out.push_back(db_total_count1_ptr); + ob_out.push_back(db_total_count2_ptr); + ob_out.push_back(db_total_count3_ptr); + 
ob_out.push_back(db_total_count4_ptr); + ob_out.push_back(db_nonempty0_ptr); + ob_out.push_back(db_nonempty1_ptr); + ob_out.push_back(db_nonempty2_ptr); + ob_out.push_back(db_nonempty3_ptr); + ob_out.push_back(db_nonempty4_ptr); + + // set kernel args + for (int i = 0; i < repInt; i++) { + initHist_kernel[i].setArg(0, db_config); + initHist_kernel[i].setArg(1, db_ac_coef_ordered_ddr); + initHist_kernel[i].setArg(2, db_strategy_ddr); + initHist_kernel[i].setArg(3, db_qf_ddr); + initHist_kernel[i].setArg(4, db_qdc_ddr); + initHist_kernel[i].setArg(5, db_ctx_map); + initHist_kernel[i].setArg(6, db_qf_thresholds); + initHist_kernel[i].setArg(7, db_ac_tokens_ddr); + initHist_kernel[i].setArg(8, db_token0_ptr); + initHist_kernel[i].setArg(9, db_token1_ptr); + initHist_kernel[i].setArg(10, db_token2_ptr); + initHist_kernel[i].setArg(11, db_token3_ptr); + initHist_kernel[i].setArg(12, db_histograms0_ptr); + initHist_kernel[i].setArg(13, db_histograms_size0_ptr); + initHist_kernel[i].setArg(14, db_total_count0_ptr); + initHist_kernel[i].setArg(15, db_nonempty0_ptr); + initHist_kernel[i].setArg(16, db_histograms1_ptr); + initHist_kernel[i].setArg(17, db_histograms_size1_ptr); + initHist_kernel[i].setArg(18, db_total_count1_ptr); + initHist_kernel[i].setArg(19, db_nonempty1_ptr); + initHist_kernel[i].setArg(20, db_histograms2_ptr); + initHist_kernel[i].setArg(21, db_histograms_size2_ptr); + initHist_kernel[i].setArg(22, db_total_count2_ptr); + initHist_kernel[i].setArg(23, db_nonempty2_ptr); + initHist_kernel[i].setArg(24, db_histograms3_ptr); + initHist_kernel[i].setArg(25, db_histograms_size3_ptr); + initHist_kernel[i].setArg(26, db_total_count3_ptr); + initHist_kernel[i].setArg(27, db_nonempty3_ptr); + initHist_kernel[i].setArg(28, db_histograms4_ptr); + initHist_kernel[i].setArg(29, db_histograms_size4_ptr); + initHist_kernel[i].setArg(30, db_total_count4_ptr); + initHist_kernel[i].setArg(31, db_nonempty4_ptr); + } + + // launch kernel and calculate kernel execution time 
+ std::cout << "INFO: Kernel Start" << std::endl; + // declare events + std::vector events_write(1); + std::vector events_kernel(1); + std::vector events_read(1); + + // migrate, + q.enqueueMigrateMemObjects(ob_in, 0, nullptr, &events_write[0]); + q.enqueueTask(initHist_kernel[0], &events_write, &events_kernel[0]); + q.enqueueMigrateMemObjects(ob_out, 1, &events_kernel, &events_read[0]); + q.finish(); + + struct timeval end_time; + gettimeofday(&end_time, 0); + std::cout << "INFO: Finish kernel execution" << std::endl; + std::cout << "INFO: Finish E2E execution" << std::endl; + + // print related times + unsigned long timeStart, timeEnd, exec_time0; + std::cout << "-------------------------------------------------------" << std::endl; + events_write[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + events_write[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 = (timeEnd - timeStart) / 1000.0; + std::cout << "INFO: Data transfer from host to device: " << exec_time0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + events_read[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + events_read[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 = (timeEnd - timeStart) / 1000.0; + std::cout << "INFO: Kernel1 Data transfer from device to host: " << exec_time0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + exec_time0 = 0; + for (int i = 0; i < 1; ++i) { + events_kernel[0].getProfilingInfo(CL_PROFILING_COMMAND_START, &timeStart); + events_kernel[0].getProfilingInfo(CL_PROFILING_COMMAND_END, &timeEnd); + exec_time0 += (timeEnd - timeStart) / 1000.0; + + std::cout << "INFO: Kernel" << i + 1 << " execution: " << (timeEnd - timeStart) / 1000.0 << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + } + std::cout << "INFO: kernel total execution: " << exec_time0 << " 
us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + unsigned long exec_timeE2E = diff(&end_time, &start_time); + std::cout << "INFO: FPGA execution time:" << exec_timeE2E << " us\n"; + std::cout << "-------------------------------------------------------" << std::endl; + + for (int j = 0; j < MAX_NUM_CONFIG; j++) { + config[j] = hb_config[j]; + } + + for (int j = 0; j < ALL_PIXEL; j++) { + ac_tokens_ddr[j] = hb_ac_tokens_ddr[j]; + } + + // output + std::cout << "histograms_ptr:" << std::endl; + for (int j = 0; j < 163840; j++) { + histograms0_ptr[j] = hb_histograms0_ptr[j]; + histograms1_ptr[j] = hb_histograms1_ptr[j]; + histograms2_ptr[j] = hb_histograms2_ptr[j]; + histograms3_ptr[j] = hb_histograms3_ptr[j]; + histograms4_ptr[j] = hb_histograms4_ptr[j]; + } + + std::cout << "histograms_size:" << std::endl; + for (int j = 0; j < 4096; j++) { + histograms_size0_ptr[j] = hb_histograms_size0_ptr[j]; + histograms_size1_ptr[j] = hb_histograms_size1_ptr[j]; + histograms_size2_ptr[j] = hb_histograms_size2_ptr[j]; + histograms_size3_ptr[j] = hb_histograms_size3_ptr[j]; + histograms_size4_ptr[j] = hb_histograms_size4_ptr[j]; + } + + std::cout << "total_count_ptr:" << std::endl; + for (int j = 0; j < 4096; j++) { + total_count0_ptr[j] = hb_total_count0_ptr[j]; + total_count1_ptr[j] = hb_total_count1_ptr[j]; + total_count2_ptr[j] = hb_total_count2_ptr[j]; + total_count3_ptr[j] = hb_total_count3_ptr[j]; + total_count4_ptr[j] = hb_total_count4_ptr[j]; + } + + for (int j = 0; j < 4096; j++) { + nonempty0_ptr[j] = hb_nonempty0_ptr[j]; + nonempty1_ptr[j] = hb_nonempty1_ptr[j]; + nonempty2_ptr[j] = hb_nonempty2_ptr[j]; + nonempty3_ptr[j] = hb_nonempty3_ptr[j]; + nonempty4_ptr[j] = hb_nonempty4_ptr[j]; + } + + free(hb_config); + free(hb_ac_coeff_ordered_ddr); + free(hb_strategy_ddr); + free(hb_qf_ddr); + free(hb_qdc_ddr); + free(hb_ctx_map); + free(hb_qf_thresholds); + free(hb_ac_tokens_ddr); + free(hb_token0_ptr); + free(hb_token1_ptr); + 
free(hb_token2_ptr); + free(hb_token3_ptr); + free(hb_histograms0_ptr); + free(hb_histograms1_ptr); + free(hb_histograms2_ptr); + free(hb_histograms3_ptr); + free(hb_histograms4_ptr); + free(hb_histograms_size0_ptr); + free(hb_histograms_size1_ptr); + free(hb_histograms_size2_ptr); + free(hb_histograms_size3_ptr); + free(hb_histograms_size4_ptr); + free(hb_total_count0_ptr); + free(hb_total_count1_ptr); + free(hb_total_count2_ptr); + free(hb_total_count3_ptr); + free(hb_total_count4_ptr); + free(hb_nonempty0_ptr); + free(hb_nonempty1_ptr); + free(hb_nonempty2_ptr); + free(hb_nonempty3_ptr); + free(hb_nonempty4_ptr); + + std::cout << "finished opencl host" << std::endl; +#else + ap_uint<64>* hls_tokens0_ptr = reinterpret_cast*>(tokens0_ptr); + ap_uint<64>* hls_tokens1_ptr = reinterpret_cast*>(tokens1_ptr); + ap_uint<64>* hls_tokens2_ptr = reinterpret_cast*>(tokens2_ptr); + ap_uint<64>* hls_tokens3_ptr = reinterpret_cast*>(tokens3_ptr); + + hls_ANSinitHistogram(config, ac_coeff_ordered_ddr, strategy_ddr, qf_ddr, qdc_ddr, ctx_map, qf_thresholds, + ac_tokens_ddr, hls_tokens0_ptr, hls_tokens1_ptr, hls_tokens2_ptr, hls_tokens3_ptr, + histograms0_ptr, histograms_size0_ptr, total_count0_ptr, nonempty0_ptr, histograms1_ptr, + histograms_size1_ptr, total_count1_ptr, nonempty1_ptr, histograms2_ptr, histograms_size2_ptr, + total_count2_ptr, nonempty2_ptr, histograms3_ptr, histograms_size3_ptr, total_count3_ptr, + nonempty3_ptr, histograms4_ptr, histograms_size4_ptr, total_count4_ptr, nonempty4_ptr); +#endif +} + +#endif diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/host/host_tokinit_histogram.hpp b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/host/host_tokinit_histogram.hpp new file mode 100644 index 0000000000..75b3cf0f8a --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/host/host_tokinit_histogram.hpp @@ -0,0 +1,81 @@ +/* + * Copyright 2022 Xilinx, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HOST_TOKINIT_HISTOGRAM_HPP +#define HOST_TOKINIT_HISTOGRAM_HPP + +#include +#include +#ifndef HLS_TEST +#include "xcl2.hpp" +#include "xf_utils_sw/logger.hpp" +const int PIXEL_W = 2048; +const int PIXEL_H = 2048; +const int FRAME_DIM = 3; +const int ALL_PIXEL = PIXEL_W * PIXEL_H * FRAME_DIM; +const int MAX_NUM_BLK88_W = PIXEL_W / 8; +const int MAX_NUM_BLK88_H = PIXEL_H / 8; +const int MAX_NUM_BLK88 = MAX_NUM_BLK88_W * MAX_NUM_BLK88_H; +const int MAX_ORDERS_SIZE = (3 * 64 + 3 * 64 + 3 * 256 + 3 * 1024); +const int MAX_QF_THRESH_SIZE = 256; +const int MAX_CTX_MAP_SIZE = 256; +const int MAX_AC_TOKEN_SIZE = ALL_PIXEL; +#else +#include "hls_init_histogram.hpp" +#endif + +void hls_ANSinitHistogram_wrapper(std::string xclbinPath, + int config[32], + //==================== + int32_t* ac_coeff_ordered_ddr, + int32_t* strategy_ddr, + int32_t* qf_ddr, + uint8_t* qdc_ddr, + uint8_t* ctx_map, + uint32_t* qf_thresholds, + uint64_t* ac_tokens_ddr, + //==================== + uint64_t* tokens0_ptr, + uint64_t* tokens1_ptr, + uint64_t* tokens2_ptr, + uint64_t* tokens3_ptr, + //==================== + int32_t* histograms0_ptr, + uint32_t* histograms_size0_ptr, + uint32_t* total_count0_ptr, + uint32_t* nonempty0_ptr, + //====================== + int32_t* histograms1_ptr, + uint32_t* histograms_size1_ptr, + uint32_t* total_count1_ptr, + uint32_t* nonempty1_ptr, + //====================== + int32_t* histograms2_ptr, 
+ uint32_t* histograms_size2_ptr, + uint32_t* total_count2_ptr, + uint32_t* nonempty2_ptr, + //====================== + int32_t* histograms3_ptr, + uint32_t* histograms_size3_ptr, + uint32_t* total_count3_ptr, + uint32_t* nonempty3_ptr, + //====================== + int32_t* histograms4_ptr, + uint32_t* histograms_size4_ptr, + uint32_t* total_count4_ptr, + uint32_t* nonempty4_ptr); + +#endif diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/kernel/hls_init_histogram.cpp b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/kernel/hls_init_histogram.cpp new file mode 100644 index 0000000000..bc647c9759 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/kernel/hls_init_histogram.cpp @@ -0,0 +1,1725 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HLS_INIT_HISTOGRAM_CPP +#define HLS_INIT_HISTOGRAM_CPP + +#include "hls_init_histogram.hpp" +#include "hls_math.h" + +#define kBlockDim 8 +#define kDCTBlockSize (kBlockDim * kBlockDim) /* parenthesized so the macro is safe next to /, %, etc. */ +#define kNonZeroBuckets 37 +#define kZeroDensityContextCount 458 +//====================================================================================// +// hls_initHistogram_qc.cpp +//====================================================================================// +void hls_InitHistogram(hls::stream >& token_stream, +#ifndef __SYNTHESIS__ + std::vector >& histograms_uram, +#else + int32_t histograms_uram[4096][40], +#endif + uint32_t histograms_size[4096], + uint32_t total_count[4096], + uint32_t& nempty_cnt, + uint32_t nonempty_[4096], + uint32_t& large_idx) { +#pragma HLS INLINE off + + int32_t histo_reg[4] = {0, 0, 0, 0}; +#pragma HLS array_partition variable = histo_reg complete dim = 1 + int32_t histo_ctx[4] = {-1, -1, -1, -1}; +#pragma HLS array_partition variable = histo_ctx complete dim = 1 + int32_t histo_tok[4] = {-1, -1, -1, -1}; +#pragma HLS array_partition variable = histo_tok complete dim = 1 + + uint32_t totalcnt_reg[4] = {0, 0, 0, 0}; +#pragma HLS array_partition variable = totalcnt_reg complete dim = 1 + + uint32_t histo_size_reg[4] = {0, 0, 0, 0}; +#pragma HLS array_partition variable = histo_size_reg complete dim = 1 + + ap_uint<64> token_reg = 0; + token_reg = token_stream.read(); + + nempty_cnt = 0; + uint32_t max_totalcnt = 0; + + int tmp_test = 0; // csim-only + +INIT_HISTOGRAM_LOOP: + while (token_reg[63] != 1) { +#pragma HLS PIPELINE II = 1 +#pragma HLS DEPENDENCE variable = total_count inter false +#pragma HLS DEPENDENCE variable = histograms_uram inter false +#pragma HLS DEPENDENCE variable = histograms_size inter false + + // csim-only + tmp_test++; + + ap_uint<32> value = token_reg.range(31, 0); + ap_uint<31> context = token_reg.range(62, 32); + uint32_t tok; + + if (value < 16) { + tok = value; + } else { + uint32_t n = 32 -
value.countLeadingZeros() - 1; + uint32_t m = value - (1 << n); + tok = 16 + ((n - 4) << 2) + (m >> (n - 2)); + } + int32_t histo_read; + int32_t histo_write; + if (context == histo_ctx[0] && tok == histo_tok[0]) { + histo_read = histo_reg[0]; + } else if (context == histo_ctx[1] && tok == histo_tok[1]) { + histo_read = histo_reg[1]; + } else if (context == histo_ctx[2] && tok == histo_tok[2]) { + histo_read = histo_reg[2]; + } else if (context == histo_ctx[3] && tok == histo_tok[3]) { + histo_read = histo_reg[3]; + } else { + histo_read = histograms_uram[context][tok]; + } + histo_write = histo_read + 1; + + uint32_t tot_cnt_read; + uint32_t tot_cnt_write; + uint32_t siz_read; + uint32_t siz_write; + if (context == histo_ctx[0]) { + tot_cnt_read = totalcnt_reg[0]; + siz_read = histo_size_reg[0]; + } else if (context == histo_ctx[1]) { + tot_cnt_read = totalcnt_reg[1]; + siz_read = histo_size_reg[1]; + } else if (context == histo_ctx[2]) { + tot_cnt_read = totalcnt_reg[2]; + siz_read = histo_size_reg[2]; + } else if (context == histo_ctx[3]) { + tot_cnt_read = totalcnt_reg[3]; + siz_read = histo_size_reg[3]; + } else { + tot_cnt_read = total_count[context]; + siz_read = histograms_size[context]; + } + + tot_cnt_write = tot_cnt_read + 1; + + if (tot_cnt_read == 0) { + nonempty_[nempty_cnt] = context; + nempty_cnt++; + } + if (tot_cnt_write > max_totalcnt) { + large_idx = context; + max_totalcnt = tot_cnt_write; + } + + if (siz_read <= tok) { + siz_write = (tok + 8) / 8 * 8; + } else { + siz_write = siz_read; + } + + token_reg = token_stream.read(); + histograms_uram[context][tok] = histo_write; // II=1 + histo_reg[3] = histo_reg[2]; + histo_reg[2] = histo_reg[1]; + histo_reg[1] = histo_reg[0]; + histo_reg[0] = histo_write; + histo_ctx[3] = histo_ctx[2]; + histo_ctx[2] = histo_ctx[1]; + histo_ctx[1] = histo_ctx[0]; + histo_ctx[0] = context; + histo_tok[3] = histo_tok[2]; + histo_tok[2] = histo_tok[1]; + histo_tok[1] = histo_tok[0]; + histo_tok[0] = tok; + 
total_count[context] = tot_cnt_write; // shoulde be II=1 + totalcnt_reg[3] = totalcnt_reg[2]; + totalcnt_reg[2] = totalcnt_reg[1]; + totalcnt_reg[1] = totalcnt_reg[0]; + totalcnt_reg[0] = tot_cnt_write; + histograms_size[context] = siz_write; + histo_size_reg[3] = histo_size_reg[2]; + histo_size_reg[2] = histo_size_reg[1]; + histo_size_reg[1] = histo_size_reg[0]; + histo_size_reg[0] = siz_write; + } +} + +void init_histogram_core(hls::stream >& token_stream, + int32_t* histograms_ptr, + uint32_t* histograms_size_ptr, + uint32_t* total_count_ptr, + uint32_t* nonempty_ptr, + hls::stream& strm_nempty_cnt, + hls::stream& strm_largest_idx) { +#pragma HLS INLINE off + +#ifndef __SYNTHESIS__ + std::vector > histograms_uram(4096, std::vector(40)); +#else + int32_t histograms_uram[4096][40]; // pragma +#pragma HLS BIND_STORAGE impl = URAM variable = histograms_uram type = ram_s2p +#pragma HLS ARRAY_PARTITION variable = histograms_uram complete dim = 2 + // uram pargma +#endif + + uint32_t histograms_size[4096]; + uint32_t total_count[4096]; + uint32_t nonempty_[4096]; + uint32_t nempty_cnt; + +HISTOGRAM_URAM_INIT_LOOP: + for (int j = 0; j < 4096; j++) { +#pragma HLS PIPELINE II = 1 + histograms_size[j] = 0; + total_count[j] = 0; + for (int k = 0; k < 40; k++) { +#pragma HLS UNROLL + histograms_uram[j][k] = 0; + } + } + + uint32_t largest_idx_tmp = 0; + + hls_InitHistogram(token_stream, histograms_uram, histograms_size, total_count, nempty_cnt, nonempty_, + largest_idx_tmp); + + // nempty_cnt_ptr = nempty_cnt; + strm_nempty_cnt.write(nempty_cnt); + strm_largest_idx.write(largest_idx_tmp); + + uint32_t nempty_context; +HISTOGRAM_WRITEOUT_LOOP: + for (int i = 0; i < nempty_cnt; i++) { + for (int j = 0; j < 40; j++) { +#pragma HLS PIPELINE II = 1 + if (j == 0) nempty_context = nonempty_[i]; + histograms_ptr[nempty_context * 40 + j] = histograms_uram[nempty_context][j]; + } + } + +HISTOGRAM_SIZE_WRITEOUT_LOOP: + for (int j = 0; j < 4096; j++) { +#pragma HLS PIPELINE II = 1 + 
histograms_size_ptr[j] = histograms_size[j]; + } + +HISTOGRAM_CNT_WRITEOUT_LOOP: + for (int j = 0; j < 4096; j++) { +#pragma HLS PIPELINE II = 1 + total_count_ptr[j] = total_count[j]; + } + +HISTOGRAM_NEMPTY_WRITEOUT_LOOP: + for (int j = 0; j < 4096; j++) { +#pragma HLS PIPELINE II = 1 + nonempty_ptr[j] = nonempty_[j]; + } +} + +void init_histogram_top( + // bool do_once[5], + hls::stream& strm_do_once, + hls::stream >& token_stream0, + hls::stream >& token_stream1, + hls::stream >& token_stream2, + hls::stream >& token_stream3, + hls::stream >& token_stream4, + hls::stream& strm_nempty_cnt, + hls::stream& strm_largest_idx, + + int32_t* histograms0_ptr, + uint32_t* histograms_size0_ptr, + uint32_t* total_count0_ptr, + uint32_t* nonempty0_ptr, + + int32_t* histograms1_ptr, + uint32_t* histograms_size1_ptr, + uint32_t* total_count1_ptr, + uint32_t* nonempty1_ptr, + + int32_t* histograms2_ptr, + uint32_t* histograms_size2_ptr, + uint32_t* total_count2_ptr, + uint32_t* nonempty2_ptr, + + int32_t* histograms3_ptr, + uint32_t* histograms_size3_ptr, + uint32_t* total_count3_ptr, + uint32_t* nonempty3_ptr, + + int32_t* histograms4_ptr, + uint32_t* histograms_size4_ptr, + uint32_t* total_count4_ptr, + uint32_t* nonempty4_ptr) { +#pragma HLS INLINE off + int do_once[5]; + + do_once[0] = strm_do_once.read(); + do_once[1] = strm_do_once.read(); + do_once[2] = strm_do_once.read(); + do_once[3] = strm_do_once.read(); + do_once[4] = strm_do_once.read(); + + if (do_once[0]) { + init_histogram_core(token_stream0, histograms0_ptr, histograms_size0_ptr, total_count0_ptr, nonempty0_ptr, + strm_nempty_cnt, strm_largest_idx); + } else { + strm_nempty_cnt.write(0); + strm_largest_idx.write(0); + } + + if (do_once[1]) { + init_histogram_core(token_stream1, histograms1_ptr, histograms_size1_ptr, total_count1_ptr, nonempty1_ptr, + strm_nempty_cnt, strm_largest_idx); + } else { + strm_nempty_cnt.write(0); + strm_largest_idx.write(0); + } + + if (do_once[2]) { + 
init_histogram_core(token_stream2, histograms2_ptr, histograms_size2_ptr, total_count2_ptr, nonempty2_ptr, + strm_nempty_cnt, strm_largest_idx); + } else { + strm_nempty_cnt.write(0); + strm_largest_idx.write(0); + } + + if (do_once[3]) { + init_histogram_core(token_stream3, histograms3_ptr, histograms_size3_ptr, total_count3_ptr, nonempty3_ptr, + strm_nempty_cnt, strm_largest_idx); + } else { + strm_nempty_cnt.write(0); + strm_largest_idx.write(0); + } + + if (do_once[4]) { + init_histogram_core(token_stream4, histograms4_ptr, histograms_size4_ptr, total_count4_ptr, nonempty4_ptr, + strm_nempty_cnt, strm_largest_idx); + } else { + strm_nempty_cnt.write(0); + strm_largest_idx.write(0); + } +} + +void load_token(hls::stream& strm_do_once, ap_uint<64>* tokens_ptr, hls::stream >& token_stream) { +#pragma HLS INLINE off + + int enable = strm_do_once.read(); + if (enable) { + ap_uint<64> token_reg; + uint32_t token_size = tokens_ptr[0]; + LOAD_TOKEN_LOOP: + for (int i = 0; i < (token_size + 1 + 256) / 256; i++) { + for (int j = 0; j < 256; j++) { +#pragma HLS PIPELINE II = 1 + token_reg.range(62, 0) = tokens_ptr[i * 256 + j]; + token_reg[63] = 0; + if (i * 256 + j != 0 && i * 256 + j < token_size + 1) token_stream.write(token_reg); + } + } + token_reg[63] = 1; + token_stream.write(token_reg); + } +} +//=====================================================================================================// +// hls_enc_entropy_coder.cpp +//=====================================================================================================// +#define MAX_NUM_BLOCK88_JXL (256 / 8) + +const uint8_t kNumOrders = 13; + +uint8_t covered_blocks_x[] = {1, 1, 1, 1, 2, 4, 1, 2, 1, 4, 2, 4, 1, 1, 1, 1, 1, 1, 8, 4, 8, 16, 8, 16, 32, 16, 32}; + +uint8_t covered_blocks_y[] = {1, 1, 1, 1, 2, 4, 2, 1, 4, 1, 4, 2, 1, 1, 1, 1, 1, 1, 8, 8, 4, 16, 16, 8, 32, 32, 16}; +uint64_t coverd_blocks_lut[] = {0, 0, 1, 0, 2, 0, 1, 0, 3}; + +uint16_t hls_kCoeffFreqContext[64] = { + 0xBAD, 0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 16, 16, 17, 17, + 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, + 26, 26, 26, 26, 27, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, +}; + +uint16_t hls_kCoeffNumNonzeroContext[64] = { + 0xBAD, 0, 31, 62, 62, 93, 93, 93, 93, 123, 123, 123, 123, 152, 152, 152, 152, 152, 152, 152, 152, 180, + 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, +}; + +uint64_t hls_kCoeffOrderOffset[] = { + 0, 1, 2, 3, 4, 5, 6, 10, 14, 18, 34, 50, 66, 68, 70, 72, 76, 80, 84, 92, + 100, 108, 172, 236, 300, 332, 364, 396, 652, 908, 1164, 1292, 1420, 1548, 2572, 3596, 4620, 5132, 5644, 6156, +}; + +uint8_t hls_kStrategyOrder[] = { + 0, 1, 1, 1, 2, 3, 4, 4, 5, 5, 6, 6, 1, 1, 1, 1, 1, 1, 7, 8, 8, 9, 10, 10, 11, 12, 12, +}; + +uint8_t hls_kDefaultCtxMap[39] = { + // Default ctx map clusters all the large transforms together. + 0, 1, 2, 2, 3, 3, 4, 5, 6, 6, 6, 6, 6, // + 7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14, // + 7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14, // +}; + +uint32_t hls_covered_block_lut(int covered_blocks) { +#pragma HLS INLINE + int log2_covered_blk = 0; + if (covered_blocks == 4) { + log2_covered_blk = 2; + } else if (covered_blocks == 16) { + log2_covered_blk = 4; + } else { + log2_covered_blk = 0; + } + + return log2_covered_blk; +} + +// Non-zero context is based on number of non-zeros and block context. +// For better clustering, contexts with same number of non-zeros are grouped. +uint32_t hls_ZeroDensityContextsOffset(uint64_t num_ctxs, uint32_t block_ctx) { +#pragma HLS INLINE + return num_ctxs * kNonZeroBuckets + kZeroDensityContextCount * block_ctx; +} + +// Specialization for 8x8, where only top-left is LLF/DC. +// About 1% overall speedup vs. NumNonZeroExceptLLF. 
+// Drains one block of kBlockDim * kBlockDim coefficients from strm_ac_coeff_raster and
+// counts the non-zero ones, ignoring the first (DC) position. The count is stored through
+// nzeros_pos and also returned.
+int32_t HLS_NumNonZero8x8ExceptDC(hls::stream& strm_ac_coeff_raster, int32_t* nzeros_pos) {
+    // Position 0 is always booked as a zero, whatever its value.
+    int zero_total = 1;
+HLS_COUNT_NZ8X8_INNER_LOOP:
+    for (int idx = 0; idx < kBlockDim * kBlockDim; idx++) {
+#pragma HLS PIPELINE II = 1
+        // The stream is read on every iteration, including the skipped DC slot.
+        const int32_t coeff = strm_ac_coeff_raster.read();
+
+        if (idx != 0 && coeff == 0) {
+            zero_total++;
+        }
+    }
+
+    const int32_t nonzero_total = (kDCTBlockSize - zero_total);
+    *nzeros_pos = nonzero_total;
+
+    return nonzero_total;
+}
+
+// Tells whether block (bx, by) is the top-left block of its transform.
+// Strategy 4 is anchored on even block coordinates, strategy 5 on multiples of four;
+// for every other strategy each block counts as a first block.
+// Returns 1 for a first block, 0 otherwise.
+int hls_Is_FirstBlock(int by, int bx, int8_t strategy) {
+#pragma HLS INLINE
+    switch (strategy) {
+        case 4:
+            return ((bx % 2 == 0) && (by % 2 == 0)) ? 1 : 0;
+        case 5:
+            return ((bx % 4 == 0) && (by % 4 == 0)) ? 1 : 0;
+        default:
+            return 1;
+    }
+}
+
+// Predicts the non-zero count of block (x, y) from its neighbours:
+//   - top-left corner (x == 0, y == 0): default_val;
+//   - first column: row_top[0];
+//   - first row: row[x - 1];
+//   - elsewhere: rounded-up average of row_top[x] and row[x - 1].
+// covered_blocks, log2_covered_blocks and c are accepted but not used.
+int32_t hls_PredictFromTopAndLeft(const int32_t* row_top,
+                                  const int32_t* row,
+                                  int covered_blocks,
+                                  int log2_covered_blocks,
+                                  int c,
+                                  int32_t x,
+                                  int32_t y,
+                                  int32_t default_val) {
+#pragma HLS INLINE
+    if (x == 0) {
+        // No left neighbour: fall back to the default or to the block above.
+        return (y == 0) ? default_val : row_top[0];
+    }
+    if (y == 0) {
+        // No top neighbour: use the block on the left.
+        return row[x - 1];
+    }
+    return (row_top[x] + row[x - 1] + 1) / 2;
+}
+
+// TODO(user): investigate, why disabling pre-clustering makes entropy code
+// less dense. Perhaps we would need to add HQ clustering algorithm that would
+// be able to squeeze better by spending more CPU cycles.
+// Returns the zero-density context for the coefficient at position k.
+// nonzeros_left (rounded up) and k are both scaled down by the covered-block count, looked
+// up in hls_kCoeffNumNonzeroContext / hls_kCoeffFreqContext, summed, doubled and offset by
+// prev. No range checks are made: both scaled values must stay below 64, the table size.
+uint32_t hls_ZeroDensityContext(
+    uint32_t nonzeros_left, uint32_t k, uint32_t covered_blocks, uint32_t log2_covered_blocks, uint32_t prev) {
+#pragma HLS INLINE
+    nonzeros_left = (nonzeros_left + covered_blocks - 1) >> log2_covered_blocks;
+    k >>= log2_covered_blocks;
+
+    return (hls_kCoeffNumNonzeroContext[nonzeros_left] + hls_kCoeffFreqContext[k]) * 2 + prev;
+}
+
+// Non-zero context is based on number of non-zeros and block context.
+// For better clustering, contexts with same number of non-zeros are grouped.
+// non_zeros is clamped to 64; counts below 8 keep their own bucket, larger counts share a
+// bucket per pair (4 + non_zeros / 2). Returns bucket * num_ctxs + block_ctx.
+uint32_t hls_NonZeroContext(uint64_t num_ctxs, uint32_t non_zeros, uint32_t block_ctx) {
+#pragma HLS INLINE
+    uint32_t ctx;
+    if (non_zeros >= 64) non_zeros = 64;
+    if (non_zeros < 8) {
+        ctx = non_zeros;
+    } else {
+        ctx = 4 + non_zeros / 2;
+    }
+    return ctx * num_ctxs + block_ctx;
+}
+
+// Encodes non-negative (X) into (2 * X), negative (-X) into (2 * X - 1)
+uint32_t hls_PackSigned(int32_t value) {
+#pragma HLS INLINE
+    // JXL_NO_SANITIZE("unsigned-integer-overflow") {
+    return (static_cast(value) << 1) ^ ((static_cast(~value) >> 31) - 1);
+}
+
+// Maps the per-block iteration index to the channel processed in that iteration:
+// 0 -> 1, 1 -> 0, anything else -> 2.
+int hls_dim_sanf_order(int i) {
+#pragma HLS INLINE
+    int c = 0;
+    if (i == 0)
+        c = 1;
+    else if (i == 1)
+        c = 0;
+    else
+        c = 2;
+
+    return c;
+}
+
+// Looks up the block context in ctx_map.
+// qf_idx is the number of the first qf_thresholds_size thresholds that qf exceeds. The map
+// index is built, outermost to innermost, from: channel (c < 2 ? c ^ 1 : 2), ord
+// (kNumOrders values), qf_idx (qf_thresholds_size + 1 values) and dc_idx (num_dc_ctxs
+// values). The kNumOrders parameter shadows the file-level constant of the same name.
+// No range check is made on the resulting index.
+uint64_t hls_Context(uint8_t* ctx_map,
+                     uint32_t* qf_thresholds,
+                     int qf_thresholds_size,
+                     int kNumOrders,
+                     int num_dc_ctxs,
+                     int dc_idx,
+                     int32_t qf,
+                     uint64_t ord,
+                     uint32_t c) {
+#pragma HLS INLINE
+    uint32_t qf_idx = 0;
+    for (uint32_t i = 0; i < qf_thresholds_size; i++) {
+#pragma HLS UNROLL
+        if (qf > qf_thresholds[i]) qf_idx++;
+    }
+
+    uint32_t idx = c < 2 ? (c ^ 1) : 2;
+    idx = idx * kNumOrders + ord;
+    idx = idx * (qf_thresholds_size + 1) + qf_idx;
+    idx = idx * num_dc_ctxs + dc_idx;
+    return ctx_map[idx];
+}
+
+//==================== dataflow stage-1 ===========================//
+// Counts non-zero coefficients per transform and predicts that count from neighbours.
+//
+// For every block position (row-major) and each of three channel passes (order given by
+// hls_dim_sanf_order), one strategy value is read per block position and forwarded to
+// strm_strategy_out. When the position is the first block of its transform
+// (hls_Is_FirstBlock), the transform's cx*8 x cy*8 coefficients are read from
+// strm_ac_coeff_in and forwarded unchanged to strm_ac_coeff_out while zeros are counted.
+// The non-zero count, divided (rounding up) by the covered-block count, is stored for
+// every covered position in that channel's nzero_row_* table (row pitch nzeros_stride).
+// The raw non-zero count goes to strm_nzeros and the neighbour-based prediction
+// (default 32 at the top-left corner) goes to strm_predict_nzeros.
+//
+// The nzero_row_* tables hold MAX_NUM_BLOCK88_JXL^2 entries; no range check is made.
+void hls_count_nz(int ysize_blocks,
+                  int xsize_blocks,
+                  int nzeros_stride,
+                  hls::stream& strm_ac_coeff_in,
+                  hls::stream& strm_strategy_in,
+
+                  hls::stream& strm_ac_coeff_out,
+                  hls::stream& strm_strategy_out,
+                  hls::stream& strm_nzeros,
+                  hls::stream& strm_predict_nzeros) {
+#pragma HLS INLINE off
+    // bram
+    int32_t nzero_row_left[3] = {0, 0, 0};
+    int32_t nzero_row_0[MAX_NUM_BLOCK88_JXL * MAX_NUM_BLOCK88_JXL];
+    int32_t nzero_row_1[MAX_NUM_BLOCK88_JXL * MAX_NUM_BLOCK88_JXL];
+    int32_t nzero_row_2[MAX_NUM_BLOCK88_JXL * MAX_NUM_BLOCK88_JXL];
+
+    // global config
+    int32_t hls_strategy;
+
+HLS_COUNT_NZ_OUTTER_LOOP:
+    for (uint32_t by = 0; by < ysize_blocks; ++by) {
+        // Row above the current one. `by` is unsigned, so `by - 1` would wrap on the first
+        // row and form a far out-of-range pointer; clamp instead. The top row is never
+        // dereferenced when by == 0 (see hls_PredictFromTopAndLeft), so results are unchanged.
+        const uint32_t by_top = (by > 0) ? (by - 1) : 0;
+        for (uint32_t bx = 0; bx < xsize_blocks; ++bx) {
+            for (int i = 0; i < 3; i++) {
+                int c = hls_dim_sanf_order(i);
+                int32_t* row_nzeros;
+                int32_t* row_nzeros_top;
+
+                if (c == 0) {
+                    row_nzeros = &nzero_row_0[by * nzeros_stride + 0];
+                    row_nzeros_top = &nzero_row_0[by_top * nzeros_stride + 0];
+                } else if (c == 1) {
+                    row_nzeros = &nzero_row_1[by * nzeros_stride + 0];
+                    row_nzeros_top = &nzero_row_1[by_top * nzeros_stride + 0];
+
+                } else {
+                    row_nzeros = &nzero_row_2[by * nzeros_stride + 0];
+                    row_nzeros_top = &nzero_row_2[by_top * nzeros_stride + 0];
+                }
+
+                // One strategy value per block position, shared by the three channel passes.
+                if (i == 0) {
+                    hls_strategy = strm_strategy_in.read();
+                    strm_strategy_out.write(hls_strategy);
+                }
+
+                bool hls_isFirstBlock = hls_Is_FirstBlock(by, bx, hls_strategy);
+                uint32_t cx = covered_blocks_x[hls_strategy]; // lut
+                uint32_t cy = covered_blocks_y[hls_strategy];
+                const uint32_t covered_blocks = cx * cy; // = #LLF coefficients
+                uint32_t log2_covered_blocks = hls_covered_block_lut(covered_blocks);
+                uint32_t size = covered_blocks * kDCTBlockSize;
+
+                if (hls_isFirstBlock) {
+                    int32_t* nzeros_pos = row_nzeros + bx;
+                    int num_zeros = 0;
+                    for (int y = 0; y < cy * kBlockDim; y++) {
+                        for (int x = 0; x < cx * kBlockDim; x++) {
+#pragma HLS PIPELINE II = 1
+                            int32_t ac_coeff = strm_ac_coeff_in.read();
+                            strm_ac_coeff_out.write(ac_coeff);
+                            if (!ac_coeff) {
+                                num_zeros++;
+                            }
+                        }
+                    }
+
+                    //=============Move to an independent process, linked with hls::stream num_zeros=======
+                    int nzeros = int(cx * cy * kDCTBlockSize) - num_zeros;
+                    const int32_t shifted_nzeros =
+                        static_cast((nzeros + covered_blocks - 1) >> log2_covered_blocks);
+                NZ_EXCEPT_LLF_INNER_LOOP3:
+                    for (int32_t y = 0; y < cy; y++) {
+                        for (int32_t x = 0; x < cx; x++) {
+#pragma HLS PIPELINE II = 1
+                            nzeros_pos[x + y * nzeros_stride] = shifted_nzeros;
+                        }
+                    }
+
+                    int32_t predicted_nzeros = hls_PredictFromTopAndLeft(row_nzeros_top, row_nzeros, covered_blocks,
+                                                                         log2_covered_blocks, c, bx, by, 32);
+
+                    strm_nzeros.write(nzeros);
+                    strm_predict_nzeros.write(predicted_nzeros);
+                    //===============================
+                }
+            }
+        }
+    }
+}
+
+// void hls_block_context(
+//     // config
+//     int rect_x0,
+//     int rect_y0,
+//     int ysize_blocks,
+//     int xsize_blocks,
+//     int num_ctxs,
+//     int num_dc_ctxs,
+//     int qf_thresholds_size,
+//     // bram
+//     uint8_t ctx_map[MAX_CTX_MAP_SIZE],
+//     uint32_t qf_thresholds[MAX_QF_THRESH_SIZE],
+//     // strm
+//     hls::stream& strm_qf,
+//     hls::stream& strm_qdc,
+//     hls::stream& strm_strategy_in,
+//     hls::stream& strm_strategy_out,
+//     hls::stream& strm_block_ctx) {
+// #pragma HLS INLINE off
+
+//     // global config
+//     int hls_strategy;
+//     uint8_t dc_idx;
+//     int32_t hls_qf;
+
+// Block_CTX_LOOP:
+//     for (uint32_t by = 0; by < ysize_blocks; ++by) {
+//         for (uint32_t bx = 0; bx < xsize_blocks; ++bx) {
+//             for (int i = 0; i < 3; i++) {
+// #pragma HLS PIPELINE II = 1
+
+//                 if (i == 0) {
+//                     hls_strategy = strm_strategy_in.read();
+//                     strm_strategy_out.write(hls_strategy);
+
+//                     dc_idx = strm_qdc.read();
+//                     hls_qf = strm_qf.read();
+//                 }
+//                 int ord = hls_kStrategyOrder[hls_strategy];
+//                 int c = hls_dim_sanf_order(i);
+//                 uint32_t block_ctx = hls_Context(ctx_map, qf_thresholds, qf_thresholds_size, kNumOrders, num_dc_ctxs,
+//                                                  dc_idx, hls_qf, ord, c);
+//                 strm_block_ctx.write(block_ctx);
+//             }
+//         }
+//     }
+// }
+
+//===================================================================================//
+// Merges the non-zero-count tokens and the coefficient tokens into one token sequence.
+//
+// Walks block positions and channel passes in the same order as the producers. For each
+// first block it forwards one token from strm_token_nz, then tokens from strm_token_ac
+// until a word with bit 64 (block end) is read; the block-end word itself is dropped.
+// Every forwarded token (low 64 bits) is written to both strm_token_out and
+// strm_token_internal.
+void hls_collect_syn(int xsize_blocks,
+                     int ysize_blocks,
+                     hls::stream& strm_strategy_in,
+                     hls::stream >& strm_token_nz,
+                     hls::stream >& strm_token_ac,
+                     hls::stream >& strm_token_out,
+                     hls::stream >& strm_token_internal) {
+#pragma HLS INLINE off
+    int hls_strategy;
+
+COLLECT_SYN_OUTTER_LOOP:
+    for (uint32_t by = 0; by < ysize_blocks; ++by) {
+        for (uint32_t bx = 0; bx < xsize_blocks; ++bx) {
+            for (int i = 0; i < 3; i++) {
+                int c = hls_dim_sanf_order(i);
+                if (i == 0) {
+                    hls_strategy = strm_strategy_in.read();
+                }
+                bool hls_isFirstBlock = hls_Is_FirstBlock(by, bx, hls_strategy);
+                // covered block size
+                uint32_t cx = covered_blocks_x[hls_strategy]; // lut
+                uint32_t cy = covered_blocks_y[hls_strategy];
+                uint32_t covered_blocks = cx * cy; // = #LLF coefficients
+                uint32_t size = covered_blocks * kDCTBlockSize;
+                // loop in block
+                if (hls_isFirstBlock) {
+                COLLECT_SYN_INNER_LOOP:
+                    // size + 1 is an upper bound; the loop normally leaves through the
+                    // block-end word.
+                    for (int k = 0; k < size + 1; k++) {
+#pragma HLS PIPELINE II = 1
+                        if (k == 0) {
+                            ap_uint<65> token_nz_reg = strm_token_nz.read();
+                            ap_uint<64> token_out_reg = token_nz_reg.range(63, 0);
+                            strm_token_out.write(token_out_reg);
+                            strm_token_internal.write(token_out_reg);
+                        } else {
+                            ap_uint<65> token_ac_reg = strm_token_ac.read();
+                            bool blk_end = token_ac_reg[64];
+
+                            if (blk_end) {
+                                break;
+                            } else {
+                                ap_uint<64> token_out_reg = token_ac_reg.range(63, 0);
+                                strm_token_out.write(token_out_reg);
+                                strm_token_internal.write(token_out_reg);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Emits the non-zero-count token of every transform and the context offset needed by the
+// coefficient tokenizer.
+//
+// Per block position one strategy, one dc index (strm_qdc) and one quant value (strm_qf)
+// are read; the strategy is forwarded to strm_strategy_out and strm_strategy_out2. For a
+// first block, in each channel pass, the function reads the non-zero count and its
+// prediction, forwards the count to strm_nzero_out, derives the block context
+// (hls_Context), writes hls_ZeroDensityContextsOffset() to strm_histo_offset and writes a
+// 65-bit token to strm_token_nz: bits 31..0 = count, bits 63..32 = hls_NonZeroContext(),
+// bit 64 = 0. rect_x0 and rect_y0 are accepted but not used.
+void hls_tokenize_nz(
+    // config
+    int rect_x0,
+    int rect_y0,
+    int ysize_blocks,
+    int xsize_blocks,
+    int num_ctxs,
+    int num_dc_ctxs,
+    int qf_thresholds_size,
+    // bram
+    uint8_t ctx_map[MAX_CTX_MAP_SIZE],
+    uint32_t qf_thresholds[MAX_QF_THRESH_SIZE],
+    // stream
+    hls::stream& strm_qf,
+    hls::stream& strm_qdc,
+    hls::stream& strm_strategy_in,
+    hls::stream& strm_nzeros,
+    hls::stream& strm_predict_nzeros,
+    hls::stream& strm_strategy_out,
+    hls::stream& strm_strategy_out2,
+    hls::stream& strm_histo_offset,
+    hls::stream& strm_nzero_out,
+    hls::stream >& strm_token_nz) {
+#pragma HLS INLINE off
+
+    // global config
+    int hls_strategy;
+    uint8_t dc_idx;
+    int32_t hls_qf;
+
+TOKENIZE_NZ_LOOP:
+    for (uint32_t by = 0; by < ysize_blocks; ++by) {
+        for (uint32_t bx = 0; bx < xsize_blocks; ++bx) {
+            for (int i = 0; i < 3; i++) {
+#pragma HLS PIPELINE II = 1
+
+                // only read 1 strategy per block
+                if (i == 0) {
+                    // strategy
+                    hls_strategy = strm_strategy_in.read();
+                    strm_strategy_out.write(hls_strategy);
+                    strm_strategy_out2.write(hls_strategy);
+
+                    // qdc & qf
+                    dc_idx = strm_qdc.read();
+                    hls_qf = strm_qf.read();
+                }
+
+                bool hls_isFirstBlock = hls_Is_FirstBlock(by, bx, hls_strategy);
+
+                if (hls_isFirstBlock) {
+                    int32_t nzeros = strm_nzeros.read();
+
+                    strm_nzero_out.write(nzeros);
+
+                    int32_t predicted_nzeros = strm_predict_nzeros.read();
+
+                    //=================Move this block_ctx calculation into an independent process======
+                    int ord = hls_kStrategyOrder[hls_strategy];
+                    int c = hls_dim_sanf_order(i);
+                    uint32_t block_ctx = hls_Context(ctx_map, qf_thresholds, qf_thresholds_size, kNumOrders,
+                                                     num_dc_ctxs, dc_idx, hls_qf, ord, c);
+                    //==================================================================================
+
+                    int32_t nzero_ctx = hls_NonZeroContext(num_ctxs, predicted_nzeros, block_ctx);
+
+                    uint32_t histo_offset = hls_ZeroDensityContextsOffset(num_ctxs, block_ctx);
+                    strm_histo_offset.write(histo_offset);
+
+                    ap_uint<65> token_nz_reg;
+                    token_nz_reg.range(31, 0) = (uint32_t)nzeros;
+                    token_nz_reg.range(63, 32) = (uint32_t)nzero_ctx;
+                    token_nz_reg[64] = 0;
+                    strm_token_nz.write(token_nz_reg);
+                }
+            }
+        }
+    }
+}
+
+// Emits one token per coefficient of every transform until its non-zeros are exhausted.
+//
+// For each first block, in each channel pass, the context offset and the non-zero count
+// are read, then all `size` coefficients are read from strm_coeff_ordered. The first
+// covered_blocks coefficients (LLF) are skipped. For the rest, while non-zeros remain, a
+// 65-bit token is written to strm_token_ac: bits 31..0 = hls_PackSigned(coeff),
+// bits 63..32 = histo_offset + hls_ZeroDensityContext(), bit 64 = 0. The remaining
+// coefficients are still read (to drain the stream) but produce no token. A block-end word
+// (bit 64 = 1, payload 0) closes every transform.
+void hls_tokenize_ac(int xsize_blocks,
+                     int ysize_blocks,
+                     hls::stream& strm_coeff_ordered,
+                     hls::stream& strm_strategy_in,
+                     hls::stream& strm_histo_offset,
+                     hls::stream& strm_nzeros_tokenAc,
+                     hls::stream >& strm_token_ac) {
+#pragma HLS INLINE off
+    // global variable
+    int hls_block_offset = 0;
+    ap_uint<64> token_reg;
+    ap_uint<64> token_reg_out;
+    uint32_t offset[3] = {};
+    int hls_strategy;
+
+TOKENIZE_AC_OUTTER_LOOP:
+    for (uint32_t by = 0; by < ysize_blocks; ++by) {
+        for (uint32_t bx = 0; bx < xsize_blocks; ++bx) {
+            for (int i = 0; i < 3; i++) {
+                if (i == 0) {
+                    hls_strategy = strm_strategy_in.read();
+                }
+
+                int c = hls_dim_sanf_order(i);
+
+                bool hls_isFirstBlock = hls_Is_FirstBlock(by, bx, hls_strategy);
+
+                if (hls_isFirstBlock) {
+                    uint32_t cx = covered_blocks_x[hls_strategy]; // lut
+                    uint32_t cy = covered_blocks_y[hls_strategy];
+                    const uint32_t covered_blocks = cx * cy; // = #LLF coefficients
+                    uint32_t log2_covered_blocks = hls_covered_block_lut(covered_blocks);
+                    uint32_t size = covered_blocks * kDCTBlockSize;
+                    uint32_t histo_offset = strm_histo_offset.read();
+                    int32_t nzeros = strm_nzeros_tokenAc.read();
+
+                    // Skip LLF.
+                    int32_t prev = (nzeros > (int32_t)(size / 16) ? 0 : 1);
+                TOKENIZE_AC_INNER_LOOP:
+                    for (int32_t k = 0; k < size; ++k) {
+#pragma HLS PIPELINE II = 1
+                        int32_t coeff = strm_coeff_ordered.read();
+                        if (k >= covered_blocks) {
+                            uint32_t ctx = histo_offset +
+                                           hls_ZeroDensityContext(nzeros, k, covered_blocks, log2_covered_blocks, prev);
+
+                            uint32_t u_coeff = hls_PackSigned(coeff);
+
+                            if (nzeros > 0) {
+                                ap_uint<65> token_ac_reg;
+                                token_ac_reg.range(31, 0) = (uint32_t)u_coeff;
+                                token_ac_reg.range(63, 32) = (uint32_t)ctx;
+                                token_ac_reg[64] = 0; // block_end
+                                strm_token_ac.write(token_ac_reg);
+
+                                prev = coeff != 0;
+                                nzeros -= prev;
+                            }
+                        }
+                    }
+
+                    // end of a block
+                    ap_uint<65> token_ac_reg;
+                    token_ac_reg.range(63, 0) = 0;
+                    token_ac_reg[64] = 1; // block_end
+                    strm_token_ac.write(token_ac_reg);
+
+                    // offset
+                    offset[c] += size;
+                }
+            }
+        }
+    }
+}
+
+// Dataflow region that tokenizes one group of blocks.
+// Chains hls_count_nz -> hls_tokenize_nz -> hls_tokenize_ac -> hls_collect_syn through
+// local streams. The merged tokens are written to strm_token_out and strm_token_internal.
+//
+// The array bounds of ctx_map / qf_thresholds match hls_tokenize_nz, which receives the
+// same arrays (they were declared with the two size macros swapped).
+void hls_ac_tokenize_core(int rect_x0,
+                          int rect_y0,
+                          int xsize_blocks,
+                          int ysize_blocks,
+                          int num_ctxs,
+                          int num_dc_ctxs,
+                          int qf_thresholds_size,
+                          int nzeros_stride,
+                          uint8_t ctx_map[MAX_CTX_MAP_SIZE],
+                          uint32_t qf_thresholds[MAX_QF_THRESH_SIZE],
+                          hls::stream& strm_ac_coeff,
+                          hls::stream& strm_strategy,
+                          hls::stream& strm_qf,
+                          hls::stream& strm_qdc,
+                          hls::stream >& strm_token_internal,
+                          hls::stream >& strm_token_out) {
+#pragma HLS DATAFLOW
+    hls::stream strm_ac_coeff_nz;
+    hls::stream strm_nzeros;
+    hls::stream strm_predict_nzeros;
+    hls::stream strm_strategy_0_1;
+    hls::stream strm_ac_ordered_0;
+    hls_count_nz(
+        // config
+        ysize_blocks, xsize_blocks, nzeros_stride,
+        // stream_in
+        strm_ac_coeff, strm_strategy, strm_ac_coeff_nz,
+        // stream_out
+        strm_strategy_0_1, strm_nzeros, strm_predict_nzeros);
+
+    // hls::stream strm_block_ctx;
+    // hls_block_context(rect_x0, rect_y0, ysize_blocks, xsize_blocks, num_ctxs, num_dc_ctxs, qf_thresholds_size,
+    // ctx_map,
+    // qf_thresholds, strm_qf, strm_qdc, strm_strategy_in, strm_strategy_out, strm_block_ctx);
+
+    hls::stream strm_strategy_1;
+    hls::stream strm_strategy_1_2;
+    hls::stream, 16> strm_token_nz;
+    hls::stream strm_histo_offset;
+    hls::stream strm_nzeros2;
+    hls_tokenize_nz(
+        // config
+        rect_x0, rect_y0, ysize_blocks, xsize_blocks, num_ctxs, num_dc_ctxs, qf_thresholds_size, ctx_map, qf_thresholds,
+        // stream_in
+        strm_qf, strm_qdc, strm_strategy_0_1, strm_nzeros, strm_predict_nzeros,
+        // stream_out
+        strm_strategy_1, strm_strategy_1_2, strm_histo_offset, strm_nzeros2, strm_token_nz);
+
+    hls::stream, 16> strm_token_ac;
+    hls_tokenize_ac(
+        // config
+        xsize_blocks, ysize_blocks,
+        // stream_in
+        strm_ac_coeff_nz, strm_strategy_1_2, strm_histo_offset, strm_nzeros2,
+        // stream_out
+        strm_token_ac);
+
+    hls_collect_syn(
+        // config
+        xsize_blocks, ysize_blocks,
+        // stream_in
+        strm_strategy_1, strm_token_nz, strm_token_ac,
+        // stream_out
+        strm_token_out, strm_token_internal);
+}
+
+// Streams the raster-order coefficients of every group from memory.
+// Reads group_dim, pixel_xsize and pixel_ysize (in that order) from strm_config, walks the
+// groups row-major, and for each group forwards xsize_blocks * ysize_blocks *
+// kDCTBlockSize * 3 consecutive words of ac_coeff_ddr, where the block counts are the
+// group's size in 8x8 blocks clipped at the image edge.
+void load_ac_raster_by_group(hls::stream& strm_config,
+                             int32_t* ac_coeff_ddr,
+                             hls::stream& strm_ac_coeff_raster) {
+#pragma HLS INLINE off
+
+    int group_dim = strm_config.read();
+    int pixel_xsize = strm_config.read();
+    int pixel_ysize = strm_config.read();
+
+    int xsize_groups = (pixel_xsize + group_dim - 1) / group_dim;
+    int ysize_groups = (pixel_ysize + group_dim - 1) / group_dim;
+    int num_groups = xsize_groups * ysize_groups;
+
+    uint64_t group_offset = 0;
+
+LOAD_AC_RASTER_OUTTER_LOOP:
+    for (int group_index = 0; group_index < num_groups; group_index++) {
+        // paras-calculated
+        int gx = group_index % xsize_groups;
+        int gy = group_index / xsize_groups;
+        int hls_x0 = gx * (group_dim >> 3);
+        int hls_y0 = gy * (group_dim >> 3);
+        // rect xsize_blocks& ysize_blocks
+        int size_max = group_dim >> 3;
+        int hls_xsize_blocks = (pixel_xsize + kBlockDim - 1) / kBlockDim;
+        int hls_ysize_blocks = (pixel_ysize + kBlockDim - 1) / kBlockDim;
+        int rect_xsize_blocks = (hls_x0 + size_max <= hls_xsize_blocks)
+                                    ? size_max
+                                    : (hls_xsize_blocks > hls_x0 ? hls_xsize_blocks - hls_x0 : 0);
+        int rect_ysize_blocks = (hls_y0 + size_max <= hls_ysize_blocks)
+                                    ? size_max
+                                    : (hls_ysize_blocks > hls_y0 ? hls_ysize_blocks - hls_y0 : 0);
+        // calculate core-config
+        int rect_x0 = hls_x0;
+        int rect_y0 = hls_y0;
+        int xsize_blocks = rect_xsize_blocks;
+        int ysize_blocks = rect_ysize_blocks;
+
+    // loading ac_coeff by group
+    LOAD_AC_RASTER_INNER_LOOP:
+        for (int k = 0; k < xsize_blocks * ysize_blocks * kDCTBlockSize * 3; k++) {
+#pragma HLS PIPELINE II = 1
+            int32_t ac_coef_reg = ac_coeff_ddr[k + group_offset];
+            strm_ac_coeff_raster.write(ac_coef_reg);
+        }
+
+        // move to next group set
+        group_offset += ysize_blocks * xsize_blocks * kDCTBlockSize * 3;
+    }
+}
+
+// Streams the ordered coefficients of every group from memory.
+// Same configuration words, group walk and per-group word count as
+// load_ac_raster_by_group, reading from ac_coeff_ordered_ddr.
+void load_ac_ordered_by_group(hls::stream& strm_config,
+                              int32_t* ac_coeff_ordered_ddr,
+                              hls::stream& strm_ac_coeff0) {
+#pragma HLS INLINE off
+
+    int group_dim = strm_config.read();
+    int pixel_xsize = strm_config.read();
+    int pixel_ysize = strm_config.read();
+
+    int xsize_groups = (pixel_xsize + group_dim - 1) / group_dim;
+    int ysize_groups = (pixel_ysize + group_dim - 1) / group_dim;
+    int num_groups = xsize_groups * ysize_groups;
+
+    uint64_t group_offset = 0;
+
+LOAD_AC_ORDERED_OUTTER_LOOP:
+    for (int group_index = 0; group_index < num_groups; group_index++) {
+        // paras-calculated
+        int gx = group_index % xsize_groups;
+        int gy = group_index / xsize_groups;
+        int hls_x0 = gx * (group_dim >> 3);
+        int hls_y0 = gy * (group_dim >> 3);
+        // rect xsize_blocks& ysize_blocks
+        int size_max = group_dim >> 3;
+        int hls_xsize_blocks = (pixel_xsize + kBlockDim - 1) / kBlockDim;
+        int hls_ysize_blocks = (pixel_ysize + kBlockDim - 1) / kBlockDim;
+        int rect_xsize_blocks = (hls_x0 + size_max <= hls_xsize_blocks)
+                                    ? size_max
+                                    : (hls_xsize_blocks > hls_x0 ? hls_xsize_blocks - hls_x0 : 0);
+        int rect_ysize_blocks = (hls_y0 + size_max <= hls_ysize_blocks)
+                                    ? size_max
+                                    : (hls_ysize_blocks > hls_y0 ? hls_ysize_blocks - hls_y0 : 0);
+        // calculate core-config
+        int rect_x0 = hls_x0;
+        int rect_y0 = hls_y0;
+        int xsize_blocks = rect_xsize_blocks;
+        int ysize_blocks = rect_ysize_blocks;
+
+    // loading ac_coeff by group
+    LOAD_AC_ORDERED_INNER_LOOP:
+        for (int k = 0; k < xsize_blocks * ysize_blocks * kDCTBlockSize * 3; k++) {
+#pragma HLS PIPELINE II = 1
+            int32_t ac_coef_ordered_reg = ac_coeff_ordered_ddr[k + group_offset];
+            strm_ac_coeff0.write(ac_coef_ordered_reg);
+        }
+
+        // move to next group set
+        group_offset += ysize_blocks * xsize_blocks * kDCTBlockSize * 3;
+    }
+}
+
+// Streams one strategy value per block position of every group from memory.
+// Same configuration words and group walk as the coefficient loaders; each group
+// contributes xsize_blocks * ysize_blocks consecutive words of strategy_ddr.
+void load_ac_strategy_by_group(hls::stream& strm_config,
+                               int32_t* strategy_ddr,
+                               hls::stream& strm_strategy) {
+#pragma HLS INLINE off
+
+    // local calculated
+    int group_dim = strm_config.read();
+    int pixel_xsize = strm_config.read();
+    int pixel_ysize = strm_config.read();
+
+    // pre-process
+    int xsize_groups = (pixel_xsize + group_dim - 1) / group_dim;
+    int ysize_groups = (pixel_ysize + group_dim - 1) / group_dim;
+    int num_groups = xsize_groups * ysize_groups;
+
+    uint64_t group_offset = 0;
+
+LOAD_AC_STRATEGY_OUTTER_LOOP:
+    for (int group_index = 0; group_index < num_groups; group_index++) {
+        // paras-calculated
+        int gx = group_index % xsize_groups;
+        int gy = group_index / xsize_groups;
+        int hls_x0 = gx * (group_dim >> 3);
+        int hls_y0 = gy * (group_dim >> 3);
+        // rect xsize_blocks& ysize_blocks
+        int size_max = group_dim >> 3;
+        int hls_xsize_blocks = (pixel_xsize + kBlockDim - 1) / kBlockDim;
+        int hls_ysize_blocks = (pixel_ysize + kBlockDim - 1) / kBlockDim;
+        int rect_xsize_blocks = (hls_x0 + size_max <= hls_xsize_blocks)
+                                    ? size_max
+                                    : (hls_xsize_blocks > hls_x0 ? hls_xsize_blocks - hls_x0 : 0);
+        int rect_ysize_blocks = (hls_y0 + size_max <= hls_ysize_blocks)
+                                    ? size_max
+                                    : (hls_ysize_blocks > hls_y0 ? hls_ysize_blocks - hls_y0 : 0);
+        // calculate core-config
+        int xsize_blocks = rect_xsize_blocks;
+        int ysize_blocks = rect_ysize_blocks;
+
+    // loading strategy by group
+    LOAD_AC_STRATEGY_INNER_LOOP:
+        for (int k = 0; k < xsize_blocks * ysize_blocks; k++) {
+#pragma HLS PIPELINE II = 1
+            int32_t strategy = strategy_ddr[k + group_offset];
+            strm_strategy.write(strategy);
+        }
+
+        // move to next group set
+        group_offset = group_offset + xsize_blocks * ysize_blocks;
+    }
+}
+
+// Streams one dc index per block position of every group from memory.
+// Same configuration words and group walk as the other loaders; each group contributes
+// xsize_blocks * ysize_blocks consecutive bytes of qdc_ddr.
+void load_qdc_by_group(hls::stream& strm_config, uint8_t* qdc_ddr, hls::stream& strm_qdc) {
+#pragma HLS INLINE off
+
+    int group_dim = strm_config.read();
+    int pixel_xsize = strm_config.read();
+    int pixel_ysize = strm_config.read();
+
+    int xsize_groups = (pixel_xsize + group_dim - 1) / group_dim;
+    int ysize_groups = (pixel_ysize + group_dim - 1) / group_dim;
+    int num_groups = xsize_groups * ysize_groups;
+
+    uint64_t group_offset = 0;
+LOAD_QDC_OUTTER_LOOP:
+    for (int group_index = 0; group_index < num_groups; group_index++) {
+        // paras-calculated
+        int gx = group_index % xsize_groups;
+        int gy = group_index / xsize_groups;
+        int hls_x0 = gx * (group_dim >> 3);
+        int hls_y0 = gy * (group_dim >> 3);
+        // rect xsize_blocks& ysize_blocks
+        int size_max = group_dim >> 3;
+        int hls_xsize_blocks = (pixel_xsize + kBlockDim - 1) / kBlockDim;
+        int hls_ysize_blocks = (pixel_ysize + kBlockDim - 1) / kBlockDim;
+        int rect_xsize_blocks = (hls_x0 + size_max <= hls_xsize_blocks)
+                                    ? size_max
+                                    : (hls_xsize_blocks > hls_x0 ? hls_xsize_blocks - hls_x0 : 0);
+        int rect_ysize_blocks = (hls_y0 + size_max <= hls_ysize_blocks)
+                                    ? size_max
+                                    : (hls_ysize_blocks > hls_y0 ? hls_ysize_blocks - hls_y0 : 0);
+        // calculate core-config
+        int xsize_blocks = rect_xsize_blocks;
+        int ysize_blocks = rect_ysize_blocks;
+
+    LOAD_QDC_INNER_LOOP:
+        for (int by = 0; by < ysize_blocks; by++) {
+            for (int bx = 0; bx < xsize_blocks; bx++) {
+#pragma HLS PIPELINE II = 1
+                int32_t dc_idx = qdc_ddr[bx + by * xsize_blocks + group_offset];
+
+                strm_qdc.write(dc_idx);
+            }
+        }
+
+        group_offset += ysize_blocks * xsize_blocks;
+    }
+}
+
+// Streams one quant-field value per block position of every group from memory.
+// Same configuration words and group walk as the other loaders; each group contributes
+// xsize_blocks * ysize_blocks consecutive words of qf_ddr.
+void load_qf_by_group(hls::stream& strm_config, int32_t* qf_ddr, hls::stream& strm_qf) {
+#pragma HLS INLINE off
+
+    int group_dim = strm_config.read();
+    int pixel_xsize = strm_config.read();
+    int pixel_ysize = strm_config.read();
+
+    int xsize_groups = (pixel_xsize + group_dim - 1) / group_dim;
+    int ysize_groups = (pixel_ysize + group_dim - 1) / group_dim;
+    int num_groups = xsize_groups * ysize_groups;
+
+    uint64_t group_offset = 0;
+LOAD_QF_OUTTER_LOOP:
+    for (int group_index = 0; group_index < num_groups; group_index++) {
+        // paras-calculated
+        int gx = group_index % xsize_groups;
+        int gy = group_index / xsize_groups;
+        int hls_x0 = gx * (group_dim >> 3);
+        int hls_y0 = gy * (group_dim >> 3);
+        // rect xsize_blocks& ysize_blocks
+        int size_max = group_dim >> 3;
+        int hls_xsize_blocks = (pixel_xsize + kBlockDim - 1) / kBlockDim;
+        int hls_ysize_blocks = (pixel_ysize + kBlockDim - 1) / kBlockDim;
+        int rect_xsize_blocks = (hls_x0 + size_max <= hls_xsize_blocks)
+                                    ? size_max
+                                    : (hls_xsize_blocks > hls_x0 ? hls_xsize_blocks - hls_x0 : 0);
+        int rect_ysize_blocks = (hls_y0 + size_max <= hls_ysize_blocks)
+                                    ? size_max
+                                    : (hls_ysize_blocks > hls_y0 ? hls_ysize_blocks - hls_y0 : 0);
+        // calculate core-config
+        int xsize_blocks = rect_xsize_blocks;
+        int ysize_blocks = rect_ysize_blocks;
+
+    LOAD_QF_INNER_LOOP:
+        for (int by = 0; by < ysize_blocks; by++) {
+            for (int bx = 0; bx < xsize_blocks; bx++) {
+#pragma HLS PIPELINE II = 1
+                // Keep the full 32-bit value: the source buffer is int32_t and the consumer
+                // (hls_tokenize_nz) reads qf as int32_t, so an 8-bit temporary would
+                // silently wrap values above 255.
+                int32_t hls_qf = qf_ddr[bx + by * xsize_blocks + group_offset];
+                strm_qf.write(hls_qf);
+            }
+        }
+
+        group_offset += ysize_blocks * xsize_blocks;
+    }
+}
+
+// Copies tokens from strm_token_out to consecutive entries of ac_tokens_ddr.
+// The word with bit 63 set is written too and ends the copy.
+void ac_token_writeout(uint64_t* ac_tokens_ddr, hls::stream >& strm_token_out) {
+#pragma HLS INLINE off
+
+    bool token_stream_end = 0;
+    uint64_t idx_token = 0;
+
+AC_TOKEN_WRITEOUT_LOOP:
+    while (!token_stream_end) {
+#pragma HLS PIPELINE II = 1
+        ap_uint<64> token_reg = strm_token_out.read();
+        ac_tokens_ddr[idx_token] = token_reg;
+
+        token_stream_end = token_reg[63];
+        idx_token++;
+    }
+}
+
+// Tokenizes the coefficients of a whole image, group by group.
+//
+// Seven configuration words are read from strm_global_config, in this order: group_dim,
+// pixel_xsize, pixel_ysize, qf_thresholds_size, num_ctxs, num_dc_ctxs, nzeros_stride.
+// Groups are walked row-major; each runs hls_ac_tokenize_core on its block rectangle
+// (clipped at the image edge) and is followed by a group-end word on strm_token_out
+// (bit 62 set, everything else 0). After the last group a terminator word (bit 63 set,
+// everything else 0) is written to strm_token_out and to strm_token_internal.
+//
+// hls_qf_thresholds is bounded by MAX_QF_THRESH_SIZE, matching hls_tokenize_nz.
+void hls_TokenizeCoefficients(
+    // bram
+    uint8_t hls_ctx_map[MAX_CTX_MAP_SIZE],
+    uint32_t hls_qf_thresholds[MAX_QF_THRESH_SIZE],
+    // strm input
+    hls::stream& strm_global_config,
+    // size of pixel
+    hls::stream& strm_ac_coeff_ordered,
+    // size of blk_num
+    hls::stream& strm_strategy,
+    hls::stream& strm_qf,
+    hls::stream& strm_qdc,
+    // size of bram
+    hls::stream >& strm_token_internal,
+    // output
+    hls::stream >& strm_token_out
+
+    ) {
+#pragma HLS INLINE off
+
+    // global config
+    int group_dim = strm_global_config.read();
+    int pixel_xsize = strm_global_config.read();
+    int pixel_ysize = strm_global_config.read();
+    int qf_thresholds_size = strm_global_config.read();
+    int num_ctxs = strm_global_config.read();
+    int num_dc_ctxs = strm_global_config.read();
+    int nzeros_stride = strm_global_config.read();
+
+    int xsize_groups = (pixel_xsize + group_dim - 1) / group_dim;
+    int ysize_groups = (pixel_ysize + group_dim - 1) / group_dim;
+    int num_groups = xsize_groups * ysize_groups;
+
+    // global variable
+    // Zero-initialised so the terminator below is fully defined even when num_groups is 0.
+    ap_uint<64> token_reg_out = 0;
+
+TOKENIZE_COEFF_LOOP:
+    for (int group_index = 0; group_index < num_groups; group_index++) {
+        // paras-calculated
+        int gx = group_index % xsize_groups;
+        int gy = group_index / xsize_groups;
+        int hls_x0 = gx * (group_dim >> 3);
+        int hls_y0 = gy * (group_dim >> 3);
+        int size_max = group_dim >> 3;
+        int hls_xsize_blocks = (pixel_xsize + kBlockDim - 1) / kBlockDim;
+        int hls_ysize_blocks = (pixel_ysize + kBlockDim - 1) / kBlockDim;
+        int xsize_blocks = (hls_x0 + size_max <= hls_xsize_blocks)
+                               ? size_max
+                               : (hls_xsize_blocks > hls_x0 ? hls_xsize_blocks - hls_x0 : 0);
+        int ysize_blocks = (hls_y0 + size_max <= hls_ysize_blocks)
+                               ? size_max
+                               : (hls_ysize_blocks > hls_y0 ? hls_ysize_blocks - hls_y0 : 0);
+
+        hls_ac_tokenize_core(hls_x0, hls_y0, xsize_blocks, ysize_blocks, num_ctxs, num_dc_ctxs, qf_thresholds_size,
+                             nzeros_stride, hls_ctx_map, hls_qf_thresholds, strm_ac_coeff_ordered, strm_strategy,
+                             strm_qf, strm_qdc, strm_token_internal, strm_token_out);
+
+        // post-process
+        token_reg_out(61, 0) = 0;
+        token_reg_out[62] = 1;
+        token_reg_out[63] = 0;
+        strm_token_out.write(token_reg_out);
+    }
+
+    token_reg_out[62] = 0;
+    token_reg_out[63] = 1;
+    strm_token_out.write(token_reg_out);
+
+    ap_uint<64> token_reg;
+    token_reg(62, 0) = 0;
+    token_reg[63] = 1;
+    strm_token_internal.write(token_reg);
+}
+
+//=====================================================================================================//
+// hls_init_histogram.cpp
+//=====================================================================================================//
+// Stores the histogram results in the config array.
+// Five words are read from strm_largest_idx into config[17..21], then five words from
+// strm_nempty_cnt into config[22..26]. All largest-index words are consumed before the
+// first non-empty count is read.
+void hls_largest_And_empty_write_out(int config[32],
+                                     hls::stream& strm_nempty_cnt,
+                                     hls::stream& strm_largest_idx
+
+                                     ) {
+WRITE_LARGEST_IDX_LOOP:
+    for (int i = 17; i < 22; i++) {
+#pragma HLS PIPELINE II = 1
+        uint32_t largest_reg = strm_largest_idx.read();
+        config[i] = largest_reg;
+    }
+
+WRITE_NEMPTY_CNT_LOOP:
+    for (int i = 22; i < 27; i++) {
+#pragma HLS PIPELINE II = 1
+        uint32_t nempty_cnt = strm_nempty_cnt.read();
+        config[i] = nempty_cnt;
+    }
+}
+
+void load_config(int config[32],
+                 hls::stream& strm_config_2,
hls::stream& strm_config_3, + hls::stream& strm_config_4, + hls::stream& strm_config_5, + hls::stream& strm_global_config, + hls::stream& strm_do_once, + hls::stream& strm_do_once_0, + hls::stream& strm_do_once_1, + hls::stream& strm_do_once_2, + hls::stream& strm_do_once_3) { + strm_config_2.write(config[4]); + strm_config_2.write(config[5]); + strm_config_2.write(config[6]); + + strm_config_3.write(config[4]); + strm_config_3.write(config[5]); + strm_config_3.write(config[6]); + + strm_config_4.write(config[4]); + strm_config_4.write(config[5]); + strm_config_4.write(config[6]); + + strm_config_5.write(config[4]); + strm_config_5.write(config[5]); + strm_config_5.write(config[6]); + + strm_global_config.write(config[4]); + strm_global_config.write(config[5]); + strm_global_config.write(config[6]); + strm_global_config.write(config[9]); + strm_global_config.write(config[7]); + strm_global_config.write(config[8]); + strm_global_config.write(config[10]); + + strm_do_once_0.write(config[12]); + strm_do_once_1.write(config[13]); + strm_do_once_2.write(config[14]); + strm_do_once_3.write(config[15]); + + strm_do_once.write(config[12]); + strm_do_once.write(config[13]); + strm_do_once.write(config[14]); + strm_do_once.write(config[15]); + strm_do_once.write(config[16]); +} + +void load_bram( + // host config + int config[32], + uint8_t ctx_map[MAX_QF_THRESH_SIZE], + uint32_t qf_thresholds[MAX_CTX_MAP_SIZE], + uint8_t hls_ctx_map[MAX_CTX_MAP_SIZE], + uint32_t hls_qf_thresholds[MAX_QF_THRESH_SIZE]) { + // load size config + int ctx_map_size = config[11]; + int qf_threshold_size = config[9]; + +// loading into bram +LOAD_CTX_MAP_LOOP: + for (int i = 0; i < ctx_map_size; i++) { +#pragma HLS PIPELINE II = 1 + hls_ctx_map[i] = ctx_map[i]; + } + +LOAD_QF_THRESHOLDS_LOOP: + for (int i = 0; i < qf_threshold_size; i++) { +#pragma HLS PIPELINE II = 1 + hls_qf_thresholds[i] = qf_thresholds[i]; + } +} + +void hls_ANSinitHistogram_core(hls::stream& strm_global_config, + hls::stream& 
strm_config_2, + hls::stream& strm_config_3, + hls::stream& strm_config_4, + hls::stream& strm_config_5, + hls::stream& strm_do_once_0, + hls::stream& strm_do_once_1, + hls::stream& strm_do_once_2, + hls::stream& strm_do_once_3, + hls::stream& strm_do_once, + + uint8_t hls_ctx_map[MAX_CTX_MAP_SIZE], + uint32_t hls_qf_thresholds[MAX_QF_THRESH_SIZE], + // ac_coef_ordered_ddr + int32_t ac_coeff_ordered_ddr[ALL_PIXEL], + // ac_strategy ddr + int32_t strategy_ddr[MAX_NUM_BLK88], + // qf ddr + int32_t qf_ddr[MAX_NUM_BLK88], + // qdc ddr + uint8_t qdc_ddr[MAX_NUM_BLK88], + // ctx_map ddr + uint8_t ctx_map[MAX_QF_THRESH_SIZE], // + // quant field threshold + uint32_t qf_thresholds[MAX_CTX_MAP_SIZE], // + // ac_token_output + uint64_t ac_tokens_ddr[MAX_AC_TOKEN_SIZE], + + ap_uint<64>* tokens0_ptr, + ap_uint<64>* tokens1_ptr, + ap_uint<64>* tokens2_ptr, + ap_uint<64>* tokens3_ptr, + hls::stream& strm_nempty_cnt, + hls::stream& strm_largest_idx, + + int32_t* histograms0_ptr, + uint32_t* histograms_size0_ptr, + uint32_t* total_count0_ptr, + uint32_t* nonempty0_ptr, + + int32_t* histograms1_ptr, + uint32_t* histograms_size1_ptr, + uint32_t* total_count1_ptr, + uint32_t* nonempty1_ptr, + + int32_t* histograms2_ptr, + uint32_t* histograms_size2_ptr, + uint32_t* total_count2_ptr, + uint32_t* nonempty2_ptr, + + int32_t* histograms3_ptr, + uint32_t* histograms_size3_ptr, + uint32_t* total_count3_ptr, + uint32_t* nonempty3_ptr, + + int32_t* histograms4_ptr, + uint32_t* histograms_size4_ptr, + uint32_t* total_count4_ptr, + uint32_t* nonempty4_ptr) { +#pragma HLS DATAFLOW + + //================================== core ==========================================// + hls::stream strm_ac_coeff0; + hls::stream strm_ac_coeff1; + hls::stream strm_strategy; + hls::stream strm_qf; + hls::stream strm_qdc; + hls::stream, 16> token_stream0; + hls::stream, 16> token_stream1; + hls::stream, 16> token_stream2; + hls::stream, 16> token_stream3; + load_token(strm_do_once_0, tokens0_ptr, token_stream0); 
+ load_token(strm_do_once_1, tokens1_ptr, token_stream1); + load_token(strm_do_once_2, tokens2_ptr, token_stream2); + load_token(strm_do_once_3, tokens3_ptr, token_stream3); + load_ac_ordered_by_group(strm_config_2, ac_coeff_ordered_ddr, strm_ac_coeff0); + load_ac_strategy_by_group(strm_config_3, strategy_ddr, strm_strategy); + load_qf_by_group(strm_config_4, qf_ddr, strm_qf); + load_qdc_by_group(strm_config_5, qdc_ddr, strm_qdc); + + hls::stream, 16> token_stream_internal; + hls::stream, 16> strm_token_out; + hls_TokenizeCoefficients(hls_ctx_map, hls_qf_thresholds, strm_global_config, strm_ac_coeff0, strm_strategy, strm_qf, + strm_qdc, token_stream_internal, strm_token_out); + + init_histogram_top(strm_do_once, token_stream0, token_stream1, token_stream2, token_stream3, token_stream_internal, + strm_nempty_cnt, strm_largest_idx, histograms0_ptr, histograms_size0_ptr, total_count0_ptr, + nonempty0_ptr, + + histograms1_ptr, histograms_size1_ptr, total_count1_ptr, nonempty1_ptr, + + histograms2_ptr, histograms_size2_ptr, total_count2_ptr, nonempty2_ptr, + + histograms3_ptr, histograms_size3_ptr, total_count3_ptr, nonempty3_ptr, + + histograms4_ptr, histograms_size4_ptr, total_count4_ptr, nonempty4_ptr); + + ac_token_writeout(ac_tokens_ddr, strm_token_out); +} + +namespace xf { +namespace codec { + +/** +* @brief JXL ANS init Histogram kernel +* +* @param config configuration for the kernel. +* @param ac_coef_ordered_ddr ac coefficients +* @param strategy_ddr ac strategy +* @param qf_ddr quant field +* @param qdc_ddr qdc +* @param ctx_map ctx_map ddr +* @param qf_thresholds quantfield_thresholds +* @param ac_tokens_ddr the ouput of ac tokens +* @param token0_ptr tokens for Block Context Map +* @param token1_ptr tokens for Modular frame tree +* @param token2_ptr tokens for coef orders +* @param token3_ptr tokens for Modular frames +* @param histograms0_ptr histograms for Block Context Map. 
+* @param histo_totalcnt0_ptr Count of context for histograms for Block Context Map. +* @param histo_size0_ptr size for each context +* @param nonempty_histo0_ptr indicate which context is empty +* @param histograms1_ptr histograms for Modular frame tree. +* @param histo_totalcnt1_ptr Count of context for histograms for Modular frame tree. +* @param histo_size1_ptr size for each context +* @param nonempty_histo1_ptr indicate which context is empty +* @param histograms2_ptr histograms for code from Modular frame. +* @param histo_totalcnt2_ptr Count of context for histograms for Modular frame. +* @param histo_size2_ptr size for each context +* @param nonempty_histo2_ptr indicate which context is empty +* @param histograms3_ptr histograms for coef orders. +* @param histo_totalcnt3_ptr Count of context for histograms for coef orders. +* @param histo_size3_ptr size for each context +* @param nonempty_histo3_ptr indicate which context is empty +* @param histograms4_ptr histograms for ac coefficients. +* @param histo_totalcnt4_ptr Count of context for histograms for ac coefficients. 
+* @param histo_size4_ptr size for each context +* @param nonempty_histo4_ptr indicate which context is empty +*/ + +extern "C" void JxlEnc_ans_initHistogram( + // host config + int config[32], + // ac_coef_ordered_ddr + int32_t ac_coeff_ordered_ddr[ALL_PIXEL], + // ac_strategy ddr + int32_t strategy_ddr[MAX_NUM_BLK88], + // qf ddr + int32_t qf_ddr[MAX_NUM_BLK88], + // qdc ddr + uint8_t qdc_ddr[MAX_NUM_BLK88], + // ctx_map ddr + uint8_t ctx_map[MAX_QF_THRESH_SIZE], + // quant field threshold + uint32_t qf_thresholds[MAX_CTX_MAP_SIZE], + // ac_token_output + uint64_t ac_tokens_ddr[MAX_AC_TOKEN_SIZE], + + ap_uint<64>* tokens0_ptr, + ap_uint<64>* tokens1_ptr, + ap_uint<64>* tokens2_ptr, + ap_uint<64>* tokens3_ptr, + + int32_t* histograms0_ptr, + uint32_t* histograms_size0_ptr, + uint32_t* total_count0_ptr, + uint32_t* nonempty0_ptr, + + int32_t* histograms1_ptr, + uint32_t* histograms_size1_ptr, + uint32_t* total_count1_ptr, + uint32_t* nonempty1_ptr, + + int32_t* histograms2_ptr, + uint32_t* histograms_size2_ptr, + uint32_t* total_count2_ptr, + uint32_t* nonempty2_ptr, + + int32_t* histograms3_ptr, + uint32_t* histograms_size3_ptr, + uint32_t* total_count3_ptr, + uint32_t* nonempty3_ptr, + + int32_t* histograms4_ptr, + uint32_t* histograms_size4_ptr, + uint32_t* total_count4_ptr, + uint32_t* nonempty4_ptr) { +// size of pixel +#pragma HLS INTERFACE mode = m_axi bundle = mm2 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 8 max_write_burst_length = 2 max_read_burst_length = 64 depth = ALL_PIXEL port = \ + ac_coeff_ordered_ddr +// size of num_blk +#pragma HLS INTERFACE mode = m_axi bundle = mm3 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 8 max_write_burst_length = 2 max_read_burst_length = 64 depth = MAX_NUM_BLK88 port = \ + strategy_ddr +#pragma HLS INTERFACE mode = m_axi bundle = mm4 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 8 max_write_burst_length = 2 
max_read_burst_length = 64 depth = MAX_NUM_BLK88 port = \ + qf_ddr +#pragma HLS INTERFACE mode = m_axi bundle = mm5 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 8 max_write_burst_length = 2 max_read_burst_length = 64 depth = MAX_NUM_BLK88 port = \ + qdc_ddr +// size of bram +#pragma HLS INTERFACE mode = m_axi bundle = mm6 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 8 max_write_burst_length = 2 max_read_burst_length = 16 depth = MAX_CTX_MAP_SIZE port = \ + ctx_map +#pragma HLS INTERFACE mode = m_axi bundle = mm6 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 8 max_write_burst_length = 2 max_read_burst_length = 16 depth = MAX_QF_THRESH_SIZE port = \ + qf_thresholds +// config +#pragma HLS INTERFACE mode = m_axi bundle = mm7 latency = 32 offset = slave num_write_outstanding = \ + 1 num_read_outstanding = 8 max_write_burst_length = 2 max_read_burst_length = 16 depth = 32 port = config +// output +#pragma HLS INTERFACE mode = m_axi bundle = mm8 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 1 max_write_burst_length = 64 max_read_burst_length = 2 depth = MAX_AC_TOKEN_SIZE port = \ + ac_tokens_ddr + +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_AC_TOKEN_SIZE port = \ + tokens0_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_AC_TOKEN_SIZE port = \ + tokens1_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_AC_TOKEN_SIZE port = \ + tokens2_ptr +#pragma HLS INTERFACE mode = m_axi 
bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = MAX_AC_TOKEN_SIZE port = \ + tokens3_ptr + +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + nonempty0_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + nonempty1_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + nonempty2_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + nonempty3_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm9 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + nonempty4_ptr + +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 163840 port = \ + histograms0_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 163840 port = \ + histograms1_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 163840 port = 
\ + histograms2_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 163840 port = \ + histograms3_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm10 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 163840 port = \ + histograms4_ptr + +#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + histograms_size0_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + histograms_size1_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + histograms_size2_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + histograms_size3_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm11 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + histograms_size4_ptr + +#pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + total_count0_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ + 8 
num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + total_count1_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + total_count2_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + total_count3_ptr +#pragma HLS INTERFACE mode = m_axi bundle = mm12 latency = 32 offset = slave num_write_outstanding = \ + 8 num_read_outstanding = 8 max_write_burst_length = 64 max_read_burst_length = 64 depth = 4096 port = \ + total_count4_ptr + + //========================== top ==================================================// + hls::stream strm_global_config; + hls::stream strm_config_2; + hls::stream strm_config_3; + hls::stream strm_config_4; + hls::stream strm_config_5; + hls::stream strm_do_once_0("strm_do_once_0"); + hls::stream strm_do_once_1("strm_do_once_1"); + hls::stream strm_do_once_2("strm_do_once_2"); + hls::stream strm_do_once_3("strm_do_once_3"); + hls::stream strm_do_once("strm_do_once"); + load_config(config, strm_config_2, strm_config_3, strm_config_4, strm_config_5, strm_global_config, strm_do_once, + strm_do_once_0, strm_do_once_1, strm_do_once_2, strm_do_once_3); + + uint8_t hls_ctx_map[MAX_CTX_MAP_SIZE]; +#pragma HLS BIND_STORAGE impl = BRAM variable = hls_ctx_map type = ram_s2p + uint32_t hls_qf_thresholds[MAX_QF_THRESH_SIZE]; +#pragma HLS BIND_STORAGE impl = BRAM variable = hls_qf_thresholds type = ram_s2p + load_bram(config, ctx_map, qf_thresholds, hls_ctx_map, hls_qf_thresholds); + + //=============================== core =====================================// + hls::stream strm_nempty_cnt; + hls::stream strm_largest_idx; + hls_ANSinitHistogram_core( + strm_global_config, 
strm_config_2, strm_config_3, strm_config_4, strm_config_5, strm_do_once_0, strm_do_once_1, + strm_do_once_2, strm_do_once_3, strm_do_once, hls_ctx_map, hls_qf_thresholds, + + ac_coeff_ordered_ddr, strategy_ddr, qf_ddr, qdc_ddr, ctx_map, qf_thresholds, ac_tokens_ddr, + + tokens0_ptr, tokens1_ptr, tokens2_ptr, tokens3_ptr, strm_nempty_cnt, strm_largest_idx, + + histograms0_ptr, histograms_size0_ptr, total_count0_ptr, nonempty0_ptr, + + histograms1_ptr, histograms_size1_ptr, total_count1_ptr, nonempty1_ptr, + + histograms2_ptr, histograms_size2_ptr, total_count2_ptr, nonempty2_ptr, + + histograms3_ptr, histograms_size3_ptr, total_count3_ptr, nonempty3_ptr, + + histograms4_ptr, histograms_size4_ptr, total_count4_ptr, nonempty4_ptr); + //======================= larget_And_empty write out =========================// + hls_largest_And_empty_write_out(config, strm_nempty_cnt, strm_largest_idx); + //======================= End of All =========================================// +} + +} // namespace codec +} // namespace xf +#endif diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/postSysLink.tcl b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/postSysLink.tcl new file mode 100644 index 0000000000..2dc2f67034 --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/postSysLink.tcl @@ -0,0 +1 @@ +set_property -dict [list CONFIG.ECC_EN {false} CONFIG.ECC_SCRUB_EN {false}] [get_bd_cells hmss_0] diff --git a/codec/L2/demos/jxlEnc/acc_tokInit_histogram/utils.mk b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/utils.mk new file mode 100644 index 0000000000..0ee80e90da --- /dev/null +++ b/codec/L2/demos/jxlEnc/acc_tokInit_histogram/utils.mk @@ -0,0 +1,270 @@ +# +# Copyright 2019-2022 Xilinx, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# vitis makefile-generator v2.0.6 +# +#+------------------------------------------------------------------------------- +# The following parameters are assigned with default values. These parameters can +# be overridden through the make command line +#+------------------------------------------------------------------------------- + +REPORT := no +PROFILE := no +DEBUG := no + +#'estimate' for estimate report generation +#'system' for system report generation +ifneq ($(REPORT), no) +VPP_LDFLAGS += --report estimate +VPP_LDFLAGS += --report system +endif + +#Generates profile summary report +ifeq ($(PROFILE), yes) +VPP_LDFLAGS += --profile_kernel data:all:all:all +endif + +#Generates debug summary report +ifeq ($(DEBUG), yes) +VPP_LDFLAGS += --dk protocol:all:all:all +endif + +#Check environment setup +ifndef XILINX_VITIS + XILINX_VITIS = /opt/xilinx/Vitis/$(TOOL_VERSION) + export XILINX_VITIS +endif +ifndef XILINX_XRT + XILINX_XRT = /opt/xilinx/xrt + export XILINX_XRT +endif + +check_device: + @set -eu; \ + inallowlist=False; \ + inblocklist=False; \ + for dev in $(PLATFORM_ALLOWLIST); \ + do if [[ $$(echo $(PLATFORM_NAME) | grep $$dev) != "" ]]; \ + then inallowlist=True; fi; \ + done ;\ + for dev in $(PLATFORM_BLOCKLIST); \ + do if [[ $$(echo $(PLATFORM_NAME) | grep $$dev) != "" ]]; \ + then inblocklist=True; fi; \ + done ;\ + if [[ $$inallowlist == False ]]; \ + then echo "[Warning]: The device $(PLATFORM_NAME) not in allowlist."; \ + fi; \ + if [[ $$inblocklist == True ]]; \ + then echo "[ERROR]: The device $(PLATFORM_NAME) in blocklist."; exit 1;\ + 
fi; + +#get HOST_ARCH by PLATFORM +ifneq (,$(PLATFORM)) +HOST_ARCH_temp = $(shell platforminfo -p $(PLATFORM) | grep 'CPU Type' | sed 's/.*://' | sed '/ai_engine/d' | sed 's/^[[:space:]]*//') +ifeq ($(HOST_ARCH_temp), x86) +HOST_ARCH := x86 +else ifeq ($(HOST_ARCH_temp), cortex-a9) +HOST_ARCH := aarch32 +else ifneq (,$(findstring cortex-a, $(HOST_ARCH_temp))) +HOST_ARCH := aarch64 +endif +endif + + +#get suffix of kernel by PLATFORM +VITIS_VER = $(shell v++ --version | grep 'v++' | sed 's/^[[:space:]]*//' | sed -e 's/^[*]* v++ v//g' | cut -d " " -f1) +DEVICE_TYPE = $(shell platforminfo -p $(PLATFORM) | grep 'FPGA Family' | sed 's/.*://' | sed '/ai_engine/d' | sed 's/^[[:space:]]*//') +ifeq ($(DEVICE_TYPE), versal) +ifeq ($(shell expr $(VITIS_VER) \>= 2022.1), 1) +LINK_TARGET_FMT := xsa +else +LINK_TARGET_FMT := xclbin +endif +else +LINK_TARGET_FMT := xclbin +endif + +#Checks for Device Family +ifeq ($(HOST_ARCH), aarch32) + DEV_FAM = 7Series +else ifeq ($(HOST_ARCH), aarch64) + DEV_FAM = Ultrascale +endif + +#Checks for Correct architecture +ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) +$(error HOST_ARCH variable not set, please set correctly and rerun) +endif + +check_version: +ifneq (, $(shell which git)) +ifneq (,$(wildcard $(XFLIB_DIR)/.git)) + @cd $(XFLIB_DIR) && git log --graph --pretty=format:'%Cred%h%Creset -%C(yellow)%d%Creset %s %Cgreen(%cr) %C(bold blue)<%an>%Creset' --abbrev-commit -n 1 && cd - +endif +endif + +#Checks for SYSROOT +check_sysroot: +ifneq ($(HOST_ARCH), x86) +ifndef SYSROOT + $(error SYSROOT ENV variable is not set, please set ENV variable correctly and rerun) +endif +endif + +#Checks for g++ +CXX := g++ +ifeq ($(HOST_ARCH), x86) +ifeq ($(shell expr $(VITIS_VER) \>= 2022.1), 1) +CXX_VER := 8.3.0 +else +CXX_VER := 6.2.0 +endif +CXX_V := $(shell echo $(CXX_VER) | awk -F. 
'{print tolower($$1)}') +ifneq ($(shell expr $(shell echo "__GNUG__" | g++ -E -x c++ - | tail -1) \>= $(CXX_V)), 1) +ifndef XILINX_VIVADO +$(error [ERROR]: g++ version too old. Please use $(CXX_VER) or above) +else +CXX := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/bin/g++ +ifeq ($(LD_LIBRARY_PATH),) +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/lib64 +else +export LD_LIBRARY_PATH := $(XILINX_VIVADO)/tps/lnx64/gcc-$(CXX_VER)/lib64:$(LD_LIBRARY_PATH) +endif +$(warning [WARNING]: g++ version too old. Using g++ provided by the tool: $(CXX)) +endif +endif +else ifeq ($(HOST_ARCH), aarch64) +CXX := $(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++ +else ifeq ($(HOST_ARCH), aarch32) +CXX := $(XILINX_VITIS)/gnu/aarch32/lin/gcc-arm-linux-gnueabi/bin/arm-linux-gnueabihf-g++ +endif + +#Check OS and setting env for xrt c++ api +OSDIST = $(shell lsb_release -i |awk -F: '{print tolower($$2)}' | tr -d ' \t' ) +OSREL = $(shell lsb_release -r |awk -F: '{print tolower($$2)}' |tr -d ' \t') + +# for centos and redhat +ifneq ($(findstring centos,$(OSDIST)),) +ifeq (7,$(shell echo $(OSREL) | awk -F. '{print tolower($$1)}' )) +ifeq ($(HOST_ARCH), x86) +XRT_CXXFLAGS += -D_GLIBCXX_USE_CXX11_ABI=0 +endif +endif +else ifneq ($(findstring redhat,$(OSDIST)),) +ifeq (7,$(shell echo $(OSREL) | awk -F. '{print tolower($$1)}' )) +ifeq ($(HOST_ARCH), x86) +XRT_CXXFLAGS += -D_GLIBCXX_USE_CXX11_ABI=0 +endif +endif +endif + +#Setting VPP +VPP := v++ + +#Cheks for aiecompiler +AIECXX := aiecompiler +AIESIMULATOR := aiesimulator +X86SIMULATOR := x86simulator + +.PHONY: check_vivado +check_vivado: +ifeq (,$(wildcard $(XILINX_VIVADO)/bin/vivado)) + @echo "Cannot locate Vivado installation. Please set XILINX_VIVADO variable." && false +endif + +.PHONY: check_vpp +check_vpp: +ifeq (,$(wildcard $(XILINX_VITIS)/bin/v++)) + @echo "Cannot locate Vitis installation. Please set XILINX_VITIS variable." 
&& false +endif + +.PHONY: check_xrt +check_xrt: +ifeq (,$(wildcard $(XILINX_XRT)/lib/libxilinxopencl.so)) + @echo "Cannot locate XRT installation. Please set XILINX_XRT variable." && false +endif + +export PATH := $(XILINX_VITIS)/bin:$(XILINX_XRT)/bin:$(PATH) +ifeq ($(HOST_ARCH), x86) +ifeq (,$(LD_LIBRARY_PATH)) +LD_LIBRARY_PATH := $(XILINX_XRT)/lib +else +LD_LIBRARY_PATH := $(XILINX_XRT)/lib:$(LD_LIBRARY_PATH) +endif +endif + +ifneq (,$(wildcard $(PLATFORM))) +# Use PLATFORM as a file path +XPLATFORM := $(PLATFORM) +else +# Use PLATFORM as a file name pattern +# 1. search paths specified by variable +ifneq (,$(PLATFORM_REPO_PATHS)) +# 1.1 as exact name +XPLATFORM := $(strip $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/$(PLATFORM)/$(PLATFORM).xpfm))) +# 1.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(foreach p, $(subst :, ,$(PLATFORM_REPO_PATHS)), $(wildcard $(p)/*/*.xpfm)) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 1.2 +endif # 1 +# 2. search Vitis installation +ifeq (,$(XPLATFORM)) +# 2.1 as exact name +XPLATFORM := $(strip $(wildcard $(XILINX_VITIS)/platforms/$(PLATFORM)/$(PLATFORM).xpfm)) +# 2.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(wildcard $(XILINX_VITIS)/platforms/*/*.xpfm) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 2.2 +endif # 2 +# 3. search default locations +ifeq (,$(XPLATFORM)) +# 3.1 as exact name +XPLATFORM := $(strip $(wildcard /opt/xilinx/platforms/$(PLATFORM)/$(PLATFORM).xpfm)) +# 3.2 as a pattern +ifeq (,$(XPLATFORM)) +XPLATFORMS := $(wildcard /opt/xilinx/platforms/*/*.xpfm) +XPLATFORM := $(strip $(foreach p, $(XPLATFORMS), $(shell echo $(p) | awk '$$1 ~ /$(PLATFORM)/'))) +endif # 3.2 +endif # 3 +endif + +define MSG_PLATFORM +No platform matched pattern '$(PLATFORM)'. 
+Available platforms are: $(XPLATFORMS) +To add more platform directories, set the PLATFORM_REPO_PATHS variable or point PLATFORM variable to the full path of platform .xpfm file. +endef +export MSG_PLATFORM + + +.PHONY: check_platform +check_platform: +ifeq (,$(XPLATFORM)) + @echo "$${MSG_PLATFORM}" && false +endif +#Check ends + +# device2xsa - create a filesystem friendly name from device name +# $(1) - full name of device +PLATFORM_NAME = $(strip $(patsubst %.xpfm, % , $(shell basename $(PLATFORM)))) + + +# Cleaning stuff +RM = rm -f +RMDIR = rm -rf + +MV = mv -f +CP = cp -rf +ECHO:= @echo diff --git a/codec/L2/demos/jxlEnc/images/small32x32.png b/codec/L2/demos/jxlEnc/images/small32x32.png new file mode 100644 index 0000000000..e50f46a988 Binary files /dev/null and b/codec/L2/demos/jxlEnc/images/small32x32.png differ diff --git a/codec/L2/demos/jxlEnc/images/t0.png b/codec/L2/demos/jxlEnc/images/t0.png new file mode 100644 index 0000000000..294bbaae40 Binary files /dev/null and b/codec/L2/demos/jxlEnc/images/t0.png differ diff --git a/codec/L2/demos/jxlEnc/images/t1.png b/codec/L2/demos/jxlEnc/images/t1.png new file mode 100644 index 0000000000..3b0012f91f Binary files /dev/null and b/codec/L2/demos/jxlEnc/images/t1.png differ diff --git a/codec/L2/demos/jxlEnc/images/t2.png b/codec/L2/demos/jxlEnc/images/t2.png new file mode 100644 index 0000000000..da8ecb130a Binary files /dev/null and b/codec/L2/demos/jxlEnc/images/t2.png differ diff --git a/codec/L2/demos/jxlEnc/others/include/acc_cluster_histogram.hpp b/codec/L2/demos/jxlEnc/others/include/acc_cluster_histogram.hpp new file mode 100644 index 0000000000..41d50d123e --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/acc_cluster_histogram.hpp @@ -0,0 +1,38 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef ACC_CLUSTER_HISTOGRAM_HPP +#define ACC_CLUSTER_HISTOGRAM_HPP + +#include "acc_phase3.hpp" + +namespace jxl { +void acc_ANSclusterHistogram(bool is_small_image, + bool do_once[5], + char* do_inner, + char* do_prefix_in, + + std::vector& params, + + std::vector >& histograms_, + std::vector& num_contexts, + std::vector*> context_map, + std::vector >& nonempty_histograms, + std::vector& largest_idx, + + std::vector codes, + std::vector >& clustered_histograms, + std::vector >& histogram_symbols, + + std::vector writer, + std::vector layer, + std::vector >& clustered_histogramsin, + std::vector > >& tokensin, + std::vector& codesin, + std::vector >& context_map_in); + +} // namespace jxl + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/acc_common.hpp b/codec/L2/demos/jxlEnc/others/include/acc_common.hpp new file mode 100644 index 0000000000..6fc619cc31 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/acc_common.hpp @@ -0,0 +1,839 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef ACC_COMMON_HPP +#define ACC_COMMON_HPP + +#include "xlnx_cfg.h" + +#include +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" +#include "acc_enc_ac_strategy.hpp" +#include "acc_enc_chroma_from_luma.hpp" +#include 
"acc_enc_group.hpp" + +namespace jxl { +namespace { +void ClusterGroups(PassesEncoderState* enc_state) { + if (enc_state->shared.frame_header.passes.num_passes > 1) { + // TODO(veluca): implement this for progressive modes. + return; + } + // This only considers pass 0 for now. + std::vector context_map; + EntropyEncodingData codes; + auto& ac = enc_state->passes[0].ac_tokens; + size_t limit = std::ceil(std::sqrt(ac.size())); + if (limit == 1) return; + size_t num_contexts = enc_state->shared.block_ctx_map.NumACContexts(); + std::vector costs(ac.size()); + HistogramParams params; + params.uint_method = HistogramParams::HybridUintMethod::kNone; + params.lz77_method = HistogramParams::LZ77Method::kNone; + params.ans_histogram_strategy = HistogramParams::ANSHistogramStrategy::kApproximate; + size_t max = 0; + auto token_cost = [&](std::vector >& tokens, size_t num_ctx, bool estimate = true) { + // TODO(veluca): not estimating is very expensive. + BitWriter writer; + size_t c = BuildAndEncodeHistograms(params, num_ctx, tokens, &codes, &context_map, estimate ? 
nullptr : &writer, + 0, /*aux_out=*/0); + if (estimate) return c; + for (size_t i = 0; i < tokens.size(); i++) { + WriteTokens(tokens[i], codes, context_map, &writer, 0, nullptr); + } + return writer.BitsWritten(); + }; + for (size_t i = 0; i < ac.size(); i++) { + std::vector > tokens{ac[i]}; + costs[i] = token_cost(tokens, enc_state->shared.block_ctx_map.NumACContexts()); + if (costs[i] > costs[max]) { + max = i; + } + } + auto dist = [&](int i, int j) { + std::vector > tokens{ac[i], ac[j]}; + return token_cost(tokens, num_contexts) - costs[i] - costs[j]; + }; + std::vector out{max}; + std::vector old_map(ac.size()); + std::vector dists(ac.size()); + size_t farthest = 0; + for (size_t i = 0; i < ac.size(); i++) { + if (i == max) continue; + dists[i] = dist(max, i); + if (dists[i] > dists[farthest]) { + farthest = i; + } + } + + while (dists[farthest] > 0 && out.size() < limit) { + out.push_back(farthest); + dists[farthest] = 0; + enc_state->histogram_idx[farthest] = out.size() - 1; + for (size_t i = 0; i < ac.size(); i++) { + float d = dist(out.back(), i); + if (d < dists[i]) { + dists[i] = d; + old_map[i] = enc_state->histogram_idx[i]; + enc_state->histogram_idx[i] = out.size() - 1; + } + if (dists[i] > dists[farthest]) { + farthest = i; + } + } + } + + std::vector remap(out.size()); + std::iota(remap.begin(), remap.end(), 0); + for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) { + enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]]; + } + auto remap_cost = [&](std::vector remap) { + std::vector re_remap(remap.size(), remap.size()); + size_t r = 0; + for (size_t i = 0; i < remap.size(); i++) { + if (re_remap[remap[i]] == remap.size()) { + re_remap[remap[i]] = r++; + } + remap[i] = re_remap[remap[i]]; + } + auto tokens = ac; + size_t max_hist = 0; + for (size_t i = 0; i < tokens.size(); i++) { + for (size_t j = 0; j < tokens[i].size(); j++) { + size_t hist = remap[enc_state->histogram_idx[i]]; + tokens[i][j].context += hist * num_contexts; 
+ max_hist = std::max(hist + 1, max_hist); + } + } + return token_cost(tokens, max_hist * num_contexts, /*estimate=*/false); + }; + + for (size_t src = 0; src < out.size(); src++) { + float cost = remap_cost(remap); + size_t best = src; + for (size_t j = src + 1; j < out.size(); j++) { + if (remap[src] == remap[j]) continue; + auto remap_c = remap; + std::replace(remap_c.begin(), remap_c.end(), remap[src], remap[j]); + float c = remap_cost(remap_c); + if (c < cost) { + best = j; + cost = c; + } + } + if (src != best) { + std::replace(remap.begin(), remap.end(), remap[src], remap[best]); + } + } + std::vector re_remap(remap.size(), remap.size()); + size_t r = 0; + for (size_t i = 0; i < remap.size(); i++) { + if (re_remap[remap[i]] == remap.size()) { + re_remap[remap[i]] = r++; + } + remap[i] = re_remap[remap[i]]; + } + + enc_state->shared.num_histograms = *std::max_element(remap.begin(), remap.end()) + 1; + for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) { + enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]]; + } + for (size_t i = 0; i < ac.size(); i++) { + for (size_t j = 0; j < ac[i].size(); j++) { + ac[i][j].context += enc_state->histogram_idx[i] * num_contexts; + } + } +} // ClusterGroups + +void FindBestBlockEntropyModel(PassesEncoderState& enc_state) { + if (enc_state.cparams.decoding_speed_tier >= 1) { + static constexpr uint8_t kSimpleCtxMap[] = { + // Cluster all blocks together + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // + }; + static_assert(3 * kNumOrders == sizeof(kSimpleCtxMap) / sizeof *kSimpleCtxMap, "Update simple context map"); + + auto bcm = enc_state.shared.block_ctx_map; + bcm.ctx_map.assign(std::begin(kSimpleCtxMap), std::end(kSimpleCtxMap)); + bcm.num_ctxs = 2; + bcm.num_dc_ctxs = 1; + return; + } + if (enc_state.cparams.speed_tier >= SpeedTier::kFalcon) { + return; + } + const ImageI& rqf = enc_state.shared.raw_quant_field; + // 
No need to change context modeling for small images. + size_t tot = rqf.xsize() * rqf.ysize(); + size_t size_for_ctx_model = (1 << 10) * enc_state.cparams.butteraugli_distance; + // if (tot < size_for_ctx_model) return; + + struct OccCounters { + // count the occurrences of each qf value and each strategy type. + OccCounters(const ImageI& rqf, const AcStrategyImage& ac_strategy) { + for (size_t y = 0; y < rqf.ysize(); y++) { + const int32_t* qf_row = rqf.Row(y); + AcStrategyRow acs_row = ac_strategy.ConstRow(y); + for (size_t x = 0; x < rqf.xsize(); x++) { + int ord = kStrategyOrder[acs_row[x].RawStrategy()]; + int qf = qf_row[x] - 1; + qf_counts[qf]++; + qf_ord_counts[ord][qf]++; + ord_counts[ord]++; + } + } + } + + size_t qf_counts[256] = {}; + size_t qf_ord_counts[kNumOrders][256] = {}; + size_t ord_counts[kNumOrders] = {}; + }; + // The OccCounters struct is too big to allocate on the stack. + std::unique_ptr counters(new OccCounters(rqf, enc_state.shared.ac_strategy)); + + // Splitting the context model according to the quantization field seems to + // mostly benefit only large images. + size_t size_for_qf_split = (1 << 13) * enc_state.cparams.butteraugli_distance; + size_t num_qf_segments = tot < size_for_qf_split ? 1 : 2; + std::vector& qft = enc_state.shared.block_ctx_map.qf_thresholds; + qft.clear(); + // Divide the quant field in up to num_qf_segments segments. + size_t cumsum = 0; + size_t next = 1; + size_t last_cut = 256; + size_t cut = tot * next / num_qf_segments; + for (uint32_t j = 0; j < 256; j++) { + cumsum += counters->qf_counts[j]; + if (cumsum > cut) { + if (j != 0) { + qft.push_back(j); + } + last_cut = j; + while (cumsum > cut) { + next++; + cut = tot * next / num_qf_segments; + } + } else if (next > qft.size() + 1) { + if (j - 1 == last_cut && j != 0) { + qft.push_back(j); + } + } + } + + // Count the occurrences of each segment. 
+ std::vector counts(kNumOrders * (qft.size() + 1)); + size_t qft_pos = 0; + for (size_t j = 0; j < 256; j++) { + if (qft_pos < qft.size() && j == qft[qft_pos]) { + qft_pos++; + } + for (size_t i = 0; i < kNumOrders; i++) { + counts[qft_pos + i * (qft.size() + 1)] += counters->qf_ord_counts[i][j]; + } + } + + // Repeatedly merge the lowest-count pair. + std::vector remap((qft.size() + 1) * kNumOrders); + std::iota(remap.begin(), remap.end(), 0); + std::vector clusters(remap); + size_t nb_clusters = 4; // Clamp1((int)(tot / size_for_ctx_model / 2), 4, 8); + // This is O(n^2 log n), but n <= 14. + while (clusters.size() > nb_clusters) { + std::sort(clusters.begin(), clusters.end(), [&](int a, int b) { return counts[a] > counts[b]; }); + counts[clusters[clusters.size() - 2]] += counts[clusters.back()]; + counts[clusters.back()] = 0; + remap[clusters.back()] = clusters[clusters.size() - 2]; + clusters.pop_back(); + } + for (size_t i = 0; i < remap.size(); i++) { + while (remap[remap[i]] != remap[i]) { + remap[i] = remap[remap[i]]; + } + } + // Relabel starting from 0. + std::vector remap_remap(remap.size(), remap.size()); + size_t num = 0; + for (size_t i = 0; i < remap.size(); i++) { + if (remap_remap[remap[i]] == remap.size()) { + remap_remap[remap[i]] = num++; + } + remap[i] = remap_remap[remap[i]]; + } + // Write the block context map. + auto& ctx_map = enc_state.shared.block_ctx_map.ctx_map; + ctx_map = remap; + ctx_map.resize(remap.size() * 3); + for (size_t i = remap.size(); i < remap.size() * 3; i++) { + ctx_map[i] = remap[i % remap.size()] + num; + } + enc_state.shared.block_ctx_map.num_ctxs = *std::max_element(ctx_map.begin(), ctx_map.end()) + 1; +} + +// Returns the target size based on whether bitrate or direct targetsize is +// given. 
+size_t TargetSize(const CompressParams& cparams, const FrameDimensions& frame_dim) {
+    // An explicit target_size wins over target_bitrate.
+    if (cparams.target_size > 0) {
+        return cparams.target_size;
+    }
+    if (cparams.target_bitrate > 0.0) {
+        // bitrate * pixel count / kBitsPerByte; the added 0.5 rounds to
+        // nearest when the result is converted to size_t.
+        return 0.5 + cparams.target_bitrate * frame_dim.xsize * frame_dim.ysize / kBitsPerByte;
+    }
+    // Neither is set.
+    return 0;
+}
+} // namespace
+
+// Lossy frame encoder working on a caller-supplied PassesEncoderState.
+// The Compute* methods fill the shared state and the per-pass AC tokens
+// (from image data or from JPEG coefficients); the Encode* methods write the
+// global DC/AC sections and the per-group AC data.
+class LossyFrameEncoder {
+   public:
+    // Stores the three pointers, initializes enc_state->shared from
+    // frame_header (JXL_CHECK on failure), copies cparams into the state and
+    // clears any existing passes.
+    LossyFrameEncoder(const CompressParams& cparams,
+                      const FrameHeader& frame_header,
+                      PassesEncoderState* JXL_RESTRICT enc_state,
+                      ThreadPool* pool,
+                      AuxOut* aux_out)
+        : enc_state_(enc_state), pool_(pool), aux_out_(aux_out) {
+        JXL_CHECK(InitializePassesSharedState(frame_header, &enc_state_->shared,
+                                              /*encoder=*/true));
+        enc_state_->cparams = cparams;
+        enc_state_->passes.clear();
+    }
+
+    // Runs the frame heuristics on `opsin`, sizes the per-pass token storage,
+    // computes coefficient orders and tokenizes every group on pool_.
+    // `opsin` dimensions must be multiples of kBlockDim (asserted).
+    // On success copies shared.frame_header into *frame_header.
+    // NOTE(review): the `pool` and `writer` parameters are not referenced in
+    // this body; the pool_ member is used instead.
+    Status ComputeEncodingData(const ImageBundle* linear,
+                               Image3F* JXL_RESTRICT opsin,
+                               ThreadPool* pool,
+                               ModularFrameEncoder* modular_frame_encoder,
+                               BitWriter* JXL_RESTRICT writer,
+                               FrameHeader* frame_header) {
+        PROFILER_ZONE("ComputeEncodingData uninstrumented");
+        JXL_ASSERT((opsin->xsize() % kBlockDim) == 0 && (opsin->ysize() % kBlockDim) == 0);
+        PassesSharedState& shared = enc_state_->shared;
+
+        // x_qm_scale starts at 1 and is bumped once per step that the
+        // butteraugli distance exceeds.
+        if (!enc_state_->cparams.max_error_mode) {
+            float x_qm_scale_steps[3] = {0.65f, 1.25f, 9.0f};
+            shared.frame_header.x_qm_scale = 1;
+            for (float x_qm_scale_step : x_qm_scale_steps) {
+                if (enc_state_->cparams.butteraugli_distance > x_qm_scale_step) {
+                    shared.frame_header.x_qm_scale++;
+                }
+            }
+        }
+
+        JXL_RETURN_IF_ERROR(enc_state_->heuristics->LossyFrameHeuristics(enc_state_, modular_frame_encoder, linear,
+                                                                         opsin, pool_, aux_out_));
+
+        /* InitializePassesEncoder(*opsin, pool_, enc_state_,
+                                modular_frame_encoder,
+                                aux_out_);*/
+
+        enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+        for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+            pass.ac_tokens.resize(shared.frame_dim.num_groups);
+        }
+
+        ComputeAllCoeffOrders(shared.frame_dim);
+        shared.num_histograms = 1;
+
+        // One group cache per worker thread.
+        const auto tokenize_group_init = [&](const size_t num_threads) {
+            group_caches_.resize(num_threads);
+            return true;
+        };
+        const auto tokenize_group = [&](const int group_index, const int thread) {
+            // Tokenize coefficients.
+            const Rect rect = shared.BlockGroupRect(group_index);
+            for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); idx_pass++) {
+                JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32);
+                const int32_t* JXL_RESTRICT ac_rows[3] = {
+                    enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32,
+                    enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32,
+                    enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32,
+                };
+                // Ensure group cache is initialized.
+                group_caches_[thread].InitOnce();
+                TokenizeCoefficients(&shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, ac_rows,
+                                     shared.ac_strategy, frame_header->chroma_subsampling,
+                                     &group_caches_[thread].num_nzeroes,
+                                     &enc_state_->passes[idx_pass].ac_tokens[group_index], enc_state_->shared.quant_dc,
+                                     enc_state_->shared.raw_quant_field, enc_state_->shared.block_ctx_map);
+            }
+        };
+        RunOnPool(pool_, 0, shared.frame_dim.num_groups, tokenize_group_init, tokenize_group, "TokenizeGroup");
+
+        *frame_header = shared.frame_header;
+        return true;
+    }
+
+    // Fills the encoder state from the quantized coefficients in `jpeg_data`
+    // instead of from pixels: installs the JPEG quantization tables as custom
+    // dequant matrices, optionally derives per-tile chroma-from-luma factors,
+    // copies (transposed) coefficients into coeffs[0], derives DC thresholds
+    // and the block context map from the DC histogram, hands DC and AC
+    // metadata to `modular_frame_encoder`, and tokenizes every group.
+    // Requires exactly one pass (JXL_CHECK). On success copies
+    // shared.frame_header into *frame_header.
+    Status ComputeJPEGTranscodingData(const jpeg::JPEGData& jpeg_data,
+                                      ModularFrameEncoder* modular_frame_encoder,
+                                      FrameHeader* frame_header) {
+        PROFILER_ZONE("ComputeJPEGTranscodingData uninstrumented");
+        PassesSharedState& shared = enc_state_->shared;
+
+        frame_header->x_qm_scale = 2;
+        frame_header->b_qm_scale = 2;
+
+        FrameDimensions frame_dim = frame_header->ToFrameDimensions();
+
+        const size_t xsize = frame_dim.xsize_padded;
+        const size_t ysize = frame_dim.ysize_padded;
+        const size_t xsize_blocks = frame_dim.xsize_blocks;
+        const size_t ysize_blocks = frame_dim.ysize_blocks;
+
+        // no-op chroma from luma
+        shared.cmap = ColorCorrelationMap(xsize, ysize, false);
+        shared.ac_strategy.FillDCT8();
+        FillImage(uint8_t(0), &shared.epf_sharpness);
+
+        enc_state_->coeffs.clear();
+        enc_state_->coeffs.emplace_back(make_unique >(kGroupDim * kGroupDim, frame_dim.num_groups));
+
+        // convert JPEG quantization table to a Quantizer object
+        float dcquantization[3];
+        std::vector qe(DequantMatrices::kNum, QuantEncoding::Library(0));
+
+        auto jpeg_c_map = JpegOrder(frame_header->color_transform, jpeg_data.components.size() == 1);
+
+        // 3 channels x 64 entries, stored transposed relative to the JPEG table.
+        std::vector qt(192);
+        for (size_t c = 0; c < 3; c++) {
+            size_t jpeg_c = jpeg_c_map[c];
+            const int* quant = jpeg_data.quant[jpeg_data.components[jpeg_c].quant_idx].values.data();
+
+            dcquantization[c] = 255 * 8.0f / quant[0];
+            for (size_t y = 0; y < 8; y++) {
+                for (size_t x = 0; x < 8; x++) {
+                    // JPEG XL transposes the DCT, JPEG doesn't.
+                    qt[c * 64 + 8 * x + y] = quant[8 * y + x];
+                }
+            }
+        }
+        DequantMatricesSetCustomDC(&shared.matrices, dcquantization);
+        float dcquantization_r[3] = {1.0f / dcquantization[0], 1.0f / dcquantization[1], 1.0f / dcquantization[2]};
+
+        qe[AcStrategy::Type::DCT] = QuantEncoding::RAW(qt);
+        DequantMatricesSetCustom(&shared.matrices, qe, modular_frame_encoder);
+
+        // Ensure that InvGlobalScale() is 1.
+        shared.quantizer = Quantizer(&shared.matrices, 1, kGlobalScaleDenom);
+        // Recompute MulDC() and InvMulDC().
+        shared.quantizer.RecomputeFromGlobalScale();
+
+        // Per-block dequant scaling should be 1.
+        FillImage(static_cast(shared.quantizer.InvGlobalScale()), &shared.raw_quant_field);
+
+        // Fixed-point ratio of the channel-1 table entry to each channel's
+        // entry, scaled by 1 << kCFLFixedPointPrecision.
+        std::vector scaled_qtable(192);
+        for (size_t c = 0; c < 3; c++) {
+            for (size_t i = 0; i < 64; i++) {
+                scaled_qtable[64 * c + i] = (1 << kCFLFixedPointPrecision) * qt[64 + i] / qt[64 * c + i];
+            }
+        }
+
+        // Pointer to block row `y` of channel `c` (after JPEG component remap).
+        auto jpeg_row = [&](size_t c, size_t y) {
+            return jpeg_data.components[jpeg_c_map[c]].coeffs.data() +
+                   jpeg_data.components[jpeg_c_map[c]].width_in_blocks * kDCTBlockSize * y;
+        };
+
+        Image3F dc = Image3F(xsize_blocks, ysize_blocks);
+        bool DCzero = (shared.frame_header.color_transform == ColorTransform::kYCbCr);
+        // Compute chroma-from-luma for AC (doesn't seem to be useful for DC)
+        if (frame_header->chroma_subsampling.Is444() && enc_state_->cparams.force_cfl_jpeg_recompression &&
+            jpeg_data.components.size() == 3) {
+            for (size_t c : {0, 2}) {
+                ImageSB* map = (c == 0 ? &shared.cmap.ytox_map : &shared.cmap.ytob_map);
+                const float kScale = kDefaultColorFactor;
+                const int kOffset = 127;
+                const float kBase = c == 0 ? shared.cmap.YtoXRatio(0) : shared.cmap.YtoBRatio(0);
+                const float kZeroThresh =
+                    kScale * kZeroBiasDefault[c] * 0.9999f; // just epsilon less for better rounding
+
+                // For each color tile in map row `task`, picks the factor that
+                // covers the most per-coefficient [from, to] ranges, using a
+                // difference array (d_num_zeros) and a running-sum maximum.
+                auto process_row = [&](int task, int thread) {
+                    size_t ty = task;
+                    int8_t* JXL_RESTRICT row_out = map->Row(ty);
+                    for (size_t tx = 0; tx < map->xsize(); ++tx) {
+                        const size_t y0 = ty * kColorTileDimInBlocks;
+                        const size_t x0 = tx * kColorTileDimInBlocks;
+                        const size_t y1 = std::min(frame_dim.ysize_blocks, (ty + 1) * kColorTileDimInBlocks);
+                        const size_t x1 = std::min(frame_dim.xsize_blocks, (tx + 1) * kColorTileDimInBlocks);
+                        int32_t d_num_zeros[257] = {0};
+                        // TODO(veluca): this needs SIMD + fixed point adaptation, and/or
+                        // conversion to the new CfL algorithm.
+                        for (size_t y = y0; y < y1; ++y) {
+                            const int16_t* JXL_RESTRICT row_m = jpeg_row(1, y);
+                            const int16_t* JXL_RESTRICT row_s = jpeg_row(c, y);
+                            for (size_t x = x0; x < x1; ++x) {
+                                // coeffpos starts at 1: the DC coefficient is skipped.
+                                for (size_t coeffpos = 1; coeffpos < kDCTBlockSize; coeffpos++) {
+                                    const float scaled_m = row_m[x * kDCTBlockSize + coeffpos] *
+                                                           scaled_qtable[64 * c + coeffpos] *
+                                                           (1.0f / (1 << kCFLFixedPointPrecision));
+                                    const float scaled_s = kScale * row_s[x * kDCTBlockSize + coeffpos] +
+                                                           (kOffset - kBase * kScale) * scaled_m;
+                                    if (std::abs(scaled_m) > 1e-8f) {
+                                        float from, to;
+                                        if (scaled_m > 0) {
+                                            from = (scaled_s - kZeroThresh) / scaled_m;
+                                            to = (scaled_s + kZeroThresh) / scaled_m;
+                                        } else {
+                                            from = (scaled_s + kZeroThresh) / scaled_m;
+                                            to = (scaled_s - kZeroThresh) / scaled_m;
+                                        }
+                                        if (from < 0.0f) {
+                                            from = 0.0f;
+                                        }
+                                        if (to > 255.0f) {
+                                            to = 255.0f;
+                                        }
+                                        // Instead of clamping the both values
+                                        // we just check that range is sane.
+                                        if (from <= to) {
+                                            d_num_zeros[static_cast(std::ceil(from))]++;
+                                            d_num_zeros[static_cast(std::floor(to + 1))]--;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        int best = 0;
+                        int32_t best_sum = 0;
+                        FindIndexOfSumMaximum(d_num_zeros, 256, &best, &best_sum);
+                        // Running sum at index kOffset, i.e. the count for a
+                        // stored factor of 0.
+                        int32_t offset_sum = 0;
+                        for (int i = 0; i < 256; ++i) {
+                            if (i <= kOffset) {
+                                offset_sum += d_num_zeros[i];
+                            }
+                        }
+                        // Keep 0 unless the best factor beats it by more than 1.
+                        row_out[tx] = 0;
+                        if (best_sum > offset_sum + 1) {
+                            row_out[tx] = best - kOffset;
+                        }
+                    }
+                };
+
+                RunOnPool(pool_, 0, map->ysize(), ThreadPool::SkipInit(), process_row, "FindCorrelation");
+            }
+        }
+        if (!frame_header->chroma_subsampling.Is444()) {
+            ZeroFillImage(&dc);
+            enc_state_->coeffs[0]->ZeroFill();
+        }
+        // JPEG DC is from -1024 to 1023.
+        std::vector dc_counts[3] = {};
+        dc_counts[0].resize(2048);
+        dc_counts[1].resize(2048);
+        dc_counts[2].resize(2048);
+        size_t total_dc[3] = {};
+        for (size_t c : {1, 0, 2}) {
+            // Single-component input: channels other than 1 are zero-filled.
+            if (jpeg_data.components.size() == 1 && c != 1) {
+                enc_state_->coeffs[0]->ZeroFillPlane(c);
+                ZeroFillImage(&dc.Plane(c));
+                // Ensure no division by 0.
+                dc_counts[c][1024] = 1;
+                total_dc[c] = 1;
+                continue;
+            }
+            size_t hshift = frame_header->chroma_subsampling.HShift(c);
+            size_t vshift = frame_header->chroma_subsampling.VShift(c);
+            ImageSB& map = (c == 0 ? shared.cmap.ytox_map : shared.cmap.ytob_map);
+            for (size_t group_index = 0; group_index < frame_dim.num_groups; group_index++) {
+                const size_t gx = group_index % frame_dim.xsize_groups;
+                const size_t gy = group_index / frame_dim.xsize_groups;
+                size_t offset = 0;
+                int32_t* JXL_RESTRICT ac = enc_state_->coeffs[0]->PlaneRow(c, group_index, 0).ptr32;
+                for (size_t by = gy * kGroupDimInBlocks; by < ysize_blocks && by < (gy + 1) * kGroupDimInBlocks; ++by) {
+                    // Skip block rows that are not multiples of the subsampling step.
+                    if ((by >> vshift) << vshift != by) continue;
+                    const int16_t* JXL_RESTRICT inputjpeg = jpeg_row(c, by >> vshift);
+                    const int16_t* JXL_RESTRICT inputjpegY = jpeg_row(1, by);
+                    float* JXL_RESTRICT fdc = dc.PlaneRow(c, by >> vshift);
+                    const int8_t* JXL_RESTRICT cm = map.ConstRow(by / kColorTileDimInBlocks);
+                    for (size_t bx = gx * kGroupDimInBlocks; bx < xsize_blocks && bx < (gx + 1) * kGroupDimInBlocks;
+                         ++bx) {
+                        if ((bx >> hshift) << hshift != bx) continue;
+                        size_t base = (bx >> hshift) * kDCTBlockSize;
+                        int idc;
+                        if (DCzero) {
+                            idc = inputjpeg[base];
+                        } else {
+                            idc = inputjpeg[base] + 1024 / qt[c * 64];
+                        }
+                        // Histogram bin is idc + 1024, capped at 2047.
+                        dc_counts[c][std::min(static_cast(idc + 1024), uint32_t(2047))]++;
+                        total_dc[c]++;
+                        fdc[bx >> hshift] = idc * dcquantization_r[c];
+                        if (c == 1 || !enc_state_->cparams.force_cfl_jpeg_recompression ||
+                            !frame_header->chroma_subsampling.Is444()) {
+                            // Plain transposed copy of the 8x8 block.
+                            for (size_t y = 0; y < 8; y++) {
+                                for (size_t x = 0; x < 8; x++) {
+                                    ac[offset + y * 8 + x] = inputjpeg[base + x * 8 + y];
+                                }
+                            }
+                        } else {
+                            const int32_t scale = shared.cmap.RatioJPEG(cm[bx / kColorTileDimInBlocks]);
+
+                            for (size_t y = 0; y < 8; y++) {
+                                for (size_t x = 0; x < 8; x++) {
+                                    int Y = inputjpegY[kDCTBlockSize * bx + x * 8 + y];
+                                    int QChroma = inputjpeg[kDCTBlockSize * bx + x * 8 + y];
+                                    // Fixed-point multiply of CfL scale with quant table ratio
+                                    // first, and Y value second.
+                                    int coeff_scale = (scale * scaled_qtable[64 * c + y * 8 + x] +
+                                                       (1 << (kCFLFixedPointPrecision - 1))) >>
+                                                      kCFLFixedPointPrecision;
+                                    int cfl_factor = (Y * coeff_scale + (1 << (kCFLFixedPointPrecision - 1))) >>
+                                                     kCFLFixedPointPrecision;
+                                    int QCR = QChroma - cfl_factor;
+                                    ac[offset + y * 8 + x] = QCR;
+                                }
+                            }
+                        }
+                        offset += 64;
+                    }
+                }
+            }
+        }
+
+        // Derive up to two DC thresholds per channel from the DC histogram;
+        // num_dc_ctxs becomes the product of (thresholds + 1) over channels.
+        auto& dct = enc_state_->shared.block_ctx_map.dc_thresholds;
+        auto& num_dc_ctxs = enc_state_->shared.block_ctx_map.num_dc_ctxs;
+        enc_state_->shared.block_ctx_map.num_dc_ctxs = 1;
+        for (size_t i = 0; i < 3; i++) {
+            dct[i].clear();
+            int num_thresholds = (CeilLog2Nonzero(total_dc[i]) - 10) / 2;
+            // up to 3 buckets per channel:
+            // dark/medium/bright, yellow/unsat/blue, green/unsat/red
+            num_thresholds = std::min(std::max(num_thresholds, 0), 2);
+            size_t cumsum = 0;
+            size_t cut = total_dc[i] / (num_thresholds + 1);
+            for (int j = 0; j < 2048; j++) {
+                cumsum += dc_counts[i][j];
+                if (cumsum > cut) {
+                    dct[i].push_back(j - 1025);
+                    cut = total_dc[i] * (dct[i].size() + 1) / (num_thresholds + 1);
+                }
+            }
+            num_dc_ctxs *= dct[i].size() + 1;
+        }
+
+        auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map;
+        ctx_map.clear();
+        ctx_map.resize(3 * kNumOrders * num_dc_ctxs, 0);
+
+        int lbuckets = (dct[1].size() + 1);
+        for (size_t i = 0; i < num_dc_ctxs; i++) {
+            // up to 9 contexts for luma
+            ctx_map[i] = i / lbuckets;
+            // up to 3 contexts for chroma
+            ctx_map[kNumOrders * num_dc_ctxs + i] = num_dc_ctxs / lbuckets + (i % lbuckets);
+            ctx_map[2 * kNumOrders * num_dc_ctxs + i] = num_dc_ctxs / lbuckets + (i % lbuckets);
+        }
+        enc_state_->shared.block_ctx_map.num_ctxs = *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
+
+        enc_state_->histogram_idx.resize(shared.frame_dim.num_groups);
+
+        // disable DC frame for now
+        shared.frame_header.UpdateFlag(false, FrameHeader::kUseDcFrame);
+        auto compute_dc_coeffs = [&](int group_index, int /* thread */) {
+            modular_frame_encoder->AddVarDCTDC(dc, group_index, /*nl_dc=*/false, enc_state_);
+            modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/true, enc_state_);
+        };
+        RunOnPool(pool_, 0, shared.frame_dim.num_dc_groups, ThreadPool::SkipInit(), compute_dc_coeffs,
+                  "Compute DC coeffs");
+
+        // Must happen before WriteFrameHeader!
+        shared.frame_header.UpdateFlag(true, FrameHeader::kSkipAdaptiveDCSmoothing);
+
+        enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+        for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+            pass.ac_tokens.resize(shared.frame_dim.num_groups);
+        }
+
+        JXL_CHECK(enc_state_->passes.size() == 1); // skipping coeff splitting so need to have only one pass
+
+        ComputeAllCoeffOrders(frame_dim);
+        shared.num_histograms = 1;
+
+        // Same per-group tokenization as in ComputeEncodingData.
+        const auto tokenize_group_init = [&](const size_t num_threads) {
+            group_caches_.resize(num_threads);
+            return true;
+        };
+        const auto tokenize_group = [&](const int group_index, const int thread) {
+            // Tokenize coefficients.
+            const Rect rect = shared.BlockGroupRect(group_index);
+            for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); idx_pass++) {
+                JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32);
+                const int32_t* JXL_RESTRICT ac_rows[3] = {
+                    enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32,
+                    enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32,
+                    enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32,
+                };
+                // Ensure group cache is initialized.
+                group_caches_[thread].InitOnce();
+                TokenizeCoefficients(&shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, ac_rows,
+                                     shared.ac_strategy, frame_header->chroma_subsampling,
+                                     &group_caches_[thread].num_nzeroes,
+                                     &enc_state_->passes[idx_pass].ac_tokens[group_index], enc_state_->shared.quant_dc,
+                                     enc_state_->shared.raw_quant_field, enc_state_->shared.block_ctx_map);
+            }
+        };
+        RunOnPool(pool_, 0, shared.frame_dim.num_groups, tokenize_group_init, tokenize_group, "TokenizeGroup");
+        *frame_header = shared.frame_header;
+        return true;
+    }
+
+    // Writes the quantizer, the block context map and the DC part of the
+    // color correlation map to `writer`.
+    // NOTE(review): `frame_header` is not referenced in this body.
+    Status EncodeGlobalDCInfo(const FrameHeader& frame_header, BitWriter* writer) const {
+        // Encode quantizer DC and global scale.
+        JXL_RETURN_IF_ERROR(enc_state_->shared.quantizer.Encode(writer, kLayerQuant, aux_out_));
+        EncodeBlockCtxMap(enc_state_->shared.block_ctx_map, writer, aux_out_);
+        ColorCorrelationMapEncodeDC(&enc_state_->shared.cmap, writer, kLayerDC, aux_out_);
+        return true;
+    }
+
+    // Writes the dequant matrices, the histogram count, and per pass the
+    // coefficient orders and entropy-code histograms. Histogram clustering
+    // (ClusterGroups) runs first when speed_tier <= kTortoise.
+    Status EncodeGlobalACInfo(BitWriter* writer, ModularFrameEncoder* modular_frame_encoder) {
+        JXL_RETURN_IF_ERROR(DequantMatricesEncode(&enc_state_->shared.matrices, writer, kLayerDequantTables, aux_out_,
+                                                  modular_frame_encoder));
+        if (enc_state_->cparams.speed_tier <= SpeedTier::kTortoise) {
+            ClusterGroups(enc_state_);
+        }
+        // num_histograms - 1 is written with CeilLog2Nonzero(num_groups) bits;
+        // nothing is written when that bit count is 0.
+        size_t num_histo_bits = CeilLog2Nonzero(enc_state_->shared.frame_dim.num_groups);
+        if (num_histo_bits != 0) {
+            BitWriter::Allotment allotment(writer, num_histo_bits);
+            writer->Write(num_histo_bits, enc_state_->shared.num_histograms - 1);
+            ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out_);
+        }
+
+        for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses(); i++) {
+            // Encode coefficient orders.
+            size_t order_bits = 0;
+            JXL_RETURN_IF_ERROR(U32Coder::CanEncode(kOrderEnc, enc_state_->used_orders[i], &order_bits));
+            BitWriter::Allotment allotment(writer, order_bits);
+            JXL_CHECK(U32Coder::Write(kOrderEnc, enc_state_->used_orders[i], writer));
+            ReclaimAndCharge(writer, &allotment, kLayerOrder, aux_out_);
+            EncodeCoeffOrders(enc_state_->used_orders[i],
+                              &enc_state_->shared.coeff_orders[i * enc_state_->shared.coeff_order_size], writer,
+                              kLayerOrder, aux_out_);
+
+            // Encode histograms.
+            HistogramParams hist_params(enc_state_->cparams.speed_tier,
+                                        enc_state_->shared.block_ctx_map.NumACContexts());
+            if (enc_state_->cparams.speed_tier > SpeedTier::kTortoise) {
+                hist_params.lz77_method = HistogramParams::LZ77Method::kNone;
+            }
+            if (enc_state_->cparams.decoding_speed_tier >= 1) {
+                hist_params.max_histograms = 6;
+            }
+            BuildAndEncodeHistograms(
+                hist_params, enc_state_->shared.num_histograms * enc_state_->shared.block_ctx_map.NumACContexts(),
+                enc_state_->passes[i].ac_tokens, &enc_state_->passes[i].codes, &enc_state_->passes[i].context_map,
+                writer, kLayerAC, aux_out_);
+        }
+
+        return true;
+    }
+
+    // Writes the tokenized coefficients of one group of one pass to
+    // `group_code`, using the histogram index assigned to that group.
+    Status EncodeACGroup(size_t pass, size_t group_index, BitWriter* group_code, AuxOut* local_aux_out) {
+        return EncodeGroupTokenizedCoefficients(group_index, pass, enc_state_->histogram_idx[group_index], *enc_state_,
+                                                group_code, local_aux_out);
+    }
+
+    // Returns the encoder state pointer passed to the constructor.
+    PassesEncoderState* State() { return enc_state_; }
+
+    // Computes, per pass, which coefficient orders are used and the orders
+    // themselves. used_orders[i] is only assigned below SpeedTier::kFalcon;
+    // otherwise it keeps the value given by resize().
+    void ComputeAllCoeffOrders(const FrameDimensions& frame_dim) {
+        PROFILER_FUNC;
+        enc_state_->used_orders.resize(enc_state_->progressive_splitter.GetNumPasses());
+        for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses(); i++) {
+            // No coefficient reordering in Falcon or faster.
+            if (enc_state_->cparams.speed_tier < SpeedTier::kFalcon) {
+                enc_state_->used_orders[i] =
+                    ComputeUsedOrders(enc_state_->cparams.speed_tier, enc_state_->shared.ac_strategy,
+                                      Rect(enc_state_->shared.raw_quant_field));
+            }
+            ComputeCoeffOrder(enc_state_->cparams.speed_tier, *enc_state_->coeffs[i], enc_state_->shared.ac_strategy,
+                              frame_dim, enc_state_->used_orders[i],
+                              &enc_state_->shared.coeff_orders[i * enc_state_->shared.coeff_order_size]);
+        }
+    }
+
+    // Mutable access to the per-thread group caches.
+    // NOTE(review): "cashes" is a misspelling of "caches"; the name is part of
+    // the public interface, so it is kept as-is here.
+    std::vector& get_group_cashes() { return group_caches_; }
+
+   private:
+    // Walks the running sum of array[0..len) and reports the first index at
+    // which that sum reaches its maximum, together with the maximum.
+    // If no running sum is > 0, reports index 0 and sum 0.
+    template
+    static inline void FindIndexOfSumMaximum(const V* array, const size_t len, R* idx, V* sum) {
+        JXL_ASSERT(len > 0);
+        V maxval = 0;
+        V val = 0;
+        R maxidx = 0;
+        for (size_t i = 0; i < len; ++i) {
+            val += array[i];
+            if (val > maxval) {
+                maxval = val;
+                maxidx = i;
+            }
+        }
+        *idx = maxidx;
+        *sum = maxval;
+    }
+
+    PassesEncoderState* JXL_RESTRICT enc_state_;
+    ThreadPool* pool_;
+    AuxOut* aux_out_;
+    // One entry per worker thread; resized by the tokenize_group_init lambdas.
+    std::vector group_caches_;
+};
+} // namespace jxl
+#endif
diff --git a/codec/L2/demos/jxlEnc/others/include/acc_dct-inl.h b/codec/L2/demos/jxlEnc/others/include/acc_dct-inl.h
new file mode 100644
index 0000000000..ee6c0568c5
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/others/include/acc_dct-inl.h
@@ -0,0 +1,347 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast SIMD floating-point (I)DCT, any power of two.
+#ifndef ACC_DCT_INL_HPP +#define ACC_DCT_INL_HPP /* NOTE(review): once-only guard wrapped around the toggle guard below, so a repeat inclusion in the same translation unit expands to nothing - confirm that is intended */ + +#if defined(LIB_JXL_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DCT_INL_H_ +#undef LIB_JXL_DCT_INL_H_ +#else +#define LIB_JXL_DCT_INL_H_ +#endif /* the guard macro is flipped on every pass that gets through the #if above */ + +#include + +#include + +#include "lib/jxl/dct_block-inl.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/transpose-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +template +struct FVImpl { /* general case: float vector tag capped at SZ lanes */ + using type = HWY_CAPPED(float, SZ); +}; + +template <> +struct FVImpl<0> { /* SZ == 0 selects the full-width float vector tag */ + using type = HWY_FULL(float); +}; + +template +using FV = typename FVImpl::type; + +// Implementation of Lowest Complexity Self Recursive Radix-2 DCT II/III +// Algorithms, by Siriani M. Perera and Jianhua Liu. + +template +struct CoeffBundle { /* static helpers; most of them address row i at offset i * SZ */ + static void AddReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2, float* JXL_RESTRICT aout) { /* aout row i = ain1 row i plus ain2 row (N - 1 - i) */ + for (size_t i = 0; i < N; i++) { + auto in1 = Load(FV(), ain1 + i * SZ); + auto in2 = Load(FV(), ain2 + (N - i - 1) * SZ); + Store(in1 + in2, FV(), aout + i * SZ); + } + } + static void SubReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2, float* JXL_RESTRICT aout) { /* aout row i = ain1 row i minus ain2 row (N - 1 - i) */ + for (size_t i = 0; i < N; i++) { + auto in1 = Load(FV(), ain1 + i * SZ); + auto in2 = Load(FV(), ain2 + (N - i - 1) * SZ); + Store(in1 - in2, FV(), aout + i * SZ); + } + } + static void B(float* JXL_RESTRICT coeff) { /* in place: row 0 becomes row 0 times sqrt2 plus row 1; rows 1..N-2 each gain their successor; the last row is untouched */ + auto sqrt2 = Set(FV(), square_root<2>::value); + auto in1 = Load(FV(), coeff); + auto in2 = Load(FV(), coeff + SZ); + Store(MulAdd(in1, sqrt2, in2), FV(), coeff); + for (size_t i = 1; i + 1 < N; i++) { + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (i + 1) * SZ); + Store(in1 + in2, FV(), coeff + i * SZ); + } + } + static void BTranspose(float* JXL_RESTRICT coeff) { /* in place, walking downwards: rows N-1..1 each gain their predecessor, then row 0 is scaled by sqrt2 */ + for (size_t i = N - 1; i > 0; i--) { + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (i - 1) * SZ); + Store(in1 + in2, FV(), coeff + i * SZ); + } + auto sqrt2 = Set(FV(), 
square_root<2>::value); + auto in1 = Load(FV(), coeff); + Store(in1 * sqrt2, FV(), coeff); + } + // Ideally optimized away by compiler (except the multiply). + static void InverseEvenOdd(const float* JXL_RESTRICT ain, float* JXL_RESTRICT aout) { /* first half of ain lands on even rows of aout, second half on odd rows */ + for (size_t i = 0; i < N / 2; i++) { + auto in1 = Load(FV(), ain + i * SZ); + Store(in1, FV(), aout + 2 * i * SZ); + } + for (size_t i = N / 2; i < N; i++) { + auto in1 = Load(FV(), ain + i * SZ); + Store(in1, FV(), aout + (2 * (i - N / 2) + 1) * SZ); + } + } + // Ideally optimized away by compiler. + static void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride, float* JXL_RESTRICT aout) { /* opposite permutation: even input rows (ain_stride apart, unaligned loads) fill the first half of aout, odd rows the second half */ + for (size_t i = 0; i < N / 2; i++) { + auto in1 = LoadU(FV(), ain + 2 * i * ain_stride); + Store(in1, FV(), aout + i * SZ); + } + for (size_t i = N / 2; i < N; i++) { + auto in1 = LoadU(FV(), ain + (2 * (i - N / 2) + 1) * ain_stride); + Store(in1, FV(), aout + i * SZ); + } + } + // Invoked on full vector. + static void Multiply(float* JXL_RESTRICT coeff) { /* scales rows N/2..N-1 in place by WcMultipliers::kMultipliers[i]; rows below N/2 are not touched */ + for (size_t i = 0; i < N / 2; i++) { + auto in1 = Load(FV(), coeff + (N / 2 + i) * SZ); + auto mul = Set(FV(), WcMultipliers::kMultipliers[i]); + Store(in1 * mul, FV(), coeff + (N / 2 + i) * SZ); + } + } + static void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out, size_t out_stride) { /* with lo = row i and hi = row N/2 plus i: out row i = lo plus mul times hi, out row (N - 1 - i) = lo minus mul times hi; stores are unaligned */ + for (size_t i = 0; i < N / 2; i++) { + auto mul = Set(FV(), WcMultipliers::kMultipliers[i]); + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (N / 2 + i) * SZ); + auto out1 = MulAdd(mul, in2, in1); + auto out2 = NegMulAdd(mul, in2, in1); + StoreU(out1, FV(), out + i * out_stride); + StoreU(out2, FV(), out + (N - i - 1) * out_stride); + } + } + template + static void LoadFromBlock(const Block& in, size_t off, float* JXL_RESTRICT coeff) { /* copies N parts of the block at column offset off into consecutive rows of coeff */ + for (size_t i = 0; i < N; i++) { + Store(in.LoadPart(FV(), i, off), FV(), coeff + i * SZ); + } + } + template + static void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, const Block& out, 
size_t off) { + auto mul = Set(FV(), 1.0f / N); /* every stored row is scaled by 1/N */ + for (size_t i = 0; i < N; i++) { + out.StorePart(FV(), mul * Load(FV(), coeff + i * SZ), i, off); + } + } +}; + +template +struct DCT1DImpl; + +template +struct DCT1DImpl<1, SZ> { + JXL_INLINE void operator()(float* JXL_RESTRICT mem) {} /* single row: nothing to do */ +}; + +template +struct DCT1DImpl<2, SZ> { + JXL_INLINE void operator()(float* JXL_RESTRICT mem) { /* two rows: replaced in place by their sum and difference */ + auto in1 = Load(FV(), mem); + auto in2 = Load(FV(), mem + SZ); + Store(in1 + in2, FV(), mem); + Store(in1 - in2, FV(), mem + SZ); + } +}; + +template +struct DCT1DImpl { + void operator()(float* JXL_RESTRICT mem) { /* sums and differences of mirrored rows go through two nested DCT1DImpl calls whose template arguments are not visible here (presumably half size - confirm); InverseEvenOdd then interleaves the two halves back into mem */ + // This is relatively small (4kB with 64-DCT and AVX-512) + HWY_ALIGN float tmp[N * SZ]; + CoeffBundle::AddReverse(mem, mem + N / 2 * SZ, tmp); + DCT1DImpl()(tmp); + CoeffBundle::SubReverse(mem, mem + N / 2 * SZ, tmp + N / 2 * SZ); + CoeffBundle::Multiply(tmp); + DCT1DImpl()(tmp + N / 2 * SZ); + CoeffBundle::B(tmp + N / 2 * SZ); + CoeffBundle::InverseEvenOdd(tmp, mem); + } +}; + +template +struct IDCT1DImpl; + +template +struct IDCT1DImpl<1, SZ> { + JXL_INLINE void operator()(const float* from, size_t from_stride, float* to, size_t to_stride) { /* single row: plain copy, strides unused */ + StoreU(LoadU(FV(), from), FV(), to); + } +}; + +template +struct IDCT1DImpl<2, SZ> { + JXL_INLINE void operator()(const float* from, size_t from_stride, float* to, size_t to_stride) { /* two rows: sum to row 0 of to, difference to row 1 */ + JXL_DASSERT(from_stride >= SZ); + JXL_DASSERT(to_stride >= SZ); + auto in1 = LoadU(FV(), from); + auto in2 = LoadU(FV(), from + from_stride); + StoreU(in1 + in2, FV(), to); + StoreU(in1 - in2, FV(), to + to_stride); + } +}; + +template +struct IDCT1DImpl { + void operator()(const float* from, size_t from_stride, float* to, size_t to_stride) { /* de-interleaves from into tmp, runs a nested IDCT1DImpl on each half of tmp in place, then combines the halves into to with MultiplyAndAdd */ + JXL_DASSERT(from_stride >= SZ); + JXL_DASSERT(to_stride >= SZ); + // This is relatively small (4kB with 64-DCT and AVX-512) + HWY_ALIGN float tmp[N * SZ]; + CoeffBundle::ForwardEvenOdd(from, from_stride, tmp); + IDCT1DImpl()(tmp, SZ, tmp, SZ); + CoeffBundle::BTranspose(tmp + N / 2 * SZ); + IDCT1DImpl()(tmp 
+ N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ); + CoeffBundle::MultiplyAndAdd(tmp, to, to_stride); + } +}; + +template +void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) { + size_t M = M_or_0 != 0 ? M_or_0 : Mp; /* the compile-time width wins; the runtime Mp is used only when M_or_0 is 0 */ + constexpr size_t SZ = MaxLanes(FV()); + HWY_ALIGN float tmp[N * SZ]; + for (size_t i = 0; i < M; i += Lanes(FV())) { /* one vector-wide group of columns per iteration */ + // TODO(veluca): consider removing the temporary memory here (as is done in + // IDCT), if it turns out that some compilers don't optimize away the loads + // and this is performance-critical. + CoeffBundle::LoadFromBlock(from, i, tmp); + DCT1DImpl()(tmp); + CoeffBundle::StoreToBlockAndScale(tmp, to, i); + } +} + +template +void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) { + size_t M = M_or_0 != 0 ? M_or_0 : Mp; /* same width selection as DCT1DWrapper */ + constexpr size_t SZ = MaxLanes(FV()); + for (size_t i = 0; i < M; i += Lanes(FV())) { /* no temporary here: reads and writes go straight to the block addresses */ + IDCT1DImpl()(from.Address(0, i), from.Stride(), to.Address(0, i), to.Stride()); + } +} + +template +struct DCT1D { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return DCT1DWrapper(from, to, M); + } +}; + +template +struct DCT1D MaxLanes(FV<0>()))>::type> { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return NoInlineWrapper(DCT1DWrapper, from, to, M); /* this specialization makes the same call through NoInlineWrapper */ + } +}; + +template +struct IDCT1D { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return IDCT1DWrapper(from, to, M); + } +}; + +template +struct IDCT1D MaxLanes(FV<0>()))>::type> { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return NoInlineWrapper(IDCT1DWrapper, from, to, M); /* this specialization makes the same call through NoInlineWrapper */ + } +}; + +// Computes the in-place NxN transposed-scaled-DCT (tsDCT) of block. +// Requires that block is HWY_ALIGN'ed. +// +// See also DCTSlow, ComputeDCT +template +struct ComputeTransposedScaledDCT { + // scratch_space must be aligned, and should have space for N*N floats. 
+ template + HWY_MAYBE_UNUSED void operator()(const From& from, float* JXL_RESTRICT to, float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; /* pass 1 writes from into to, the transpose writes to into block, pass 2 writes block back into to */ + DCT1D()(from, DCTTo(to, N)); + Transpose::Run(DCTFrom(to, N), DCTTo(block, N)); + DCT1D()(DCTFrom(block, N), DCTTo(to, N)); + } +}; + +// Computes the in-place NxN transposed-scaled-iDCT (tsIDCT) of block. +// Requires that block is HWY_ALIGN'ed. +// +// See also IDCTSlow, ComputeIDCT. + +template +struct ComputeTransposedScaledIDCT { + // scratch_space must be aligned, and should have space for N*N floats. + template + HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to, float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; /* from is overwritten by the transpose step below */ + IDCT1D()(DCTFrom(from, N), DCTTo(block, N)); + Transpose::Run(DCTFrom(block, N), DCTTo(from, N)); + IDCT1D()(DCTFrom(from, N), to); + } +}; +// Computes the non-transposed, scaled DCT of a block, that needs to be +// HWY_ALIGN'ed. Used for rectangular blocks. +template +struct ComputeScaledDCT { + // scratch_space must be aligned, and should have space for ROWS*COLS + // floats. + template + HWY_MAYBE_UNUSED void operator()(const From& from, float* to, float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; /* both branches finish with the result in to; the ROWS < COLS branch uses two transposes, the other one */ + if (ROWS < COLS) { + DCT1D()(from, DCTTo(block, COLS)); + Transpose::Run(DCTFrom(block, COLS), DCTTo(to, ROWS)); + DCT1D()(DCTFrom(to, ROWS), DCTTo(block, ROWS)); + Transpose::Run(DCTFrom(block, ROWS), DCTTo(to, COLS)); + } else { + DCT1D()(from, DCTTo(to, COLS)); + Transpose::Run(DCTFrom(to, COLS), DCTTo(block, ROWS)); + DCT1D()(DCTFrom(block, ROWS), DCTTo(to, ROWS)); + } + } +}; +// Computes the non-transposed, scaled IDCT of a block, that needs to be +// HWY_ALIGN'ed. Used for rectangular blocks. +template +struct ComputeScaledIDCT { + // scratch_space must be aligned, and should have space for ROWS*COLS + // floats. 
+ template + HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to, float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; /* from doubles as working storage and is overwritten in both branches */ + // Reverse the steps done in ComputeScaledDCT. + if (ROWS < COLS) { + Transpose::Run(DCTFrom(from, COLS), DCTTo(block, ROWS)); + IDCT1D()(DCTFrom(block, ROWS), DCTTo(from, ROWS)); + Transpose::Run(DCTFrom(from, ROWS), DCTTo(block, COLS)); + IDCT1D()(DCTFrom(block, COLS), to); + } else { + IDCT1D()(DCTFrom(from, ROWS), DCTTo(block, ROWS)); + Transpose::Run(DCTFrom(block, ROWS), DCTTo(from, COLS)); + IDCT1D()(DCTFrom(from, COLS), to); + } + } +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); +#endif // LIB_JXL_DCT_INL_H_ + +#endif /* ACC_DCT_INL_HPP */ diff --git a/codec/L2/demos/jxlEnc/others/include/acc_enc_ac_strategy.hpp b/codec/L2/demos/jxlEnc/others/include/acc_enc_ac_strategy.hpp new file mode 100644 index 0000000000..bf1eea7920 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/acc_enc_ac_strategy.hpp @@ -0,0 +1,90 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_AC_STRATEGY_H_ +#define LIB_JXL_ENC_AC_STRATEGY_H_ + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quant_weights.h" + +// `FindBestAcStrategy` uses heuristics to choose which AC strategy should be +// used in each block, as well as the initial quantization field. + +namespace jxl { + +// AC strategy selection: utility struct. 
+ +struct ACSConfig { /* row pointers and strides for the source planes, the quant field and the masking field, plus cost constants */ + const DequantMatrices* JXL_RESTRICT dequant; + float info_loss_multiplier; + float info_loss_multiplier2; + float* JXL_RESTRICT quant_field_row; + size_t quant_field_stride; + float* JXL_RESTRICT masking_field_row; + size_t masking_field_stride; + const float* JXL_RESTRICT src_rows[3]; + size_t src_stride; + // Cost for 1 (-1), 2 (-2) explicitly, cost for others computed with cost1 + + // cost2 + sqrt(q) * cost_delta. + float cost1; + float cost2; + float cost_delta; + float base_entropy; + float zeros_mul; + const float& Pixel(size_t c, size_t x, size_t y) const { return src_rows[c][y * src_stride + x]; } /* element (x, y) of plane c, addressed with src_stride; no bounds check */ + float Masking(size_t bx, size_t by) const { /* reads the masking field at (bx, by); JXL_DASSERT expects the stored value to be positive */ + JXL_DASSERT(masking_field_row[by * masking_field_stride + bx] > 0); + return masking_field_row[by * masking_field_stride + bx]; + } + float Quant(size_t bx, size_t by) const { /* reads the quant field at (bx, by); JXL_DASSERT expects the stored value to be positive */ + JXL_DASSERT(quant_field_row[by * quant_field_stride + bx] > 0); + return quant_field_row[by * quant_field_stride + bx]; + } + void SetQuant(size_t bx, size_t by, float value) const { /* const-qualified, yet it writes through quant_field_row */ + JXL_DASSERT(value > 0); + quant_field_row[by * quant_field_stride + bx] = value; + } +}; + +struct AcStrategyHeuristics { /* pairs an ACSConfig with an encoder-state pointer; the member functions are only declared here */ + void Init(const Image3F& src, PassesEncoderState* enc_state); + void ProcessRect(const Rect& rect, + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32); /* the twelve vector parameters are non-const references; NOTE(review): their element types were lost from this text - confirm against the original patch */ + void Finalize(AuxOut* aux_out); + ACSConfig config; + PassesEncoderState* enc_state; +}; + +// Debug. 
+void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, size_t ysize, const char* tag, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_AC_STRATEGY_H_ diff --git a/codec/L2/demos/jxlEnc/others/include/acc_enc_chroma_from_luma.hpp b/codec/L2/demos/jxlEnc/others/include/acc_enc_chroma_from_luma.hpp new file mode 100644 index 0000000000..47fe2ae994 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/acc_enc_chroma_from_luma.hpp @@ -0,0 +1,80 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ +#define LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ + +// Chroma-from-luma, computed using heuristics to determine the best linear +// model for the X and B channels from the Y channel. + +#include +#include + +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer, size_t layer, AuxOut* aux_out); + +struct CfLHeuristics { /* holds dc_values plus a dynamically allocated working buffer (mem) */ + void Init(const Image3F& opsin); + + void PrepareForThreads(size_t num_threads) { mem = hwy::AllocateAligned(num_threads * kItemsPerThread); } /* (re)allocates mem with kItemsPerThread items for each of num_threads threads */ + + void ComputeTile(const Rect& r, + const Image3F& opsin, + const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, + const Quantizer* quantizer, + bool fast, + size_t thread, + 
ColorCorrelationMap* cmap, + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32); /* declaration only; NOTE(review): presumably thread selects a slice of mem - confirm in the definition */ + + void ComputeDC(bool fast, ColorCorrelationMap* cmap); + + ImageF dc_values; + hwy::AlignedFreeUniquePtr mem; + + // Working set is too large for stack; allocate dynamically. + constexpr static size_t kItemsPerThread = AcStrategy::kMaxCoeffArea * 3 // Blocks + + kColorTileDim * kColorTileDim * 4 // AC coeff storage + + AcStrategy::kMaxCoeffArea * 2; // Scratch space +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ diff --git a/codec/L2/demos/jxlEnc/others/include/acc_enc_cluster.hpp b/codec/L2/demos/jxlEnc/others/include/acc_enc_cluster.hpp new file mode 100644 index 0000000000..ffa6373b66 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/acc_enc_cluster.hpp @@ -0,0 +1,79 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Functions for clustering similar histograms together. 
+ +#ifndef LIB_JXL_ENC_CLUSTER_H_ +#define LIB_JXL_ENC_CLUSTER_H_ + +#include +#include +#include + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/enc_ans.h" + +namespace jxl { + +struct Histogram { /* symbol counts in data_ together with their running total */ + Histogram() { total_count_ = 0; } /* only total_count_ is set here; entropy_ has no initializer */ + void Clear() { + data_.clear(); + total_count_ = 0; + } + void Add(size_t symbol) { /* counts one occurrence; data_ grows to a multiple of kRounding that covers symbol */ + if (data_.size() <= symbol) { + data_.resize(DivCeil(symbol + 1, kRounding) * kRounding); + } + ++data_[symbol]; + ++total_count_; + } + void AddHistogram(const Histogram& other) { /* element-wise sum; data_ is widened first when other is longer */ + if (other.data_.size() > data_.size()) { + data_.resize(other.data_.size()); + } + for (size_t i = 0; i < other.data_.size(); ++i) { + data_[i] += other.data_[i]; + } + total_count_ += other.total_count_; + } + float PopulationCost() const { return ANSPopulationCost(data_.data(), data_.size()); } + float ShannonEntropy() const; + + std::vector data_; + size_t total_count_; + mutable float entropy_; // WARNING: not kept up-to-date. + static constexpr size_t kRounding = 8; +}; + +void acc_FastClusterHistograms(const std::vector& in, + std::vector nonempty_histograms, /* taken by value, so the caller's vector is copied */ + uint32_t largest_idx_in, + const size_t num_contexts, + size_t max_histograms, + float min_distance, + std::vector* out, + std::vector* histogram_symbols); + +void HistogramReindex(std::vector* out, std::vector* symbols); + +void ClusterHistograms(HistogramParams params, + const std::vector& in, + size_t num_contexts, + size_t max_histograms, + std::vector* out, + std::vector* histogram_symbols); + +void ClusterHistogramsNew(HistogramParams params, + const std::vector& in, + size_t num_contexts, + size_t max_histograms, + std::vector* out, + std::vector* histogram_symbols); /* same parameter list as ClusterHistograms */ +} // namespace jxl + +#endif // LIB_JXL_ENC_CLUSTER_H_ diff --git a/codec/L2/demos/jxlEnc/others/include/acc_enc_group.hpp b/codec/L2/demos/jxlEnc/others/include/acc_enc_group.hpp new file mode 100644 index 0000000000..529a307d16 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/acc_enc_group.hpp @@ -0,0 +1,48 
@@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_GROUP_H_ +#define LIB_JXL_ENC_GROUP_H_ + +#include +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" + +namespace jxl { + +// Fills DC +void ComputeCoefficients(size_t group_idx, + PassesEncoderState* enc_state, + const Image3F& opsin, + Image3F* dc, /* the only pointer-to-image output in this signature; see the comment above */ + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32); + +Status EncodeGroupTokenizedCoefficients(size_t group_idx, + size_t pass_idx, + size_t histogram_idx, + const PassesEncoderState& enc_state, + BitWriter* writer, + AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_GROUP_H_ diff --git a/codec/L2/demos/jxlEnc/others/include/acc_init_histogram.hpp b/codec/L2/demos/jxlEnc/others/include/acc_init_histogram.hpp new file mode 100644 index 0000000000..18691cbf62 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/acc_init_histogram.hpp @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef ACC_INIT_HISTOGRAM_HPP +#define ACC_INIT_HISTOGRAM_HPP /* must spell the same macro as the #ifndef above, otherwise the guard never takes effect */ + +#include "acc_phase3.hpp" + +namespace jxl { /* declarations only; the definitions are not in this header */ +bool acc_InitHistogram(std::vector& histograms, std::vector >& tokens); + +void acc_ANSinitHistogram(LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& frame_header, + + std::vector& params, + bool do_once[5], /* array parameter: it decays to bool*, so the 5 is documentation only */ + + std::vector >& tokens0, + std::vector >& tokens1, + std::vector >& tokens2, + std::vector >& tokens3, + + char* do_prefix_out, + std::vector& largest_idx, + std::vector >& nonempty_histograms, + std::vector >& histograms_); +} // namespace jxl + +#endif /* ACC_INIT_HISTOGRAM_HPP */ diff --git a/codec/L2/demos/jxlEnc/others/include/acc_store_encode_data.hpp b/codec/L2/demos/jxlEnc/others/include/acc_store_encode_data.hpp new file mode 100644 index 0000000000..3a10b60a96 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/acc_store_encode_data.hpp @@ -0,0 +1,579 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef ACC_STORE_ENCODE_DATA_HPP +#define ACC_STORE_ENCODE_DATA_HPP + +#include "acc_phase3.hpp" + +namespace jxl { + +bool ans_fuzzer_friendly_ = false; /* NOTE(review): non-inline definition with external linkage inside a header, as are the non-template functions below; including this file from more than one translation unit would produce duplicate definitions - confirm it is included exactly once */ +static const int kMaxNumSymbolsForSmallCode = 4; + +struct SizeWriterNew { /* stand-in writer that only tallies sizes */ + size_t size = 0; + void Write(size_t num, size_t bits) { size += num; } /* adds the first argument to size; the second argument is ignored */ +}; + +template +void StoreVarLenUint8New(size_t n, Writer* writer) { /* writes one flag bit; for nonzero n also a 3-bit field holding nbits, then nbits bits holding n minus (1 << nbits) */ + JXL_DASSERT(n <= 255); + if (n == 0) { + writer->Write(1, 0); + } else { + writer->Write(1, 1); + size_t nbits = FloorLog2Nonzero(n); + writer->Write(3, nbits); + writer->Write(nbits, n - (1ULL << nbits)); + } +} + +template +void StoreVarLenUint16New(size_t n, Writer* writer) { /* same layout as StoreVarLenUint8New with a 4-bit nbits field and a limit of 65535 */ + JXL_DASSERT(n <= 65535); + if (n == 0) { + writer->Write(1, 0); + } else { + writer->Write(1, 1); + size_t nbits = FloorLog2Nonzero(n); + writer->Write(4, nbits); + writer->Write(nbits, n - (1ULL << nbits)); + } +} + +template +void EncodeUintConfig(const HybridUintConfig uint_config, Writer* writer, size_t log_alpha_size) { /* writes split_exponent, then msb_in_token and lsb_in_token unless split_exponent equals log_alpha_size; field widths depend on the values written before them */ + writer->Write(CeilLog2Nonzero(log_alpha_size + 1), uint_config.split_exponent); + if (uint_config.split_exponent == log_alpha_size) { + return; // msb/lsb don't matter. + } + size_t nbits = CeilLog2Nonzero(uint_config.split_exponent + 1); + writer->Write(nbits, uint_config.msb_in_token); + nbits = CeilLog2Nonzero(uint_config.split_exponent - uint_config.msb_in_token + 1); + writer->Write(nbits, uint_config.lsb_in_token); +} +template +void EncodeUintConfigsNew(const std::vector& uint_config, Writer* writer, size_t log_alpha_size) { + // TODO(veluca): RLE? 
+ for (size_t i = 0; i < uint_config.size(); i++) { + EncodeUintConfig(uint_config[i], writer, log_alpha_size); + } +} + +void ANSBuildInfoTableNew(const ANSHistBin* counts, + const AliasTable::Entry* table, + size_t alphabet_size, + size_t log_alpha_size, + ANSEncSymbolInfo* info) { /* fills info[s].freq_ and sizes reverse_map_ per symbol, then records every table slot in the reverse map of the symbol the alias table assigns it to */ + size_t log_entry_size = ANS_LOG_TAB_SIZE - log_alpha_size; + size_t entry_size_minus_1 = (1 << log_entry_size) - 1; + // create valid alias table for empty streams. + for (size_t s = 0; s < std::max(1, alphabet_size); ++s) { + const ANSHistBin freq = s == alphabet_size ? ANS_TAB_SIZE : counts[s]; /* s can equal alphabet_size only when alphabet_size is 0: that single entry then gets the whole table */ + info[s].freq_ = static_cast(freq); +#ifdef USE_MULT_BY_RECIPROCAL + if (freq != 0) { + info[s].ifreq_ = ((1ull << RECIPROCAL_PRECISION) + info[s].freq_ - 1) / info[s].freq_; + } else { + info[s].ifreq_ = 1; // shouldn't matter (symbol shouldn't occur), but... 
+ } +#endif + info[s].reverse_map_.resize(freq); + } + for (int i = 0; i < ANS_TAB_SIZE; i++) { + AliasTable::Symbol s = AliasTable::Lookup(table, i, log_entry_size, entry_size_minus_1); + info[s.value].reverse_map_[s.offset] = i; + } +} + +float EstimateDataBitsNew(const ANSHistBin* histogram, const ANSHistBin* counts, size_t len) { /* sums histogram[i] times (ANS_LOG_TAB_SIZE minus log2 of counts[i]), floored at 0, over the symbols that occur */ + float sum = 0.0f; + int total_histogram = 0; + int total_counts = 0; + for (size_t i = 0; i < len; ++i) { + total_histogram += histogram[i]; + total_counts += counts[i]; + if (histogram[i] > 0) { + JXL_ASSERT(counts[i] > 0); + // += histogram[i] * -log(counts[i]/total_counts) + sum += histogram[i] * std::max(0.0f, ANS_LOG_TAB_SIZE - FastLog2f(counts[i])); + } + } + if (total_histogram > 0) { + JXL_ASSERT(total_counts == ANS_TAB_SIZE); /* a non-empty histogram requires counts that sum to the full table size */ + } + return sum; +} + +float EstimateDataBitsFlatNew(const ANSHistBin* histogram, size_t len) { /* flat code: every occurrence costs log2(len) bits, floored at 0 */ + const float flat_bits = std::max(FastLog2f(len), 0.0f); + int total_histogram = 0; + for (size_t i = 0; i < len; ++i) { + total_histogram += histogram[i]; + } + return total_histogram * flat_bits; +} + +// Static Huffman code for encoding logcounts. 
The last symbol is used as RLE +// sequence. +static const uint8_t kLogCountBitLengths[ANS_LOG_TAB_SIZE + 2] = { + 5, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 6, 7, 7, +}; +static const uint8_t kLogCountSymbols[ANS_LOG_TAB_SIZE + 2] = { + 17, 11, 15, 3, 9, 7, 4, 2, 5, 6, 0, 33, 1, 65, +}; + +// Returns the difference between largest count that can be represented and is +// smaller than "count" and smallest representable count larger than "count". +static int SmallestIncrement(uint32_t count, uint32_t shift) { + int bits = count == 0 ? -1 : FloorLog2Nonzero(count); + int drop_bits = bits - GetPopulationCountPrecision(bits, shift); + return drop_bits < 0 ? 1 : (1 << drop_bits); +} + +template +bool RebalanceHistogramNew( + const float* targets, int max_symbol, int table_size, uint32_t shift, int* omit_pos, ANSHistBin* counts) { + int sum = 0; + float sum_nonrounded = 0.0; + int remainder_pos = 0; // if all of them are handled in first loop + int remainder_log = -1; + for (int n = 0; n < max_symbol; ++n) { + if (targets[n] > 0 && targets[n] < 1.0f) { + counts[n] = 1; + sum_nonrounded += targets[n]; + sum += counts[n]; + } + } + const float discount_ratio = (table_size - sum) / (table_size - sum_nonrounded); + JXL_ASSERT(discount_ratio > 0); + JXL_ASSERT(discount_ratio <= 1.0f); + // Invariant for minimize_error_of_sum == true: + // abs(sum - sum_nonrounded) + // <= SmallestIncrement(max(targets[])) + max_symbol + for (int n = 0; n < max_symbol; ++n) { + if (targets[n] >= 1.0f) { + sum_nonrounded += targets[n]; + counts[n] = static_cast(targets[n] * discount_ratio); // truncate + if (counts[n] == 0) counts[n] = 1; + if (counts[n] == table_size) counts[n] = table_size - 1; + // Round the count to the closest nonzero multiple of SmallestIncrement + // (when minimize_error_of_sum is false) or one of two closest so as to + // keep the sum as close as possible to sum_nonrounded. 
+ int inc = SmallestIncrement(counts[n], shift); + counts[n] -= counts[n] & (inc - 1); + // TODO(robryk): Should we rescale targets[n]? + const float target = minimize_error_of_sum ? (sum_nonrounded - sum) : targets[n]; + if (counts[n] == 0 || (target > counts[n] + inc / 2 && counts[n] + inc < table_size)) { + counts[n] += inc; + } + sum += counts[n]; + const int count_log = FloorLog2Nonzero(static_cast(counts[n])); + if (count_log > remainder_log) { + remainder_pos = n; + remainder_log = count_log; + } + } + } + JXL_ASSERT(remainder_pos != -1); + // NOTE: This is the only place where counts could go negative. We could + // detect that, return false and make ANSHistBin uint32_t. + counts[remainder_pos] -= sum - table_size; + *omit_pos = remainder_pos; + return counts[remainder_pos] > 0; +} + +Status NormalizeCountsNew(ANSHistBin* counts, + int* omit_pos, + const int length, + const int precision_bits, + uint32_t shift, + int* num_symbols, + int* symbols) { + const int32_t table_size = 1 << precision_bits; // target sum / table size + uint64_t total = 0; + int max_symbol = 0; + int symbol_count = 0; + for (int n = 0; n < length; ++n) { + total += counts[n]; + if (counts[n] > 0) { + if (symbol_count < kMaxNumSymbolsForSmallCode) { + symbols[symbol_count] = n; + } + ++symbol_count; + max_symbol = n + 1; + } + } + *num_symbols = symbol_count; + if (symbol_count == 0) { + return true; + } + if (symbol_count == 1) { + counts[symbols[0]] = table_size; + return true; + } + if (symbol_count > table_size) return JXL_FAILURE("Too many entries in an ANS histogram"); + + // printf("%s: %s: %d, max_symbol=%d\n", __FILE__, __FUNCTION__, __LINE__, max_symbol); + const float norm = 1.f * table_size / total; + std::vector targets(max_symbol); + for (size_t n = 0; n < targets.size(); ++n) { + targets[n] = norm * counts[n]; + } + if (!RebalanceHistogramNew(&targets[0], max_symbol, table_size, shift, omit_pos, counts)) { + // Use an alternative rebalancing mechanism if the one above 
failed + // to create a histogram that is positive wherever the original one was. + if (!RebalanceHistogramNew(&targets[0], max_symbol, table_size, shift, omit_pos, counts)) { + return JXL_FAILURE("Logic error: couldn't rebalance a histogram"); + } + } + return true; +} + +template +bool EncodeCountsNew(const ANSHistBin* counts, + const int alphabet_size, + const int omit_pos, + const int num_symbols, + uint32_t shift, + const int* symbols, + Writer* writer) { + bool ok = true; + if (num_symbols <= 2) { + // Small tree marker to encode 1-2 symbols. + writer->Write(1, 1); + if (num_symbols == 0) { + writer->Write(1, 0); + StoreVarLenUint8New(0, writer); + } else { + writer->Write(1, num_symbols - 1); + for (int i = 0; i < num_symbols; ++i) { + StoreVarLenUint8New(symbols[i], writer); + } + } + if (num_symbols == 2) { + writer->Write(ANS_LOG_TAB_SIZE, counts[symbols[0]]); + } + } else { + // Mark non-small tree. + writer->Write(1, 0); + // Mark non-flat histogram. + writer->Write(1, 0); + + // Precompute sequences for RLE encoding. Contains the number of identical + // values starting at a given index. Only contains the value at the first + // element of the series. + std::vector same(alphabet_size, 0); + int last = 0; + for (int i = 1; i < alphabet_size; i++) { + // Store the sequence length once different symbol reached, or we're at + // the end, or the length is longer than we can encode, or we are at + // the omit_pos. We don't support including the omit_pos in an RLE + // sequence because this value may use a different amount of log2 bits + // than standard, it is too complex to handle in the decoder. 
+ if (counts[i] != counts[last] || i + 1 == alphabet_size || (i - last) >= 255 || i == omit_pos || + i == omit_pos + 1) { + same[last] = (i - last); + last = i + 1; + } + } + + int length = 0; + std::vector logcounts(alphabet_size); + int omit_log = 0; + for (int i = 0; i < alphabet_size; ++i) { + JXL_ASSERT(counts[i] <= ANS_TAB_SIZE); + JXL_ASSERT(counts[i] >= 0); + if (i == omit_pos) { + length = i + 1; + } else if (counts[i] > 0) { + logcounts[i] = FloorLog2Nonzero(static_cast(counts[i])) + 1; + length = i + 1; + if (i < omit_pos) { + omit_log = std::max(omit_log, logcounts[i] + 1); + } else { + omit_log = std::max(omit_log, logcounts[i]); + } + } + } + logcounts[omit_pos] = omit_log; + + // Elias gamma-like code for shift. Only difference is that if the number + // of bits to be encoded is equal to FloorLog2(ANS_LOG_TAB_SIZE+1), we skip + // the terminating 0 in unary coding. + int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1); + int log = FloorLog2Nonzero(shift + 1); + writer->Write(log, (1 << log) - 1); + if (log != upper_bound_log) writer->Write(1, 0); + writer->Write(log, ((1 << log) - 1) & (shift + 1)); + + // Since num_symbols >= 3, we know that length >= 3, therefore we encode + // length - 3. + if (length - 3 > 255) { + // Pretend that everything is OK, but complain about correctness later. + StoreVarLenUint8New(255, writer); + ok = false; + } else { + StoreVarLenUint8New(length - 3, writer); + } + + // The logcount values are encoded with a static Huffman code. + static const size_t kMinReps = 4; + size_t rep = ANS_LOG_TAB_SIZE + 1; + // printf("%s: %s: %d, length=%d\n", __FILE__, __FUNCTION__, __LINE__, length); + for (int i = 0; i < length; ++i) { + if (i > 0 && same[i - 1] > kMinReps) { + // Encode the RLE symbol and skip the repeated ones. 
+ writer->Write(kLogCountBitLengths[rep], kLogCountSymbols[rep]); + StoreVarLenUint8New(same[i - 1] - kMinReps - 1, writer); + i += same[i - 1] - 2; + continue; + } + writer->Write(kLogCountBitLengths[logcounts[i]], kLogCountSymbols[logcounts[i]]); + } + for (int i = 0; i < length; ++i) { + if (i > 0 && same[i - 1] > kMinReps) { + // Skip symbols encoded by RLE. + i += same[i - 1] - 2; + continue; + } + if (logcounts[i] > 1 && i != omit_pos) { + int bitcount = GetPopulationCountPrecision(logcounts[i] - 1, shift); + int drop_bits = logcounts[i] - 1 - bitcount; + JXL_CHECK((counts[i] & ((1 << drop_bits) - 1)) == 0); + writer->Write(bitcount, (counts[i] >> drop_bits) - (1 << bitcount)); + } + } + } + return ok; +} + +void EncodeFlatHistogramNew(const int alphabet_size, BitWriter* writer) { + // Mark non-small tree. + writer->Write(1, 0); + // Mark uniform histogram. + writer->Write(1, 1); + JXL_ASSERT(alphabet_size > 0); + // Encode alphabet size. + StoreVarLenUint8New(alphabet_size - 1, writer); +} + +float ComputeHistoAndDataCostNew(const ANSHistBin* histogram, size_t alphabet_size, uint32_t method) { + if (method == 0) { // Flat code + return ANS_LOG_TAB_SIZE + 2 + EstimateDataBitsFlatNew(histogram, alphabet_size); + } + // Non-flat: shift = method-1. + uint32_t shift = method - 1; + std::vector counts(histogram, histogram + alphabet_size); + int omit_pos = 0; + int num_symbols; + int symbols[kMaxNumSymbolsForSmallCode] = {}; + JXL_CHECK( + NormalizeCountsNew(counts.data(), &omit_pos, alphabet_size, ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols)); + SizeWriterNew writer; + // Ignore the correctness, no real encoding happens at this stage. 
+    (void)EncodeCountsNew(counts.data(), alphabet_size, omit_pos, num_symbols, shift, symbols, &writer);
+    return writer.size + EstimateDataBitsNew(histogram, counts.data(), alphabet_size);
+}
+
+// Picks the histogram coding method with the lowest cost reported by ComputeHistoAndDataCostNew.
+//
+// Method 0 is evaluated first. After that every candidate `shift` in [0, ANS_LOG_TAB_SIZE] is tried as
+// method `shift + 1`; the loop advances `shift` by 1 when the strategy is kPrecise and by 2 otherwise.
+// With kFast the search stops at the first candidate that is not strictly cheaper than the best so far.
+//
+// histogram, alphabet_size: forwarded unchanged to ComputeHistoAndDataCostNew.
+// cost: output; receives the lowest cost found.
+// Returns the winning method (0, or shift + 1). It is tracked as size_t and narrowed to uint32_t on return.
+uint32_t ComputeBestMethodNew(const ANSHistBin* histogram,
+                              size_t alphabet_size,
+                              float* cost,
+                              HistogramParams::ANSHistogramStrategy ans_histogram_strategy) {
+    size_t method = 0;
+    // Baseline: cost of method 0 (the caller below treats method 0 as the flat-histogram case).
+    float fcost = ComputeHistoAndDataCostNew(histogram, alphabet_size, 0);
+    // printf("%s: %s: %d, ANS_LOG_TAB_SIZE=%d, ans_histogram_strategy=%d\n", __FILE__, __FUNCTION__, __LINE__,
+    // ANS_LOG_TAB_SIZE, ans_histogram_strategy != HistogramParams::ANSHistogramStrategy::kPrecise);
+    for (uint32_t shift = 0; shift <= ANS_LOG_TAB_SIZE;
+         ans_histogram_strategy != HistogramParams::ANSHistogramStrategy::kPrecise ? shift += 2 : shift++) {
+        float c = ComputeHistoAndDataCostNew(histogram, alphabet_size, shift + 1);
+        if (c < fcost) {
+            method = shift + 1;
+            fcost = c;
+        } else if (ans_histogram_strategy == HistogramParams::ANSHistogramStrategy::kFast) {
+            // do not be as precise if estimating cost.
+            break;
+        }
+    }
+    // printf("%s: %s: %d, alphabet_size=%zu, method=%zu, fcost=%f, ANS_TAB_SIZE=%d\n",
+    // __FILE__, __FUNCTION__, __LINE__,
+    // alphabet_size, method, fcost);
+    *cost = fcost;
+    return method;
+}
+
+// Fills `info` with per-symbol encoding data for one histogram and returns a cost as size_t.
+// `writer` may be null; every call that writes histogram data is either guarded by a null check or redirected
+// to a local BitWriter.
+//
+// use_prefix_code == true:
+//   - returns 0 immediately when alphabet_size <= 1 (nothing is written and `info` is not touched);
+//   - otherwise calls BuildAndStoreHuffmanTree, copies the resulting depths/bits into `info`, and returns
+//     (bits written by that call) + sum over i of histogram[i] * info[i].depth.
+// use_prefix_code == false:
+//   - asserts alphabet_size <= ANS_TAB_SIZE and shrinks alphabet_size to drop trailing zero bins;
+//   - asks ComputeBestMethodNew for a method and its cost;
+//   - method 0: replaces the counts with CreateFlatHistogram(...) and, if `writer` is set, calls
+//     EncodeFlatHistogramNew; otherwise normalizes the counts with shift = method - 1 and, if `writer` is set,
+//     calls EncodeCountsNew;
+//   - in both cases builds the alias table and the info table, and returns the float cost converted to size_t.
+//
+// NOTE(review): several std::vector declarations below carry no element type in this copy of the file; the
+// template arguments look like they were stripped. Confirm the element types against the original source.
+size_t BuildAndStoreANSEncodingDataNew(HistogramParams::ANSHistogramStrategy ans_histogram_strategy,
+                                       const ANSHistBin* histogram,
+                                       size_t alphabet_size,
+                                       size_t log_alpha_size,
+                                       bool use_prefix_code,
+                                       ANSEncSymbolInfo* info,
+                                       BitWriter* writer) {
+    // printf("%s: %s: %d, ans_histogram_strategy=%d, alphabet_size=%zu, log_alpha_size=%zu, ANS_TAB_SIZE=%d,
+    // ANS_MAX_ALPHABET_SIZE=%d, ANS_LOG_TAB_SIZE=%d\n",
+    // __FILE__, __FUNCTION__, __LINE__,
+    // ans_histogram_strategy, alphabet_size, log_alpha_size, ANS_TAB_SIZE, ANS_MAX_ALPHABET_SIZE, ANS_LOG_TAB_SIZE);
+    if (use_prefix_code) {
+        if (alphabet_size <= 1) return 0;
+        // Copy of the histogram handed to BuildAndStoreHuffmanTree; each source bin is checked to be >= 0.
+        std::vector histo(alphabet_size);
+        for (size_t i = 0; i < alphabet_size; i++) {
+            histo[i] = histogram[i];
+            JXL_CHECK(histogram[i] >= 0);
+        }
+        size_t cost = 0;
+        {
+            std::vector depths(alphabet_size);
+            std::vector bits(alphabet_size);
+            // With no caller-supplied writer, write into a scratch writer so the bit count can still be measured.
+            BitWriter tmp_writer;
+            BitWriter* w = writer ? writer : &tmp_writer;
+            size_t start = w->BitsWritten();
+            BitWriter::Allotment allotment(w, 8 * alphabet_size + 8); // safe upper bound
+            BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(), bits.data(), w);
+            ReclaimAndCharge(w, &allotment, 0, /*aux_out=*/nullptr);
+
+            // Symbols with depth 0 get bits = 0; all others take the bits produced above.
+            for (size_t i = 0; i < alphabet_size; i++) {
+                info[i].bits = depths[i] == 0 ? 0 : bits[i];
+                info[i].depth = depths[i];
+            }
+            // Header cost = number of bits the tree description added to `w`.
+            cost = w->BitsWritten() - start;
+        }
+        // Estimate data cost.
+        for (size_t i = 0; i < alphabet_size; i++) {
+            cost += histogram[i] * info[i].depth;
+        }
+        return cost;
+    }
+    JXL_ASSERT(alphabet_size <= ANS_TAB_SIZE);
+    // Ensure we ignore trailing zeros in the histogram.
+    if (alphabet_size != 0) {
+        size_t largest_symbol = 0;
+        for (size_t i = 0; i < alphabet_size; i++) {
+            if (histogram[i] != 0) largest_symbol = i;
+        }
+        alphabet_size = largest_symbol + 1;
+    }
+    // printf("%s: %s: %d, updated alphabet_size=%zu\n", __FILE__, __FUNCTION__, __LINE__, alphabet_size);
+    float cost;
+    uint32_t method = ComputeBestMethodNew(histogram, alphabet_size, &cost, ans_histogram_strategy);
+    JXL_ASSERT(cost >= 0);
+    // num_symbols is only passed to NormalizeCountsNew / EncodeCountsNew, i.e. on the method != 0 path.
+    int num_symbols;
+    int symbols[kMaxNumSymbolsForSmallCode] = {};
+    std::vector counts(histogram, histogram + alphabet_size);
+    if (!counts.empty()) {
+        size_t sum = 0;
+        for (size_t i = 0; i < counts.size(); i++) {
+            sum += counts[i];
+        }
+        // All-zero histogram: assign ANS_TAB_SIZE to the first bin so the counts are not all zero.
+        // (When method == 0, `counts` is replaced by the flat histogram just below anyway.)
+        if (sum == 0) {
+            counts[0] = ANS_TAB_SIZE;
+        }
+    }
+    if (method == 0) {
+        counts = CreateFlatHistogram(alphabet_size, ANS_TAB_SIZE);
+        AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE];
+        InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a);
+        ANSBuildInfoTableNew(counts.data(), a, alphabet_size, log_alpha_size, info);
+        if (writer != nullptr) {
+            EncodeFlatHistogramNew(alphabet_size, writer);
+        }
+        return cost;
+    }
+    int omit_pos = 0;
+    // ComputeBestMethodNew returns shift + 1 for non-flat methods; undo that offset here.
+    uint32_t shift = method - 1;
+    JXL_CHECK(
+        NormalizeCountsNew(counts.data(), &omit_pos, alphabet_size, ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols));
+    AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE];
+    InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a);
+    ANSBuildInfoTableNew(counts.data(), a, alphabet_size, log_alpha_size, info);
+    if (writer != nullptr) {
+        bool ok = EncodeCountsNew(counts.data(), alphabet_size, omit_pos, num_symbols, shift, symbols, writer);
+        // NOTE(review): presumably keeps `ok` "used" in builds where JXL_DASSERT expands to nothing - confirm.
+        (void)ok;
+        JXL_DASSERT(ok);
+    }
+    return cost;
+}
+
+// Returns the cost that ComputeBestMethodNew reports for `data` under the kFast strategy.
+// The chosen method itself is discarded.
+float ANSPopulationCostNew(const ANSHistBin* data, size_t alphabet_size) {
+    float c;
+    ComputeBestMethodNew(data, alphabet_size, &c, HistogramParams::ANSHistogramStrategy::kFast);
+    return c;
+}
+
+// Fills codes->use_prefix_code, codes->uint_config and codes->encoding_info (one entry appended per element of
+// `clustered_histograms`) and returns an accumulated cost.
+//
+// Header part: one bit for use_prefix_code (cost += 1) and, for the non-prefix case, a 2-bit field holding
+// log_alpha_size - 5 (cost += 2); both are written only when `writer` is non-null. EncodeUintConfigsNew and
+// StoreVarLenUint16New target `writer` when it is non-null and a local SizeWriterNew otherwise; only the
+// SizeWriterNew's `size` is added to the returned cost.
+// Per-histogram part: the return value of BuildAndStoreANSEncodingDataNew is added for every histogram.
+//
+// Observations from this body:
+//   - `tokens` is not referenced anywhere in the function.
+//   - `clustered_histograms` is taken by value.
+//   - `layer` and `aux_out` are only forwarded to ReclaimAndCharge.
+//   - `ans_fuzzer_friendly_` is not declared in this function; it comes from an enclosing scope not shown here.
+//
+// NOTE(review): `std::vector >& tokens`, `std::vector clustered_histograms` and `std::max(1, num_symbol)` look
+// like they lost their template arguments in this copy (the std::max call mixes an int literal with a size_t).
+// Confirm against the original source before relying on these declarations.
+size_t StoreEntropyCodesNew(const HistogramParams& params,
+                            const std::vector >& tokens,
+                            EntropyEncodingData* codes,
+                            bool use_prefix_code,
+                            BitWriter* writer,
+                            size_t layer,
+                            AuxOut* aux_out,
+                            std::vector clustered_histograms) {
+    size_t cost = 0;
+    codes->use_prefix_code = use_prefix_code;
+    size_t log_alpha_size = codes->lz77.enabled ? 8 : 7; // Sane default.
+    if (ans_fuzzer_friendly_) {
+        // A single uint config, HybridUintConfig(7, 0, 0), regardless of the number of histograms.
+        codes->uint_config.clear();
+        codes->uint_config.resize(1, HybridUintConfig(7, 0, 0));
+    } else {
+        // One config per clustered histogram; kContextMap forces every config to HybridUintConfig(2, 0, 1).
+        codes->uint_config.resize(clustered_histograms.size());
+        if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) {
+            codes->uint_config.clear();
+            codes->uint_config.resize(clustered_histograms.size(), HybridUintConfig(2, 0, 1));
+        }
+    }
+    // log_alpha_size is 7 or 8 at this point, so this clamp does not fire in this function as written.
+    if (log_alpha_size < 5) log_alpha_size = 5;
+    SizeWriterNew size_writer; // Used if writer == nullptr to estimate costs.
+    cost += 1;
+    if (writer) writer->Write(1, use_prefix_code);
+    if (use_prefix_code) {
+        log_alpha_size = PREFIX_MAX_BITS;
+    } else {
+        cost += 2;
+    }
+    if (writer == nullptr) {
+        EncodeUintConfigsNew(codes->uint_config, &size_writer, log_alpha_size);
+    } else {
+        if (!use_prefix_code) writer->Write(2, log_alpha_size - 5);
+        EncodeUintConfigsNew(codes->uint_config, writer, log_alpha_size);
+    }
+    if (use_prefix_code) {
+        // Prefix-code path: store (num_symbol - 1) for every histogram before the per-histogram data.
+        for (size_t c = 0; c < clustered_histograms.size(); ++c) {
+            // num_symbol = index of the last non-zero bin + 1, and never less than 1.
+            size_t num_symbol = 1;
+            for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) {
+                if (clustered_histograms[c].data_[i]) num_symbol = i + 1;
+            }
+            if (writer) {
+                StoreVarLenUint16New(num_symbol - 1, writer);
+            } else {
+                StoreVarLenUint16New(num_symbol - 1, &size_writer);
+            }
+        }
+    }
+    cost += size_writer.size;
+    // printf("%s: %s: %d, final clustered_histograms size=%zu\n", __FILE__, __FUNCTION__, __LINE__,
+    // clustered_histograms.size());
+    for (size_t c = 0; c < clustered_histograms.size(); ++c) {
+        // Same "last non-zero bin + 1" computation as above, redone for this histogram.
+        size_t num_symbol = 1;
+        // printf("%s: %s: %d, final clustered_histograms data size=%zu\n", __FILE__, __FUNCTION__, __LINE__,
+        // clustered_histograms[c].data_.size());
+        for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) {
+            if (clustered_histograms[c].data_[i]) num_symbol = i + 1;
+        }
+        // Append a fresh encoding_info entry with num_symbol slots; its storage is filled by the call below.
+        codes->encoding_info.emplace_back();
+        codes->encoding_info.back().resize(std::max(1, num_symbol));
+        // printf("%s: %s: %d, encoding_info size=%zu, adder=%zu\n", __FILE__, __FUNCTION__, __LINE__,
+        // codes->encoding_info.size(), num_symbol);
+        BitWriter::Allotment allotment(writer, 256 + num_symbol * 24);
+        cost += BuildAndStoreANSEncodingDataNew(params.ans_histogram_strategy, clustered_histograms[c].data_.data(),
+                                                num_symbol, log_alpha_size, use_prefix_code,
+                                                codes->encoding_info.back().data(), writer);
+        allotment.FinishedHistogram(writer);
+        ReclaimAndCharge(writer, &allotment, layer, aux_out);
+    }
+    return cost;
+}
+} // namespace jxl
+
+#endif
diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_host.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_host.hpp
new file mode 100644
index 0000000000..01fb212447
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_host.hpp
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +#ifndef ACC_HOST_HPP +#define ACC_HOST_HPP + +#include "acc_common.hpp" +#include "acc_phase1.hpp" +#include "acc_phase2.hpp" +#include "acc_phase3.hpp" + +namespace jxl { + +Status acc_host(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + const ImageBundle* JXL_RESTRICT ib_or_linear, + ThreadPool* pool, + std::unique_ptr& modular_frame_encoder, + BitWriter* writer, + AuxOut* aux_out, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + CompressParams cparams, + const std::vector* extra_channels, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + const size_t num_groups, + const ImageBundle& ib, + std::vector& aux_outs, + const std::function& resize_aux_outs); +} +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase1.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase1.hpp new file mode 100644 index 0000000000..202b712afd --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase1.hpp @@ -0,0 +1,92 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef ACC_PHASE1_HPP +#define ACC_PHASE1_HPP + +#include + +#include "acc_common.hpp" +#include "xlnx_cfg.h" + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "acc_enc_ac_strategy.hpp" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "acc_enc_chroma_from_luma.hpp" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include 
"lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +Status acc_phase1(Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + AuxOut* aux_out, + ThreadPool* pool); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase2.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase2.hpp new file mode 100644 index 0000000000..4adc5b8932 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase2.hpp @@ -0,0 +1,95 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef HLS_PHASE2_HPP +#define HLS_PHASE2_HPP + +#include + +#include "acc_common.hpp" +#include "xlnx_cfg.h" + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "acc_enc_ac_strategy.hpp" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" 
+#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "acc_enc_chroma_from_luma.hpp" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" +#include "lib/jxl/enc_transforms-inl.h" + +namespace jxl { + +Status acc_phase2(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const std::vector* extra_channels, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + ThreadPool* pool, + AuxOut* aux_out); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase3.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase3.hpp new file mode 100644 index 0000000000..f36ca63ed8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/acc_phase3.hpp @@ -0,0 +1,102 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef HLS_PHASE3_HPP +#define HLS_PHASE3_HPP + +#include + +#include "acc_common.hpp" +#include "xlnx_cfg.h" + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "acc_enc_ac_strategy.hpp" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_cluster.h" +#include "acc_enc_chroma_from_luma.hpp" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_huffman.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include 
"lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" +#include "lib/jxl/modular/encoding/ma_common.h" + +namespace jxl { + +Status acc_phase3(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + BitWriter* writer, + const size_t num_groups, + AuxOut* aux_out, + ThreadPool* pool, + std::vector& aux_outs, + const ImageBundle& ib, + const std::function& resize_aux_outs); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/xlnx_cfg.h b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/xlnx_cfg.h new file mode 100644 index 0000000000..bc8ce19ab6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_cluster_histogram/xlnx_cfg.h @@ -0,0 +1,35 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef XLNX_CFG_H +#define XLNX_CFG_H + +//#define XLNX_DEBUG_DCT +//#define XLNX_DEBUG_CMAP + +//#define XLNX_QC_DEBUG +//#define XLNX_QC_DEBUG_AC_ESTIMATE_ENTROPY +//#define XLNX_QC_DEBUG_DCT +//#define XLNX_QC_DEBUG_DC +//#define XLNX_QC_DEBUG_ENC_GROUP +//#define XLNX_QC_DEBUG_ENC_GROUP_DC + +#define XLNX_DISABLE_BLK_DICT +#define XLNX_DISABLE_RECT_DCT +#define XLNX_DISABLE_ARC +#define XLNX_DISABLE_2NDCMP + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_host.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_host.hpp new file mode 100644 index 0000000000..01fb212447 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_host.hpp @@ -0,0 +1,35 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef ACC_HOST_HPP +#define ACC_HOST_HPP + +#include "acc_common.hpp" +#include "acc_phase1.hpp" +#include "acc_phase2.hpp" +#include "acc_phase3.hpp" + +namespace jxl { + +Status acc_host(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + const ImageBundle* JXL_RESTRICT ib_or_linear, + ThreadPool* pool, + std::unique_ptr& modular_frame_encoder, + BitWriter* writer, + AuxOut* aux_out, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + CompressParams cparams, + const std::vector* extra_channels, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + const size_t num_groups, + const ImageBundle& ib, + std::vector& aux_outs, + const std::function& resize_aux_outs); +} +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase1.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase1.hpp new file mode 100644 index 0000000000..202b712afd --- /dev/null +++ 
b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase1.hpp @@ -0,0 +1,92 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef ACC_PHASE1_HPP +#define ACC_PHASE1_HPP + +#include + +#include "acc_common.hpp" +#include "xlnx_cfg.h" + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "acc_enc_ac_strategy.hpp" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "acc_enc_chroma_from_luma.hpp" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" 
+#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +Status acc_phase1(Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + AuxOut* aux_out, + ThreadPool* pool); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase2.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase2.hpp new file mode 100644 index 0000000000..0d737bf54c --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase2.hpp @@ -0,0 +1,96 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef HLS_PHASE2_HPP +#define HLS_PHASE2_HPP + +#include "ap_int.h" +#include "ap_fixed.h" +#include "hls_math.h" + +#include "acc_common.hpp" +#include "xlnx_cfg.h" + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_ac_strategy.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_chroma_from_luma.h" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "lib/jxl/enc_group.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include 
"lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +Status acc_phase2(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const std::vector* extra_channels, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + ThreadPool* pool, + AuxOut* aux_out); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase3.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase3.hpp new file mode 100644 index 0000000000..acea1f77e9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/acc_phase3.hpp @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef HLS_PHASE3_HPP +#define HLS_PHASE3_HPP + +#include + +#include "acc_common.hpp" +// #include "xlnx_cfg.h" + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_ac_strategy.h" +#include 
"lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_chroma_from_luma.h" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "lib/jxl/enc_group.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +Status acc_phase3(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + BitWriter* writer, + const size_t num_groups, + AuxOut* aux_out, + ThreadPool* pool, + std::vector& aux_outs, + const ImageBundle& ib, + const std::function& resize_aux_outs); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/xlnx_cfg.h b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/xlnx_cfg.h new file mode 100644 index 0000000000..2d28564478 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_lossy_enc_compute/xlnx_cfg.h @@ -0,0 +1,36 @@ +/* + * Copyright 2022 Xilinx, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef XLNX_CFG_H +#define XLNX_CFG_H + +//#define XLNX_DEBUG_DCT +//#define XLNX_DEBUG_CMAP + +//#define XLNX_QC_DEBUG +//#define XLNX_QC_DEBUG_AC_ESTIMATE_ENTROPY +//#define XLNX_QC_DEBUG_DCT +//#define XLNX_QC_DEBUG_DC +//#define XLNX_QC_DEBUG_ENC_GROUP +//#define XLNX_QC_DEBUG_ENC_GROUP_DC + +#define XLNX_DISABLE_BLK_DICT +#define XLNX_DISABLE_RECT_DCT +#define XLNX_DISABLE_ARC +#define XLNX_DISABLE_2NDCMP +#define DISABLE_ACC_BIT_WRITER + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_host.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_host.hpp new file mode 100644 index 0000000000..01fb212447 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_host.hpp @@ -0,0 +1,35 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef ACC_HOST_HPP +#define ACC_HOST_HPP + +#include "acc_common.hpp" +#include "acc_phase1.hpp" +#include "acc_phase2.hpp" +#include "acc_phase3.hpp" + +namespace jxl { + +Status acc_host(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + const ImageBundle* JXL_RESTRICT ib_or_linear, + ThreadPool* pool, + std::unique_ptr& modular_frame_encoder, + BitWriter* writer, + AuxOut* aux_out, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + CompressParams cparams, + const std::vector* extra_channels, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + const size_t num_groups, + const ImageBundle& ib, + std::vector& aux_outs, + const std::function& resize_aux_outs); +} +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase1.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase1.hpp new file mode 100644 index 0000000000..202b712afd --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase1.hpp @@ -0,0 +1,92 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef ACC_PHASE1_HPP +#define ACC_PHASE1_HPP + +#include + +#include "acc_common.hpp" +#include "xlnx_cfg.h" + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "acc_enc_ac_strategy.hpp" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "acc_enc_chroma_from_luma.hpp" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include 
"lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +Status acc_phase1(Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + AuxOut* aux_out, + ThreadPool* pool); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase2.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase2.hpp new file mode 100644 index 0000000000..4adc5b8932 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase2.hpp @@ -0,0 +1,95 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef HLS_PHASE2_HPP +#define HLS_PHASE2_HPP + +#include + +#include "acc_common.hpp" +#include "xlnx_cfg.h" + +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "acc_enc_ac_strategy.hpp" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" 
+#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "acc_enc_chroma_from_luma.hpp" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" +#include "lib/jxl/enc_transforms-inl.h" + +namespace jxl { + +Status acc_phase2(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const std::vector* extra_channels, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + ThreadPool* pool, + AuxOut* aux_out); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase3.hpp b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase3.hpp new file mode 100644 index 0000000000..2c87cd9993 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/acc_phase3.hpp @@ -0,0 +1,97 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef ACC_PHASE3_HPP +#define ACC_PHASE3_HPP + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "acc_common.hpp" +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "acc_enc_cluster.hpp" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "lib/jxl/enc_huffman.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_photon_noise.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/modular/encoding/ma_common.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include 
"lib/jxl/toc.h" +#include "acc_enc_ac_strategy.hpp" +#include "acc_enc_chroma_from_luma.hpp" +#include "acc_enc_group.hpp" +#include "xlnx_cfg.h" + +namespace jxl { + +Status acc_phase3(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + BitWriter* writer, + const size_t num_groups, + AuxOut* aux_out, + ThreadPool* pool, + std::vector& aux_outs, + const ImageBundle& ib, + const std::function& resize_aux_outs); +} + +#endif diff --git a/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/xlnx_cfg.h b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/xlnx_cfg.h new file mode 100644 index 0000000000..bc8ce19ab6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/include/host_acc_tokInit_histogram/xlnx_cfg.h @@ -0,0 +1,35 @@ +/* + * Copyright 2022 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef XLNX_CFG_H +#define XLNX_CFG_H + +//#define XLNX_DEBUG_DCT +//#define XLNX_DEBUG_CMAP + +//#define XLNX_QC_DEBUG +//#define XLNX_QC_DEBUG_AC_ESTIMATE_ENTROPY +//#define XLNX_QC_DEBUG_DCT +//#define XLNX_QC_DEBUG_DC +//#define XLNX_QC_DEBUG_ENC_GROUP +//#define XLNX_QC_DEBUG_ENC_GROUP_DC + +#define XLNX_DISABLE_BLK_DICT +#define XLNX_DISABLE_RECT_DCT +#define XLNX_DISABLE_ARC +#define XLNX_DISABLE_2NDCMP + +#endif diff --git a/codec/L2/demos/jxlEnc/others/src/acc_cluster_histogram.cpp b/codec/L2/demos/jxlEnc/others/src/acc_cluster_histogram.cpp new file mode 100644 index 0000000000..ead487d886 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_cluster_histogram.cpp @@ -0,0 +1,201 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef ACC_CLUSTER_HISTOGRAM_CPP +#define ACC_CLUSTER_HISTOGRAM_CPP + +#include "acc_cluster_histogram.hpp" + +namespace jxl { +void acc_ANSclusterHistogram(bool is_small_image, + bool do_once[5], + char* do_inner, + char* do_prefix_in, + + std::vector& params, + + std::vector >& histograms_, + std::vector& num_contexts, + std::vector*> context_map, + std::vector >& nonempty_histograms, + std::vector& largest_idx, + + std::vector codes, + std::vector >& clustered_histograms, + std::vector >& histogram_symbols, + + std::vector writer, + std::vector layer, + std::vector >& clustered_histogramsin, + std::vector > >& tokensin, + std::vector& codesin, + std::vector >& context_map_in) { + constexpr float kMinDistanceForDistinctFast = 64.0f; + constexpr float kMinDistanceForDistinctBest = 16.0f; + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + + codes[i]->lz77.nonserialized_distance_context = num_contexts[i]; + codes[i]->lz77.enabled = false; + codes[i]->lz77.min_symbol = 224; + codes[i]->encoding_info.clear(); + context_map[i]->resize(histograms_[i].size()); + 
clustered_histograms[i] = histograms_[i]; + + if (histograms_[i].size() > 1) { + size_t max_histograms = std::min(kClustersLimit, params[i].max_histograms); + acc_FastClusterHistograms(histograms_[i], nonempty_histograms[i], largest_idx[i], + nonempty_histograms[i].size(), max_histograms, kMinDistanceForDistinctFast, + &clustered_histograms[i], &histogram_symbols[i]); + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + if (histograms_[i].size() > 1) { + // Convert the context map to a canonical form. + HistogramReindex(&clustered_histograms[i], &histogram_symbols[i]); + + for (size_t c = 0; c < histograms_[i].size(); ++c) { + (*context_map[i])[c] = static_cast(histogram_symbols[i][c]); + } + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + size_t histograms_size = histograms_[i].size(); + if (histograms_size > 1) { + if (writer[i] != nullptr) { + size_t num_histograms = clustered_histograms[i].size(); + if (num_histograms == 1) { + } else { + for (size_t j = 0; j < (*context_map[i]).size(); j++) { + tokensin[i][0].emplace_back(0, (*context_map[i])[j]); + } + + size_t entry_bits = CeilLog2Nonzero(num_histograms); + if (entry_bits < 4) { + } else { + do_inner[i] = 1; + } + } + } + } + + if (do_inner[i]) { + codesin[i].lz77.nonserialized_distance_context = 1; + codesin[i].lz77.enabled = false; + codesin[i].lz77.min_symbol = 224; + + bool use_prefix_code = false; + do_prefix_in[i] = (char)use_prefix_code; + + std::vector ctxHistograms_(1); + HybridUintConfig uint_config; // Default config for clustering. 
+ + for (size_t j = 0; j < tokensin[i].size(); ++j) { + for (size_t k = 0; k < tokensin[i][j].size(); ++k) { + const Token token = tokensin[i][j][k]; + uint32_t tok, nbits, bits; + uint_config.Encode(token.value, &tok, &nbits, &bits); + ctxHistograms_[0].Add(tok); + clustered_histogramsin[i] = ctxHistograms_; + + codesin[i].encoding_info.clear(); + context_map_in[i].resize(clustered_histogramsin[i].size()); + } + } + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + + if (i == 0) { + if (!is_small_image) { + writer[0]->update_part(1); + } else { + writer[0]->update_part(1); + } + + } else if (i == 1) { + if (!is_small_image) { + writer[1]->update_part(31); + } else { + writer[1]->update_part(31); + } + } else if (i == 2) { + if (!is_small_image) { + writer[2]->update_part(51); + } else { + writer[2]->update_part(51); + } + } else if (i == 3) { + if (!is_small_image) { + writer[3]->update_part(1); + } else { + writer[3]->update_part(81); + } + } else if (i == 4) { + if (!is_small_image) { + writer[4]->update_part(21); + } else { + writer[4]->update_part(101); + } + } + + size_t histograms_size = histograms_[i].size(); + + const size_t max_contexts = std::min(num_contexts[i], kClustersLimit); + BitWriter::Allotment allotment(writer[i], 128 + num_contexts[i] * 40 + max_contexts * 96); + if (writer[i]) { + JXL_CHECK(Bundle::Write(codes[i]->lz77, writer[i], layer[i], nullptr)); + } + + if (histograms_size > 1) { + size_t num_histograms = clustered_histograms[i].size(); + if (writer[i] != nullptr) { + // printf("%s: %s: %d, Start EncodeContextMap context size=%zu\n\n", + // __FILE__, __FUNCTION__, __LINE__, (*context_map).size()); + if (num_histograms == 1) { + writer[i]->Write(1, 1); + writer[i]->Write(2, 0); + } else { + size_t entry_bits = CeilLog2Nonzero(num_histograms); + if (entry_bits < 4) { + writer[i]->Write(1, 1); + writer[i]->Write(2, entry_bits); + for (size_t j = 0; j < (*context_map[i]).size(); j++) { + writer[i]->Write(entry_bits, 
(*context_map[i])[j]); + } + } else { + writer[i]->Write(1, 0); + writer[i]->Write(1, 0); + } + } + } + } + // StoreEntropyCodesNew + allotment.FinishedHistogram(writer[i]); + ReclaimAndCharge(writer[i], &allotment, layer[i], nullptr); + + if (do_inner[i]) { + // do inner ontext map = true + BitWriter::Allotment allotment(writer[i], 128 + 1 * 40 + 96); + JXL_CHECK(Bundle::Write(codesin[i].lz77, writer[i], 0, nullptr)); + + // StoreEntropyCodesNew + // WriteToken + allotment.FinishedHistogram(writer[i]); + ReclaimAndCharge(writer[i], &allotment, 0, nullptr); + } + } +} + +} // namespace jxl + +#endif diff --git a/codec/L2/demos/jxlEnc/others/src/acc_enc_ac_strategy.cpp b/codec/L2/demos/jxlEnc/others/src/acc_enc_ac_strategy.cpp new file mode 100644 index 0000000000..6fdd0da827 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_enc_ac_strategy.cpp @@ -0,0 +1,1197 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +#include "acc_enc_ac_strategy.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include "xlnx_cfg.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "xilinx/src/acc_enc_ac_strategy.cpp" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/fast_math-inl.h" + +// Some of the floating point constants in this file and in other +// files in the libjxl project have been obtained using the +// tools/optimizer/simplex_fork.py tool. 
It is a variation of +// Nelder-Mead optimization, and we generally try to minimize +// BPP * pnorm aggregate as reported by the benchmark_xl tool, +// but occasionally the values are optimized by using additional +// constraints such as maintaining a certain density, or ratio of +// popularity of integral transforms. Jyrki visually reviews all +// such changes and often makes manual changes to maintain good +// visual quality to changes where butteraugli was not sufficiently +// sensitive to some kind of degradation. Unfortunately image quality +// is still more of an art than science. + +// This must come before the begin/end_target, but HWY_ONCE is only true +// after that, so use an "include guard". +#ifndef LIB_JXL_ENC_AC_STRATEGY_ +#define LIB_JXL_ENC_AC_STRATEGY_ +// Parameters of the heuristic are marked with a OPTIMIZE comment. +namespace jxl { + +// Debugging utilities. + +// Returns a linear sRGB color (as bytes) for each AC strategy. +const uint8_t* TypeColor(const uint8_t& raw_strategy) { + JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); + static_assert(AcStrategy::kNumValidStrategies == 27, "Change colors"); + static constexpr uint8_t kColors[][3] = { + {0xFF, 0xFF, 0x00}, // DCT8 + {0xFF, 0x80, 0x80}, // HORNUSS + {0xFF, 0x80, 0x80}, // DCT2x2 + {0xFF, 0x80, 0x80}, // DCT4x4 + {0x80, 0xFF, 0x00}, // DCT16x16 + {0x00, 0xC0, 0x00}, // DCT32x32 + {0xC0, 0xFF, 0x00}, // DCT16x8 + {0xC0, 0xFF, 0x00}, // DCT8x16 + {0x00, 0xFF, 0x00}, // DCT32x8 + {0x00, 0xFF, 0x00}, // DCT8x32 + {0x00, 0xFF, 0x00}, // DCT32x16 + {0x00, 0xFF, 0x00}, // DCT16x32 + {0xFF, 0x80, 0x00}, // DCT4x8 + {0xFF, 0x80, 0x00}, // DCT8x4 + {0xFF, 0xFF, 0x80}, // AFV0 + {0xFF, 0xFF, 0x80}, // AFV1 + {0xFF, 0xFF, 0x80}, // AFV2 + {0xFF, 0xFF, 0x80}, // AFV3 + {0x00, 0xC0, 0xFF}, // DCT64x64 + {0x00, 0xFF, 0xFF}, // DCT64x32 + {0x00, 0xFF, 0xFF}, // DCT32x64 + {0x00, 0x40, 0xFF}, // DCT128x128 + {0x00, 0x80, 0xFF}, // DCT128x64 + {0x00, 0x80, 0xFF}, // DCT64x128 + {0x00, 0x00, 
0xC0}, // DCT256x256 + {0x00, 0x00, 0xFF}, // DCT256x128 + {0x00, 0x00, 0xFF}, // DCT128x256 + }; + return kColors[raw_strategy]; +} + +const uint8_t* TypeMask(const uint8_t& raw_strategy) { + JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); + static_assert(AcStrategy::kNumValidStrategies == 27, "Add masks"); + // implicitly, first row and column is made dark + static constexpr uint8_t kMask[][64] = { + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // DCT8 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 1, 1, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // HORNUSS + { + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + }, // 2x2 + { + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + }, // 4x4 + {}, // DCT16x16 (unused) + {}, // DCT32x32 (unused) + {}, // DCT16x8 (unused) + {}, // DCT8x16 (unused) + {}, // DCT32x8 (unused) + {}, // DCT8x32 (unused) + {}, // DCT32x16 (unused) + {}, // DCT16x32 (unused) + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // DCT4x8 + { + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 
0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + }, // DCT8x4 + { + 1, 1, 1, 1, 1, 0, 0, 0, // + 1, 1, 1, 1, 0, 0, 0, 0, // + 1, 1, 1, 0, 0, 0, 0, 0, // + 1, 1, 0, 0, 0, 0, 0, 0, // + 1, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // AFV0 + { + 0, 0, 0, 0, 1, 1, 1, 1, // + 0, 0, 0, 0, 0, 1, 1, 1, // + 0, 0, 0, 0, 0, 0, 1, 1, // + 0, 0, 0, 0, 0, 0, 0, 1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // AFV1 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 0, 0, 0, 0, // + }, // AFV2 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 1, // + 0, 0, 0, 0, 0, 0, 1, 1, // + 0, 0, 0, 0, 0, 1, 1, 1, // + }, // AFV3 + }; + return kMask[raw_strategy]; +} + +void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, size_t ysize, const char* tag, AuxOut* aux_out) { + Image3F color_acs(xsize, ysize); + for (size_t y = 0; y < ysize; y++) { + float* JXL_RESTRICT rows[3] = { + color_acs.PlaneRow(0, y), color_acs.PlaneRow(1, y), color_acs.PlaneRow(2, y), + }; + const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim); + for (size_t x = 0; x < xsize; x++) { + AcStrategy acs = acs_row[x / kBlockDim]; + const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); + for (size_t c = 0; c < 3; c++) { + rows[c][x] = color[c] / 255.f; + } + } + } + size_t stride = color_acs.PixelsPerRow(); + for (size_t c = 0; c < 3; c++) { + for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) { + float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim); + const 
AcStrategyRow acs_row = ac_strategy.ConstRow(by); + for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); + const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy()); + if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) { + for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize; iy++) { + for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize; ix++) { + if (mask[iy * kBlockDim + ix]) { + row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f; + } + } + } + } + // draw block edges + for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() && bx * kBlockDim + ix < xsize; ix++) { + row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f; + } + for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() && by * kBlockDim + iy < ysize; iy++) { + row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f; + } + } + } + } + aux_out->DumpImage(tag, color_acs); +} + +} // namespace jxl +#endif // LIB_JXL_ENC_AC_STRATEGY_ + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +bool MultiBlockTransformCrossesHorizontalBoundary(const AcStrategyImage& ac_strategy, + size_t start_x, + size_t y, + size_t end_x) { + if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) { + return false; + } + if (y % 8 == 0) { + // Nothing crosses 64x64 boundaries, and the memory on the other side + // of the 64x64 block may still uninitialized. + return false; + } + end_x = std::min(end_x, ac_strategy.xsize()); + // The first multiblock might be before the start_x, let's adjust it + // to point to the first IsFirstBlock() == true block we find by backward + // tracing. 
+ AcStrategyRow row = ac_strategy.ConstRow(y); + const size_t start_x_limit = start_x & ~7; + while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) { + --start_x; + } + for (size_t x = start_x; x < end_x;) { + if (row[x].IsFirstBlock()) { + x += row[x].covered_blocks_x(); + } else { + return true; + } + } + return false; +} + +bool MultiBlockTransformCrossesVerticalBoundary(const AcStrategyImage& ac_strategy, + size_t x, + size_t start_y, + size_t end_y) { + if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) { + return false; + } + if (x % 8 == 0) { + // Nothing crosses 64x64 boundaries, and the memory on the other side + // of the 64x64 block may still uninitialized. + return false; + } + end_y = std::min(end_y, ac_strategy.ysize()); + // The first multiblock might be before the start_y, let's adjust it + // to point to the first IsFirstBlock() == true block we find by backward + // tracing. + const size_t start_y_limit = start_y & ~7; + while (start_y != start_y_limit && !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) { + --start_y; + } + + for (size_t y = start_y; y < end_y;) { + AcStrategyRow row = ac_strategy.ConstRow(y); + if (row[x].IsFirstBlock()) { + y += row[x].covered_blocks_y(); + } else { + return true; + } + } + return false; +} + +float EstimateEntropy(const AcStrategy& acs, + size_t x, + size_t y, + const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, + float* block, + float* scratch_space, + uint32_t* quantized, + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32) { + const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; + + // Apply transform. 
+ for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT block_c = block + size * c; +// TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), +// config.src_stride, block_c, scratch_space); + +#ifdef XLNX_QC_DEBUG_AC_ESTIMATE_ENTROPY + if (acs.RawStrategy() == 4) { + std::cout << "========================debug===================== convered blocks: " + << acs.covered_blocks_x() << " tile_xsize: " << tile_xsize << " x: " << x << " y: " << y + << std::endl; + for (int i = 0; i < 64; i++) { + std::cout << std::setw(15) << block_c[i] << " "; + } + std::cout << std::endl; + for (int i = 0; i < 64; i++) { + std::cout << std::setw(15) << dct2x2[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + i] << " "; + } + std::cout << std::endl; + for (int i = 0; i < 64; i++) { + if (block_c[i] != dct2x2[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + i]) std::cout << "!!!"; + } + std::cout << std::endl; + } +#endif + size_t tile_xsize = (xsize + 63) / 64 * 64; + size_t tile_ysize = (ysize + 63) / 64 * 64; + for (int i = 0; i < 32 * 32; i++) { + if (acs.RawStrategy() == 0) { + if (i < 64) block_c[i] = dct8x8[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + i]; + } else if (acs.RawStrategy() == 1) { + if (i < 64) block_c[i] = dctIDT[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + i]; + } else if (acs.RawStrategy() == 2) { + if (i < 64) block_c[i] = dct2x2[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + i]; + } else if (acs.RawStrategy() == 3) { + if (i < 64) block_c[i] = dct4x4[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + i]; + } else if (acs.RawStrategy() == 4) { + if (i < 256) block_c[i] = dct16x16[c][16 * 16 * (y / 16 * (tile_xsize / 16) + x / 16) + i]; + } else if (acs.RawStrategy() == 5) { + block_c[i] = dct32x32[c][32 * 32 * (y / 32 * (tile_xsize / 32) + x / 32) + i]; + } else { + std::cout << "unsupported DCT" << std::endl; + } + } + } + + HWY_FULL(float) df; + + const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); + float quant_norm8 = 0; + float masking = 0; + if 
(num_blocks == 1) { + // When it is only one 8x8, we don't need aggregation of values. + quant_norm8 = config.Quant(x / 8, y / 8); + masking = 2.0f * config.Masking(x / 8, y / 8); + } else if (num_blocks == 2) { + // Taking max instead of 8th norm seems to work + // better for smallest blocks up to 16x8. Jyrki couldn't get + // improvements in trying the same for 16x16 blocks. + if (acs.covered_blocks_y() == 2) { + quant_norm8 = std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); + masking = 2.0f * std::max(config.Masking(x / 8, y / 8), config.Masking(x / 8, y / 8 + 1)); + } else { + quant_norm8 = std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); + masking = 2.0f * std::max(config.Masking(x / 8, y / 8), config.Masking(x / 8 + 1, y / 8)); + } + } else { + float masking_norm2 = 0; + float masking_max = 0; + // Load QF value, calculate empirical heuristic on masking field + // for weighting the information loss. Information loss manifests + // itself as ringing, and masking could hide it. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + float qval = config.Quant(x / 8 + ix, y / 8 + iy); + qval *= qval; + qval *= qval; + quant_norm8 += qval * qval; + float maskval = config.Masking(x / 8 + ix, y / 8 + iy); + masking_max = std::max(masking_max, maskval); + masking_norm2 += maskval * maskval; + } + } + quant_norm8 /= num_blocks; + quant_norm8 = FastPowf(quant_norm8, 1.0f / 8.0f); + masking_norm2 = sqrt(masking_norm2 / num_blocks); + // This is a highly empirical formula. + masking = (masking_norm2 + masking_max); + } + const auto q = Set(df, quant_norm8); + + // Compute entropy. 
+ float entropy = config.base_entropy; + auto info_loss = Zero(df); + auto info_loss2 = Zero(df); + + for (size_t c = 0; c < 3; c++) { + const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c); + const auto cmap_factor = Set(df, cmap_factors[c]); + + auto entropy_v = Zero(df); + auto nzeros_v = Zero(df); + auto cost1 = Set(df, config.cost1); + auto cost2 = Set(df, config.cost2); + auto cost_delta = Set(df, config.cost_delta); + for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { + const auto in = Load(df, block + c * size + i); + const auto in_y = Load(df, block + size + i) * cmap_factor; + const auto im = Load(df, inv_matrix + i); + const auto val = (in - in_y) * im * q; + const auto rval = Round(val); + const auto diff = AbsDiff(val, rval); + info_loss += diff; + info_loss2 += diff * diff; + const auto q = Abs(rval); + const auto q_is_zero = q == Zero(df); + entropy_v += IfThenElseZero(q >= Set(df, 1.5f), cost2); + // We used to have q * C here, but that cost model seems to + // be punishing large values more than necessary. Sqrt tries + // to avoid large values less aggressively. Having high accuracy + // around zero is most important at low qualities, and there + // we have directly specified costs for 0, 1, and 2. + entropy_v += Sqrt(q) * cost_delta; + nzeros_v += IfThenZeroElse(q_is_zero, Set(df, 1.0f)); + } + entropy_v += nzeros_v * cost1; + + entropy += GetLane(SumOfLanes(entropy_v)); + size_t num_nzeros = GetLane(SumOfLanes(nzeros_v)); + // Add #bit of num_nonzeros, as an estimate of the cost for encoding the + // number of non-zeros of the block. + size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; + // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a + // bias. 
+ entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); + } + float ret = entropy + + masking * ((config.info_loss_multiplier * GetLane(SumOfLanes(info_loss))) + + (config.info_loss_multiplier2 * sqrt(num_blocks * GetLane(SumOfLanes(info_loss2))))); + return ret; +} + +uint8_t FindBest8x8Transform(size_t x, + size_t y, + int encoding_speed_tier, + const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, + float* block, + float* scratch_space, + uint32_t* quantized, + float* entropy_out, + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32) { + struct TransformTry8x8 { + AcStrategy::Type type; + int encoding_speed_tier_max_limit; + float entropy_add; + float entropy_mul; + }; + static const TransformTry8x8 kTransforms8x8[] = { + { + AcStrategy::Type::DCT, 9, 3.0f, 0.745f, + }, + { + AcStrategy::Type::DCT4X4, 5, 4.0f, 1.0179946967008329f, + }, + { + AcStrategy::Type::DCT2X2, 4, 4.0f, 0.76721119707580943f, + }, +#ifndef XLNX_DISABLE_RECT_DCT + { + AcStrategy::Type::DCT4X8, 5, 0.0f, 0.700754622182473063f, + }, + { + AcStrategy::Type::DCT8X4, 5, 0.0f, 0.700754622182473063f, + }, +#endif + { + AcStrategy::Type::IDENTITY, 5, 8.0f, 0.81217614513585534f, + }, +#ifndef XLNX_DISABLE_RECT_DCT + { + AcStrategy::Type::AFV0, 4, 3.0f, 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV1, 4, 3.0f, 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV2, 4, 3.0f, 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV3, 4, 3.0f, 0.70086131125719425f, + }, +#endif + }; + double best = 1e30; + uint8_t best_tx = kTransforms8x8[0].type; + for (auto tx : kTransforms8x8) { + if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { + 
continue; + } + AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); + float entropy = + EstimateEntropy(acs, x, y, config, cmap_factors, block, scratch_space, quantized, xsize, ysize, dctIDT, + dct2x2, dct4x4, dct8x8, dct16x16, dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + entropy = tx.entropy_add + tx.entropy_mul * entropy; + if (entropy < best) { + best_tx = tx.type; + best = entropy; + } + } + *entropy_out = best; + return best_tx; +} + +// bx, by addresses the 64x64 block at 8x8 subresolution +// cx, cy addresses the left, upper 8x8 block position of the candidate +// transform. +/*void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx, + size_t cy, const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, + const float entropy_mul, const uint8_t candidate_priority, + uint8_t* priority, float* JXL_RESTRICT entropy_estimate, + float* block, float* scratch_space, uint32_t* quantized) { + AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); + float entropy_current = 0; + for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { + if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { + // Transform would reuse already allocated blocks and + // lead to invalid overlaps, for example DCT64X32 vs. + // DCT32X64. + return; + } + entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; + } + } + float entropy_candidate = + entropy_mul * EstimateEntropy(acs, (bx + cx) * 8, (by + cy) * 8, config, + cmap_factors, block, scratch_space, + quantized); + if (entropy_candidate >= entropy_current) return; + // Accept the candidate. 
+ for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; + priority[(cy + iy) * 8 + cx + ix] = candidate_priority; + } + } + ac_strategy->Set(bx + cx, by + cy, acs_raw); + // if (acs_raw > 5) printf("try_merge acs: %d\n", acs_raw); + entropy_estimate[cy * 8 + cx] = entropy_candidate; +}*/ + +static void SetEntropyForTransform( + size_t cx, size_t cy, const AcStrategy::Type acs_raw, float entropy, float* JXL_RESTRICT entropy_estimate) { + const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); + for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) { + for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) { + entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0; + } + } + entropy_estimate[cy * 8 + cx] = entropy; +} + +AcStrategy::Type AcsSquare(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT16X16; + } else if (blocks == 4) { + return AcStrategy::Type::DCT32X32; + } else { + return AcStrategy::Type::DCT64X64; + } +} + +AcStrategy::Type AcsVerticalSplit(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT16X8; + } else if (blocks == 4) { + return AcStrategy::Type::DCT32X16; + } else { + return AcStrategy::Type::DCT64X32; + } +} + +AcStrategy::Type AcsHorizontalSplit(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT8X16; + } else if (blocks == 4) { + return AcStrategy::Type::DCT16X32; + } else { + return AcStrategy::Type::DCT32X64; + } +} + +// The following function tries to merge smaller transforms into +// squares and the rectangles originating from a single middle division +// (horizontal or vertical) fairly. +// +// This is now generalized to concern about squares +// of blocks X blocks size, where a block is 8x8 pixels. 
+void FindBestFirstLevelDivisionForSquare(size_t blocks, + bool allow_square_transform, + size_t bx, + size_t by, + size_t cx, + size_t cy, + const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, + const float entropy_mul_JXK, + const float entropy_mul_JXJ, + float* JXL_RESTRICT entropy_estimate, + float* block, + float* scratch_space, + uint32_t* quantized, + + size_t xsize, + size_t ysize, + + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32 + //================================ + + ) { + // We denote J for the larger dimension here, and K for the smaller. + // For example, for 32x32 block splitting, J would be 32, K 16. + const size_t blocks_half = blocks / 2; + const AcStrategy::Type acs_rawJXK = AcsVerticalSplit(blocks); + const AcStrategy::Type acs_rawKXJ = AcsHorizontalSplit(blocks); + const AcStrategy::Type acs_rawJXJ = AcsSquare(blocks); + const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK); + const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ); + const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ); + AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0); + AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half); + // Let's check if we can consider a JXJ block here at all. + // This is not necessary in the basic use of hierarchically merging + // blocks in the simplest possible way, but is needed when we try other + // 'floating' options of merging, possibly after a simple hierarchical + // merge has been explored. 
+ if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, by + cy, bx + cx + blocks) || + MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) || + MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy, by + cy + blocks) || + MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks, by + cy, by + cy + blocks)) { + return; // not suitable for JxJ analysis, some transforms leak out. + } + // For floating transforms there may be + // already blocks selected that make either or both JXK and + // KXJ not feasible for this location. + const bool allow_JXK = + !MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks); + const bool allow_KXJ = + !MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks); + // Current entropies aggregated on NxN resolution. + float entropy[2][2] = {}; + for (size_t dy = 0; dy < blocks; ++dy) { + for (size_t dx = 0; dx < blocks; ++dx) { + entropy[dy / blocks_half][dx / blocks_half] += entropy_estimate[(cy + dy) * 8 + (cx + dx)]; + } + } + float entropy_JXK_left = std::numeric_limits::max(); + float entropy_JXK_right = std::numeric_limits::max(); + float entropy_KXJ_top = std::numeric_limits::max(); + float entropy_KXJ_bottom = std::numeric_limits::max(); + float entropy_JXJ = std::numeric_limits::max(); +#ifndef XLNX_DISABLE_RECT_DCT + if (allow_JXK) { + if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) { + entropy_JXK_left = entropy_mul_JXK * EstimateEntropy(acsJXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, quantized); + } + if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) { + entropy_JXK_right = + entropy_mul_JXK * EstimateEntropy(acsJXK, (bx + cx + blocks_half) * 8, (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, quantized); + } + } + if (allow_KXJ) { + if (row0[bx + 
cx].RawStrategy() != acs_rawKXJ) { + entropy_KXJ_top = entropy_mul_JXK * EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, quantized); + } + if (row1[bx + cx].RawStrategy() != acs_rawKXJ) { + entropy_KXJ_bottom = + entropy_mul_JXK * EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, (by + cy + blocks_half) * 8, config, + cmap_factors, block, scratch_space, quantized); + } + } +#endif + if (allow_square_transform && acs_rawJXJ != AcStrategy::Type::DCT64X64) { + // We control the exploration of the square transform separately so that + // we can turn it off at high decoding speeds for 32x32, but still allow + // exploring 16x32 and 32x16. + entropy_JXJ = + entropy_mul_JXJ * EstimateEntropy(acsJXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, cmap_factors, block, + scratch_space, quantized, xsize, ysize, dctIDT, dct2x2, dct4x4, dct8x8, + dct16x16, dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } + +// Test if this block should have JXK or KXJ transforms, +// because it can have only one or the other. 
+#ifndef XLNX_DISABLE_RECT_DCT + float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) + + std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]); + float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) + + std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]); + if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) { +#else + if (entropy_JXJ < entropy[0][0] + entropy[1][0] + entropy[0][1] + entropy[1][1]) { +#endif + ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ); + SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate); + } +#ifndef XLNX_DISABLE_RECT_DCT + else if (costJxN < costNxJ) { + if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) { + ac_strategy->Set(bx + cx, by + cy, acs_rawJXK); + SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left, entropy_estimate); + } + if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) { + ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK); + SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK, entropy_JXK_right, entropy_estimate); + } + } else { + if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) { + ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ); + SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top, entropy_estimate); + } + if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) { + ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ); + SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ, entropy_KXJ_bottom, entropy_estimate); + } + } +#endif +} + +void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state, + const ACSConfig& config, + const Rect& rect, + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32 + 
//================================ + ) { + // Main philosophy here: + // 1. First find best 8x8 transform for each area. + // 2. Merging them into larger transforms where possibly, but + // starting from the smallest transforms (16x8 and 8x16). + // Additional complication: 16x8 and 8x16 are considered + // simultanouesly and fairly against each other. + // We are looking at 64x64 squares since the YtoX and YtoB + // maps happen to be at that resolution, and having + // integral transforms cross these boundaries leads to + // additional complications. + const CompressParams& cparams = enc_state->cparams; + const float butteraugli_target = cparams.butteraugli_distance; + AcStrategyImage* ac_strategy = &enc_state->shared.ac_strategy; + // TODO(veluca): reuse allocations + auto mem = hwy::AllocateAligned(5 * AcStrategy::kMaxCoeffArea); + auto qmem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + uint32_t* JXL_RESTRICT quantized = qmem.get(); + float* JXL_RESTRICT block = mem.get(); + float* JXL_RESTRICT scratch_space = mem.get() + 3 * AcStrategy::kMaxCoeffArea; + size_t bx = rect.x0(); + size_t by = rect.y0(); + JXL_ASSERT(rect.xsize() <= 8); + JXL_ASSERT(rect.ysize() <= 8); + size_t tx = bx / kColorTileDimInBlocks; + size_t ty = by / kColorTileDimInBlocks; + const float cmap_factors[3] = { + enc_state->shared.cmap.YtoXRatio(enc_state->shared.cmap.ytox_map.ConstRow(ty)[tx]), 0.0f, + enc_state->shared.cmap.YtoBRatio(enc_state->shared.cmap.ytob_map.ConstRow(ty)[tx]), + }; + if (cparams.speed_tier > SpeedTier::kHare) return; + // First compute the best 8x8 transform for each square. Later, we do not + // experiment with different combinations, but only use the best of the 8x8s + // when DCT8X8 is specified in the tree search. + // 8x8 transforms have 10 variants, but every larger transform is just a DCT. + float entropy_estimate[64] = {}; + // Favor all 8x8 transforms (against 16x8 and larger transforms)) at + // low butteraugli_target distances. 
+ static const float k8x8mul1 = -0.55; + static const float k8x8mul2 = 1.0735757687292623f; + static const float k8x8base = 1.4; + const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); + for (size_t iy = 0; iy < rect.ysize(); iy++) { + for (size_t ix = 0; ix < rect.xsize(); ix++) { + float entropy = 0.0; + const uint8_t best_of_8x8s = FindBest8x8Transform( + 8 * (bx + ix), 8 * (by + iy), static_cast(cparams.speed_tier), config, cmap_factors, ac_strategy, + block, scratch_space, quantized, &entropy, xsize, ysize, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, + dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + ac_strategy->Set(bx + ix, by + iy, static_cast(best_of_8x8s)); + // if (static_cast(best_of_8x8s) > 5) { + // printf("after find best8x8 acs: %d\n", + // static_cast(best_of_8x8s)); + // } + entropy_estimate[iy * 8 + ix] = entropy * mul8x8; + } + } + // Merge when a larger transform is better than the previously + // searched best combination of 8x8 transforms. 
+ struct MergeTry { + AcStrategy::Type type; + uint8_t priority; + uint8_t decoding_speed_tier_max_limit; + uint8_t encoding_speed_tier_max_limit; + float entropy_mul; + }; + static const float k8X16mul1 = -0.55; + static const float k8X16mul2 = 0.9019587899705066; + static const float k8X16base = 1.6; + const float entropy_mul16X8 = k8X16mul2 + k8X16mul1 / (butteraugli_target + k8X16base); + // const float entropy_mul16X8 = mul8X16 * 0.91195782912371126f; + + static const float k16X16mul1 = -0.35; + static const float k16X16mul2 = 0.82098067020252011; + static const float k16X16base = 2.0; + const float entropy_mul16X16 = k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base); + // const float entropy_mul16X16 = mul16X16 * 0.83183417727960129f; + + static const float k32X16mul1 = -0.1; + static const float k32X16mul2 = 0.86098067020252011; + static const float k32X16base = 2.5; + const float entropy_mul16X32 = k32X16mul2 + k32X16mul1 / (butteraugli_target + k32X16base); + + const float entropy_mul32X32 = 0.9188333021616017f; + const float entropy_mul64X64 = 1.50f; + // TODO(jyrki): Consider this feedback in further changes: + // Also effectively when the multipliers for smaller blocks are + // below 1, this raises the bar for the bigger blocks even higher + // in that sense these constants are not independent (e.g. changing + // the constant for DCT16x32 by -5% (making it more likely) also + // means that DCT32x32 becomes harder to do when starting from + // two DCT16x32s). It might be better to make them more independent, + // e.g. by not applying the multiplier when storing the new entropy + // estimates in TryMergeToACSCandidate(). + const MergeTry kTransformsForMerge[9] = { + {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8}, + {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8}, + // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its + // subdivisions. 
{AcStrategy::Type::DCT16X16, 3, entropy_mul16X16}, + {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32}, + {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32}, + // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its + // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5, + // 0.9822994906548809f}, + // TODO(jyrki): re-enable 64x32 and 64x64 if/when possible. + {AcStrategy::Type::DCT64X32, 6, 1, 3, 1.27f}, + {AcStrategy::Type::DCT32X64, 6, 1, 3, 1.27f}, + // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f}, + }; + /* + These sizes not yet included in merge heuristic: + set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f); + set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f); + set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f); + set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f); + set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f); + set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f); + set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f); + set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f); + */ + + // Priority is a tricky kludge to avoid collisions so that transforms + // don't overlap. + uint8_t priority[64] = {}; + for (auto tx : kTransformsForMerge) { + if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { + continue; + } + AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); + for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); cy += acs.covered_blocks_y()) { + for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); cx += acs.covered_blocks_x()) { + if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { + if (cparams.decoding_speed_tier < 4 && tx.type == AcStrategy::Type::DCT32X64) { + // We handle both DCT8X16 and DCT16X8 at the same time. 
+ if ((cy | cx) % 8 == 0) { + FindBestFirstLevelDivisionForSquare(8, true, bx, by, cx, cy, config, cmap_factors, + ac_strategy, tx.entropy_mul, entropy_mul64X64, + entropy_estimate, block, scratch_space, quantized, + xsize, ysize, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, + dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT32X16) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || + (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { + // already covered by FindBest32X32 + continue; + } + + if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { + if (tx.type == AcStrategy::Type::DCT16X32) { + // We handle both DCT8X16 and DCT16X8 at the same time. + bool enable_32x32 = cparams.decoding_speed_tier < 4; + if ((cy | cx) % 4 == 0) { + FindBestFirstLevelDivisionForSquare(4, enable_32x32, bx, by, cx, cy, config, cmap_factors, + ac_strategy, tx.entropy_mul, entropy_mul32X32, + entropy_estimate, block, scratch_space, quantized, + xsize, ysize, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, + dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT32X16) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || + (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { + // already covered by FindBest32X32 + continue; + } + if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { + if (tx.type == AcStrategy::Type::DCT8X16) { + // We handle both DCT8X16 and DCT16X8 at the same time. 
+ if ((cy | cx) % 2 == 0) { + FindBestFirstLevelDivisionForSquare(2, true, bx, by, cx, cy, config, cmap_factors, + ac_strategy, tx.entropy_mul, entropy_mul16X16, + entropy_estimate, block, scratch_space, quantized, + xsize, ysize, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, + dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT16X8) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) || + (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) { + // already covered by FindBestFirstLevelDivisionForSquare + continue; + } +// All other merge sizes are handled here. +// Some of the DCT16X8s and DCT8X16s will still leak through here +// when there is an odd number of 8x8 blocks, then the last row +// and column will get their DCT16X8s and DCT8X16s through the +// normal integral transform merging process. + +#ifndef XLNX_DISABLE_RECT_DCT + TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy, tx.entropy_mul, tx.priority, + &priority[0], entropy_estimate, block, scratch_space, quantized); +#endif + } + } + } + // Here we still try to do some non-aligned matching, find a few more + // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. + if (cparams.speed_tier >= SpeedTier::kHare) { + return; + } + /* std::cout<enc_state = enc_state; + config.dequant = &enc_state->shared.matrices; + const CompressParams& cparams = enc_state->cparams; + const float butteraugli_target = cparams.butteraugli_distance; + + // Image row pointers and strides. 
+ config.quant_field_row = enc_state->initial_quant_field.Row(0); + config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow(); + auto& mask = enc_state->initial_quant_masking; + if (mask.xsize() > 0 && mask.ysize() > 0) { + config.masking_field_row = mask.Row(0); + config.masking_field_stride = mask.PixelsPerRow(); + } + + config.src_rows[0] = src.ConstPlaneRow(0, 0); + config.src_rows[1] = src.ConstPlaneRow(1, 0); + config.src_rows[2] = src.ConstPlaneRow(2, 0); + config.src_stride = src.PixelsPerRow(); + + // Entropy estimate is composed of two factors: + // - estimate of the number of bits that will be used by the block + // - information loss due to quantization + // The following constant controls the relative weights of these components. + config.info_loss_multiplier = 138.0f; + config.info_loss_multiplier2 = 50.46839691767866; + // TODO(jyrki): explore base_entropy setting more. + // A small value (0?) works better at high distance, while a larger value + // may be more effective at low distance/high bpp. + config.base_entropy = 0.0; + config.zeros_mul = 7.565053364251793f; + // Lots of +1 and -1 coefficients at high quality, it is + // beneficial to favor them. At low qualities zeros matter more + // and +1 / -1 coefficients are already quite harmful. 
+ float slope = std::min(1.0f, butteraugli_target * (1.0f / 3)); + config.cost1 = 1 + slope * 8.8703248061477744f; + config.cost2 = 4.4628149885273363f; + config.cost_delta = 5.3359184934516337f; + JXL_ASSERT(enc_state->shared.ac_strategy.xsize() == enc_state->shared.frame_dim.xsize_blocks); + JXL_ASSERT(enc_state->shared.ac_strategy.ysize() == enc_state->shared.frame_dim.ysize_blocks); +} + +void AcStrategyHeuristics::ProcessRect(const Rect& rect, + + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32) { + PROFILER_FUNC; + const CompressParams& cparams = enc_state->cparams; + // In Falcon mode, use DCT8 everywhere and uniform quantization. + if (cparams.speed_tier >= SpeedTier::kCheetah) { + enc_state->shared.ac_strategy.FillDCT8(rect); + return; + } + HWY_DYNAMIC_DISPATCH(ProcessRectACS) + (enc_state, config, rect, xsize, ysize, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, dct32x32, dcIDT, dc2x2, dc4x4, + dc8x8, dc16x16, dc32x32); +} + +void AcStrategyHeuristics::Finalize(AuxOut* aux_out) { + const auto& ac_strategy = enc_state->shared.ac_strategy; + // Accounting and debug output. 
+ if (aux_out != nullptr) { + aux_out->num_dct2_blocks = 32 * (ac_strategy.CountBlocks(AcStrategy::Type::DCT32X64) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT64X32)); + aux_out->num_dct4_blocks = 64 * ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64); + aux_out->num_dct4x8_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT4X8) + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X4); + aux_out->num_afv_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::AFV0) + ac_strategy.CountBlocks(AcStrategy::Type::AFV1) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV2) + ac_strategy.CountBlocks(AcStrategy::Type::AFV3); + aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT); + aux_out->num_dct8x16_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X16) + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X8); + aux_out->num_dct8x32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X32) + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X8); + aux_out->num_dct16_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT16X16); + aux_out->num_dct16x32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X32) + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X16); + aux_out->num_dct32_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT32X32); +#ifdef XLNX_DEBUG_DCT + printf("=====================================\n"); + printf("DCT info: \n"); + printf("afv_blocks: %ld\n", aux_out->num_afv_blocks); + printf("dct2: %ld\n", aux_out->num_dct2_blocks); + printf("dct4: %ld\n", aux_out->num_dct4_blocks); + printf("dct8: %ld\n", aux_out->num_dct8_blocks); + printf("dct16: %ld\n", aux_out->num_dct16_blocks); + printf("dct32: %ld\n", aux_out->num_dct32_blocks); + printf("dct64: %ld\n\n", ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64)); + printf("dct4x8: %ld\n", aux_out->num_dct4x8_blocks); + printf("dct8x16: %ld\n", aux_out->num_dct8x16_blocks); + printf("dct8x32: %ld\n", aux_out->num_dct8x32_blocks); + printf("dct16x32: %ld\n\n", 
aux_out->num_dct16x32_blocks); +#endif + } + + if (WantDebugOutput(aux_out)) { + DumpAcStrategy(ac_strategy, enc_state->shared.frame_dim.xsize, enc_state->shared.frame_dim.ysize, "ac_strategy", + aux_out); + } +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/others/src/acc_enc_adaptive_quantization.cpp b/codec/L2/demos/jxlEnc/others/src/acc_enc_adaptive_quantization.cpp new file mode 100644 index 0000000000..f1d2dd060c --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_enc_adaptive_quantization.cpp @@ -0,0 +1,1009 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_adaptive_quantization.h" + +#include +#include +#include + +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "xilinx/src/acc_enc_adaptive_quantization.cpp" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_group.h" +#include "lib/jxl/dec_reconstruct.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/gauss_blur.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/opsin_params.h" +#include 
"lib/jxl/quant_weights.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Rebind; + +// The following functions modulate an exponent (out_val) and return the updated +// value. Their descriptor is limited to 8 lanes for 8x8 blocks. + +// Hack for mask estimation. Eventually replace this code with butteraugli's +// masking. +float ComputeMaskForAcStrategyUse(const float out_val) { + const float kMul = 1.0f; + const float kOffset = 0.4f; + return kMul / (out_val + kOffset); +} + +template +V ComputeMask(const D d, const V out_val) { + const auto kBase = Set(d, -0.74174993f); + const auto kMul4 = Set(d, 3.2353257320940401f); + const auto kMul2 = Set(d, 12.906028311180409f); + const auto kOffset2 = Set(d, 305.04035728311436f); + const auto kMul3 = Set(d, 5.0220313103171232f); + const auto kOffset3 = Set(d, 2.1925739705298404f); + const auto kOffset4 = Set(d, 0.25f) * kOffset3; + const auto kMul0 = Set(d, 0.74760422233706747f); + const auto k1 = Set(d, 1.0f); + + // Avoid division by zero. + const auto v1 = Max(out_val * kMul0, Set(d, 1e-3f)); + const auto v2 = k1 / (v1 + kOffset2); + const auto v3 = k1 / MulAdd(v1, v1, kOffset3); + const auto v4 = k1 / MulAdd(v1, v1, kOffset4); + // TODO(jyrki): + // A log or two here could make sense. In butteraugli we have effectively + // log(log(x + C)) for this kind of use, as a single log is used in + // saturating visual masking and here the modulation values are exponential, + // another log would counter that. + return kBase + MulAdd(kMul4, v4, MulAdd(kMul2, v2, kMul3 * v3)); +} + +// For converting full vectors to a subset. Assumes `vfull` lanes are identical. 
+template +Vec CapTo(const D d, VFull vfull) { + using T = typename D::T; + const HWY_FULL(T) dfull; + HWY_ALIGN T lanes[MaxLanes(dfull)]; + /* Round-trip through an aligned buffer: store with the HWY_FULL tag, then reload with the caller's tag d. */ Store(vfull, dfull, lanes); + return Load(d, lanes); +} + +// mul and mul2 represent a scaling difference between jxl and butteraugli. +static const float kSGmul = 226.0480446705883f; +static const float kSGmul2 = 1.0f / 73.377132366608819f; +static const float kLog2 = 0.693147181f; +// Includes correction factor for std::log -> log2. +static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2; +static const float kSGVOffset = 7.14672470003f; +/* Vector overload. After ZeroIfNegative(v), forms num = kSGRetMul * 3 * kSGmul * v^2 and den = kLog2 * kSGmul * v^3 plus kSGVOffset * kLog2, and returns num / den when invert is true, den / num otherwise. NOTE(review): invert is not declared in the text visible here; presumably a template parameter, confirm against the template header. */ +template +V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) { + // The opsin space in jxl is the cubic root of photons, i.e., v * v * v + // is related to the number of photons. + // + // SimpleGamma(v * v * v) is the psychovisual space in butteraugli. + // This ratio allows quantization to move from jxl's opsin space to + // butteraugli's log-gamma space. + v = ZeroIfNegative(v); + const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul); + const auto kVOffset = Set(d, kSGVOffset * kLog2); + const auto kDenMul = Set(d, kLog2 * kSGmul); + + const auto v2 = v * v; + + const auto num = kNumMul * v2; + const auto den = MulAdd(kDenMul * v, v2, kVOffset); + return invert ? num / den : den / num; +} +/* Scalar overload: loads v into a one-lane vector (HWY_CAPPED(float, 1)), calls the vector overload and returns GetLane of the result. */ +template +static float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) { + using DScalar = HWY_CAPPED(float, 1); + auto vscalar = Load(DScalar(), &v); + return GetLane(RatioOfDerivativesOfCubicRootToSimpleGamma(DScalar(), vscalar)); +} + +// TODO(veluca): this function computes an approximation of the derivative of +// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or +// exact derivatives. For reference, SimpleGamma was: +/* +template +V SimpleGamma(const D d, V v) { + // A simple HDR compatible gamma function. 
+ const auto mul = Set(d, kSGmul); + const auto kRetMul = Set(d, kSGRetMul); + const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f); + const auto kVOffset = Set(d, kSGVOffset); + + v *= mul; + + // This should happen rarely, but may lead to a NaN, which is rather + // undesirable. Since negative photons don't exist we solve the NaNs by + // clamping here. + // TODO(veluca): with FastLog2f, this no longer leads to NaNs. + v = ZeroIfNegative(v); + return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd; +} +*/ +/* For the 8x8 pixel block whose top-left pixel is (x, y): with iny = xyb_y plus kBias and inx = xyb_x, averages 0.5 * (ratio(iny - inx) plus ratio(iny plus inx)) over the 64 pixels, where ratio is RatioOfDerivativesOfCubicRootToSimpleGamma, and returns out_val plus kGam * FastLog2f(average). */ +template +V GammaModulation( + const D d, const size_t x, const size_t y, const ImageF& xyb_x, const ImageF& xyb_y, const V out_val) { + const float kBias = 0.16f; + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[0]); + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[1]); + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[2]); + auto overall_ratio = Zero(d); + auto bias = Set(d, kBias); + auto half = Set(d, 0.5f); + for (size_t dy = 0; dy < 8; ++dy) { + const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy); + const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy); + for (size_t dx = 0; dx < 8; dx += Lanes(d)) { + const auto iny = Load(d, row_in_y + x + dx) + bias; + const auto inx = Load(d, row_in_x + x + dx); + const auto r = iny - inx; + const auto g = iny + inx; + const auto ratio_r = RatioOfDerivativesOfCubicRootToSimpleGamma(d, r); + const auto ratio_g = RatioOfDerivativesOfCubicRootToSimpleGamma(d, g); + const auto avg_ratio = half * (ratio_r + ratio_g); + + overall_ratio += avg_ratio; + } + } + overall_ratio = SumOfLanes(overall_ratio); + overall_ratio *= Set(d, 1.0f / 64); + // ideally -1.0, but likely optimal correction adds some entropy, so slightly + // less than that. + // ln(2) constant folded in because we want std::log but have FastLog2f. + const auto kGam = Set(d, -0.15526878023684174f * 0.693147180559945f); + return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val); +} + +// Change precision in 8x8 blocks that have high frequency content. 
+template +V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb, const V out_val) { + // Zero out the invalid differences for the rightmost value per row. + const Rebind du; + HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0}; + + auto sum = Zero(d); // sum of absolute differences with right and below + + for (size_t dy = 0; dy < 8; ++dy) { + const float* JXL_RESTRICT row_in = xyb.Row(y + dy) + x; + const float* JXL_RESTRICT row_in_next = dy == 7 ? row_in : xyb.Row(y + dy + 1) + x; + +// In SCALAR, there is no guarantee of having extra row padding. +// Hence, we need to ensure we don't access pixels outside the row itself. +// In SIMD modes, however, rows are padded, so it's safe to access one +// garbage value after the row. The vector then gets masked with kMaskRight +// to remove the influence of that value. +#if HWY_TARGET != HWY_SCALAR + for (size_t dx = 0; dx < 8; dx += Lanes(d)) { +#else + for (size_t dx = 0; dx < 7; dx += Lanes(d)) { +#endif + const auto p = Load(d, row_in + dx); + const auto pr = LoadU(d, row_in + dx + 1); + const auto mask = BitCast(d, Load(du, kMaskRight + dx)); + sum += And(mask, AbsDiff(p, pr)); + + const auto pd = Load(d, row_in_next + dx); + sum += AbsDiff(p, pd); + } + } + /* About the 112 below: in the 8-wide loop the right-edge mask leaves 7 horizontal differences per row (56 in total) and the dy == 7 row is compared against itself, leaving 56 vertical differences; presumably 112 is that count used as a normalizer, confirm. */ + sum = SumOfLanes(sum); + return MulAdd(sum, Set(d, -2.0052193233688884f / 112), out_val); +} +/* For each cell (ix, iy) of rect, reads out->Row(iy)[ix] as an exponent, passes it through ComputeMask, HfModulation (on xyb_y) and GammaModulation (on xyb_x and xyb_y) for the pixel block at (ix * 8, iy * 8), and overwrites the cell with FastPow2f(exponent * 1.442695041f) * mul plus add. dampen is 1 while butteraugli_target is below 7, falls linearly to 0 at 14 and is clamped to 0 above; mul = scale * dampen and add = (1 - dampen) * 0.5 * scale. */ +void PerBlockModulations(const float butteraugli_target, + const ImageF& xyb_x, + const ImageF& xyb_y, + const float scale, + const Rect& rect, + ImageF* out) { + JXL_ASSERT(SameSize(xyb_x, xyb_y)); + JXL_ASSERT(DivCeil(xyb_x.xsize(), kBlockDim) == out->xsize()); + JXL_ASSERT(DivCeil(xyb_x.ysize(), kBlockDim) == out->ysize()); + + float base_level = 0.5f * scale; + float kDampenRampStart = 7.0f; + float kDampenRampEnd = 14.0f; + float dampen = 1.0f; + if (butteraugli_target >= kDampenRampStart) { + dampen = 1.0f - ((butteraugli_target - kDampenRampStart) / (kDampenRampEnd - kDampenRampStart)); + if 
(dampen < 0) { + dampen = 0; + } + } + const float mul = scale * dampen; + const float add = (1.0f - dampen) * base_level; + for (size_t iy = rect.y0(); iy < rect.y0() + rect.ysize(); iy++) { + const size_t y = iy * 8; + float* const JXL_RESTRICT row_out = out->Row(iy); + const HWY_CAPPED(float, kBlockDim) df; + for (size_t ix = rect.x0(); ix < rect.x0() + rect.xsize(); ix++) { + size_t x = ix * 8; + auto out_val = Set(df, row_out[ix]); + out_val = ComputeMask(df, out_val); + out_val = HfModulation(df, x, y, xyb_y, out_val); + out_val = GammaModulation(df, x, y, xyb_x, xyb_y, out_val); + // We want multiplicative quantization field, so everything + // until this point has been modulating the exponent. + row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add; + } + } +} +/* Vector overload: returns 0.25 * Sqrt(v * Sqrt(kMul * 1e8) plus kLogOffset). */ +template +V MaskingSqrt(const D d, V v) { + static const float kLogOffset = 26.481471032459346f; + static const float kMul = 211.50759899638012f; + const auto mul_v = Set(d, kMul * 1e8); + const auto offset_v = Set(d, kLogOffset); + return Set(d, 0.25f) * Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)); +} +/* Scalar overload: evaluates the vector MaskingSqrt on a one-lane vector (HWY_CAPPED(float, 1)) and returns GetLane of the result. */ +float MaskingSqrt(const float v) { + using DScalar = HWY_CAPPED(float, 1); + auto vscalar = Load(DScalar(), &v); + return GetLane(MaskingSqrt(DScalar(), vscalar)); +} +/* Keeps the four smallest values seen so far: if v is below min3, inserts it at its rank among min0..min3 and shifts the larger entries up, discarding the old min3; otherwise does nothing. Assumes min0 <= min1 <= min2 <= min3 on entry, which the branch order relies on. */ +void StoreMin4(const float v, float& min0, float& min1, float& min2, float& min3) { + if (v < min3) { + if (v < min0) { + min3 = min2; + min2 = min1; + min1 = min0; + min0 = v; + } else if (v < min1) { + min3 = min2; + min2 = min1; + min1 = v; + } else if (v < min2) { + min3 = min2; + min2 = v; + } else { + min3 = v; + } + } +} + +// Look for smooth areas near the area of degradation. +// If the areas are generally smooth, don't do masking. +// Output is downsampled 2x. 
+void FuzzyErosion(const Rect& from_rect, const ImageF& from, const Rect& to_rect, ImageF* to) { + const size_t xsize = from.xsize(); + const size_t ysize = from.ysize(); + constexpr int kStep = 1; + static_assert(kStep == 1, "Step must be 1"); + JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize()); + JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize()); + for (size_t fy = 0; fy < from_rect.ysize(); ++fy) { + size_t y = fy + from_rect.y0(); + size_t ym1 = y >= kStep ? y - kStep : y; + size_t yp1 = y + kStep < ysize ? y + kStep : y; + const float* rowt = from.Row(ym1); + const float* row = from.Row(y); + const float* rowb = from.Row(yp1); + float* row_out = to_rect.Row(to, fy / 2); + for (size_t fx = 0; fx < from_rect.xsize(); ++fx) { + size_t x = fx + from_rect.x0(); + size_t xm1 = x >= kStep ? x - kStep : x; + size_t xp1 = x + kStep < xsize ? x + kStep : x; + float min0 = row[x]; + float min1 = row[xm1]; + float min2 = row[xp1]; + float min3 = rowt[xm1]; + // Sort the first four values. + if (min0 > min1) std::swap(min0, min1); + if (min0 > min2) std::swap(min0, min2); + if (min0 > min3) std::swap(min0, min3); + if (min1 > min2) std::swap(min1, min2); + if (min1 > min3) std::swap(min1, min3); + if (min2 > min3) std::swap(min2, min3); + // The remaining five values of a 3x3 neighbourhood. 
+ StoreMin4(rowt[x], min0, min1, min2, min3); + StoreMin4(rowt[xp1], min0, min1, min2, min3); + StoreMin4(rowb[xm1], min0, min1, min2, min3); + StoreMin4(rowb[x], min0, min1, min2, min3); + StoreMin4(rowb[xp1], min0, min1, min2, min3); + static const float kMulC = 0.05f; + static const float kMul0 = 0.05f; + static const float kMul1 = 0.05f; + static const float kMul2 = 0.05f; + static const float kMul3 = 0.05f; + float v = kMulC * row[x] + kMul0 * min0 + kMul1 * min1 + kMul2 * min2 + kMul3 * min3; + if (fx % 2 == 0 && fy % 2 == 0) { + row_out[fx / 2] = v; + } else { + row_out[fx / 2] += v; + } + } + } +} + +struct AdaptiveQuantizationImpl { + void Init(const Image3F& xyb) { + JXL_DASSERT(xyb.xsize() % kBlockDim == 0); + JXL_DASSERT(xyb.ysize() % kBlockDim == 0); + const size_t xsize = xyb.xsize(); + const size_t ysize = xyb.ysize(); + aq_map = ImageF(xsize / kBlockDim, ysize / kBlockDim); + } + void PrepareBuffers(size_t num_threads) { + diff_buffer = ImageF(kEncTileDim + 8, num_threads); + for (size_t i = pre_erosion.size(); i < num_threads; i++) { + pre_erosion.emplace_back(kEncTileDimInBlocks * 2 + 2, kEncTileDimInBlocks * 2 + 2); + } + } + + void ComputeTile( + float butteraugli_target, float scale, const Image3F& xyb, const Rect& rect, const int thread, ImageF* mask) { + PROFILER_ZONE("aq DiffPrecompute"); + const size_t xsize = xyb.xsize(); + const size_t ysize = xyb.ysize(); + + // The XYB gamma is 3.0 to be able to decode faster with two muls. + // Butteraugli's gamma is matching the gamma of human eye, around 2.6. + // We approximate the gamma difference by adding one cubic root into + // the adaptive quantization. This gives us a total gamma of 2.6666 + // for quantization uses. 
+ const float match_gamma_offset = 0.019; + + const HWY_FULL(float) df; + const float kXMul = 23.426802998210313f; + const auto kXMulv = Set(df, kXMul); + + size_t y_start = rect.y0() * 8; + size_t y_end = y_start + rect.ysize() * 8; + + size_t x0 = rect.x0() * 8; + size_t x1 = x0 + rect.xsize() * 8; + if (x0 != 0) x0 -= 4; + if (x1 != xyb.xsize()) x1 += 4; + if (y_start != 0) y_start -= 4; + if (y_end != xyb.ysize()) y_end += 4; + pre_erosion[thread].ShrinkTo((x1 - x0) / 4, (y_end - y_start) / 4); + + // Computes image (padded to multiple of 8x8) of local pixel differences. + // Subsample both directions by 4. + for (size_t y = y_start; y < y_end; ++y) { + size_t y2 = y + 1 < ysize ? y + 1 : y; + size_t y1 = y > 0 ? y - 1 : y; + + const float* row_in = xyb.PlaneRow(1, y); + const float* row_in1 = xyb.PlaneRow(1, y1); + const float* row_in2 = xyb.PlaneRow(1, y2); + const float* row_x_in = xyb.PlaneRow(0, y); + const float* row_x_in1 = xyb.PlaneRow(0, y1); + const float* row_x_in2 = xyb.PlaneRow(0, y2); + float* JXL_RESTRICT row_out = diff_buffer.Row(thread); + + auto scalar_pixel = [&](size_t x) { + const size_t x2 = x + 1 < xsize ? x + 1 : x; + const size_t x1 = x > 0 ? x - 1 : x; + const float base = 0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]); + const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(row_in[x] + match_gamma_offset); + float diff = gammac * (row_in[x] - base); + diff *= diff; + const float base_x = 0.25f * (row_x_in2[x] + row_x_in1[x] + row_x_in[x1] + row_x_in[x2]); + float diff_x = gammac * (row_x_in[x] - base_x); + diff_x *= diff_x; + diff += kXMul * diff_x; + diff = MaskingSqrt(diff); + if ((y % 4) != 0) { + row_out[x - x0] += diff; + } else { + row_out[x - x0] = diff; + } + }; + + size_t x = x0; + // First pixel of the row. 
+ if (x0 == 0) { + scalar_pixel(x0); + ++x; + } + // SIMD + const auto match_gamma_offset_v = Set(df, match_gamma_offset); + const auto quarter = Set(df, 0.25f); + for (; x + 1 + Lanes(df) < x1; x += Lanes(df)) { + const auto in = LoadU(df, row_in + x); + const auto in_r = LoadU(df, row_in + x + 1); + const auto in_l = LoadU(df, row_in + x - 1); + const auto in_t = LoadU(df, row_in2 + x); + const auto in_b = LoadU(df, row_in1 + x); + auto base = quarter * (in_r + in_l + in_t + in_b); + auto gammacv = + RatioOfDerivativesOfCubicRootToSimpleGamma(df, in + match_gamma_offset_v); + auto diff = gammacv * (in - base); + diff *= diff; + + const auto in_x = LoadU(df, row_x_in + x); + const auto in_x_r = LoadU(df, row_x_in + x + 1); + const auto in_x_l = LoadU(df, row_x_in + x - 1); + const auto in_x_t = LoadU(df, row_x_in2 + x); + const auto in_x_b = LoadU(df, row_x_in1 + x); + auto base_x = quarter * (in_x_r + in_x_l + in_x_t + in_x_b); + auto diff_x = gammacv * (in_x - base_x); + diff_x *= diff_x; + diff += kXMulv * diff_x; + diff = MaskingSqrt(df, diff); + if ((y & 3) != 0) { + diff += LoadU(df, row_out + x - x0); + } + StoreU(diff, df, row_out + x - x0); + } + // Scalar + for (; x < x1; ++x) { + scalar_pixel(x); + } + if (y % 4 == 3) { + float* row_dout = pre_erosion[thread].Row((y - y_start) / 4); + for (size_t x = 0; x < (x1 - x0) / 4; x++) { + row_dout[x] = + (row_out[x * 4] + row_out[x * 4 + 1] + row_out[x * 4 + 2] + row_out[x * 4 + 3]) * 0.25f; + } + } + } + Rect from_rect(x0 % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 
0 : 1, rect.xsize() * 2, rect.ysize() * 2); + FuzzyErosion(from_rect, pre_erosion[thread], rect, &aq_map); + for (size_t y = 0; y < rect.ysize(); ++y) { + const float* aq_map_row = rect.ConstRow(aq_map, y); + float* mask_row = rect.Row(mask, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]); + } + } + PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1), scale, rect, &aq_map); + } + std::vector pre_erosion; + ImageF aq_map; + ImageF diff_buffer; +}; + +ImageF AdaptiveQuantizationMap(const float butteraugli_target, + const Image3F& xyb, + const FrameDimensions& frame_dim, + float scale, + ThreadPool* pool, + ImageF* mask) { + PROFILER_ZONE("aq AdaptiveQuantMap"); + + AdaptiveQuantizationImpl impl; + impl.Init(xyb); + *mask = ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + RunOnPool(pool, 0, DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(frame_dim.ysize_blocks, kEncTileDimInBlocks), + [&](size_t num_threads) { + impl.PrepareBuffers(num_threads); + return true; + }, + [&](const int tid, int thread) { + size_t n_enc_tiles = DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + impl.ComputeTile(butteraugli_target, scale, xyb, r, thread, mask); + }, + "AQ DiffPrecompute"); + + return std::move(impl).aq_map; +} + +} // namespace + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(AdaptiveQuantizationMap); + +namespace { +bool FLAGS_log_search_state = false; +// If true, prints the quantization maps at 
each iteration. +bool FLAGS_dump_quant_state = false; + +void DumpHeatmap( + const AuxOut* aux_out, const std::string& label, const ImageF& image, float good_threshold, float bad_threshold) { + Image3F heatmap = CreateHeatMapImage(image, good_threshold, bad_threshold); + char filename[200]; + snprintf(filename, sizeof(filename), "%s%05d", label.c_str(), aux_out->num_butteraugli_iters); + aux_out->DumpImage(filename, heatmap); +} + +void DumpHeatmaps(const AuxOut* aux_out, + float ba_target, + const ImageF& quant_field, + const ImageF& tile_heatmap, + const ImageF& bt_diffmap) { + if (!WantDebugOutput(aux_out)) return; + ImageF inv_qmap(quant_field.xsize(), quant_field.ysize()); + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* JXL_RESTRICT row_q = quant_field.ConstRow(y); + float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + row_inv_q[x] = 1.0f / row_q[x]; // never zero + } + } + DumpHeatmap(aux_out, "quant_heatmap", inv_qmap, 4.0f * ba_target, 6.0f * ba_target); + DumpHeatmap(aux_out, "tile_heatmap", tile_heatmap, ba_target, 1.5f * ba_target); + // matches heat maps produced by the command line tool. 
+ DumpHeatmap(aux_out, "bt_diffmap", bt_diffmap, ButteraugliFuzzyInverse(1.5), ButteraugliFuzzyInverse(0.5)); +} + +ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin, const AcStrategyImage& ac_strategy) { + PROFILER_FUNC; + const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size; + const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size; + ImageF tile_distmap(tile_xsize, tile_ysize); + size_t distmap_stride = tile_distmap.PixelsPerRow(); + for (int tile_y = 0; tile_y < tile_ysize; ++tile_y) { + AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(tile_y); + float* JXL_RESTRICT dist_row = tile_distmap.Row(tile_y); + for (int tile_x = 0; tile_x < tile_xsize; ++tile_x) { + AcStrategy acs = ac_strategy_row[tile_x]; + if (!acs.IsFirstBlock()) continue; + int this_tile_xsize = acs.covered_blocks_x() * tile_size; + int this_tile_ysize = acs.covered_blocks_y() * tile_size; + int y_begin = std::max(0, tile_size * tile_y - margin); + int y_end = std::min(distmap.ysize(), tile_size * tile_y + this_tile_ysize + margin); + int x_begin = std::max(0, tile_size * tile_x - margin); + int x_end = std::min(distmap.xsize(), tile_size * tile_x + this_tile_xsize + margin); + float dist_norm = 0.0; + double pixels = 0; + for (int y = y_begin; y < y_end; ++y) { + float ymul = 1.0; + constexpr float kBorderMul = 0.98f; + constexpr float kCornerMul = 0.7f; + if (margin != 0 && (y == y_begin || y == y_end - 1)) { + ymul = kBorderMul; + } + const float* const JXL_RESTRICT row = distmap.Row(y); + for (int x = x_begin; x < x_end; ++x) { + float xmul = ymul; + if (margin != 0 && (x == x_begin || x == x_end - 1)) { + if (xmul == 1.0) { + xmul = kBorderMul; + } else { + xmul = kCornerMul; + } + } + float v = row[x]; + v *= v; + v *= v; + v *= v; + v *= v; + dist_norm += xmul * v; + pixels += xmul; + } + } + if (pixels == 0) pixels = 1; + // 16th norm is less than the max norm, we reduce the difference + // with this normalization factor. 
+ constexpr float kTileNorm = 1.2f; + const float tile_dist = kTileNorm * std::pow(dist_norm / pixels, 1.0f / 16.0f); + dist_row[tile_x] = tile_dist; + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + dist_row[tile_x + distmap_stride * iy + ix] = tile_dist; + } + } + } + } + return tile_distmap; +} + +constexpr float kDcQuantPow = 0.57f; +static const float kDcQuant = 1.12f; +static const float kAcQuant = 0.787f; + +void FindBestQuantization( + const ImageBundle& linear, const Image3F& opsin, PassesEncoderState* enc_state, ThreadPool* pool, AuxOut* aux_out) { + const CompressParams& cparams = enc_state->cparams; + Quantizer& quantizer = enc_state->shared.quantizer; + ImageI& raw_quant_field = enc_state->shared.raw_quant_field; + ImageF& quant_field = enc_state->initial_quant_field; + + const float butteraugli_target = cparams.butteraugli_distance; + ButteraugliParams params = cparams.ba_params; + params.intensity_target = linear.metadata()->IntensityTarget(); + // Hack the default intensity target value to be 80.0, the intensity + // target of sRGB images and a more reasonable viewing default than + // JPEG XL file format's default. 
+ if (fabs(params.intensity_target - 255.0f) < 1e-3) { + params.intensity_target = 80.0f; + } + JxlButteraugliComparator comparator(params); + JXL_CHECK(comparator.SetReferenceImage(linear)); + bool lower_is_better = (comparator.GoodQualityScore() < comparator.BadQualityScore()); + const float initial_quant_dc = InitialQuantDC(butteraugli_target); + AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field), &quant_field); + ImageF tile_distmap; + ImageF initial_quant_field = CopyImage(quant_field); + + float initial_qf_min, initial_qf_max; + ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max); + float initial_qf_ratio = initial_qf_max / initial_qf_min; + float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio); + float asymmetry = 2; + if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low; + float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low); + float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry); + + JXL_ASSERT(qf_higher / qf_lower < 253); + + constexpr int kOriginalComparisonRound = 1; + int iters = cparams.max_butteraugli_iters; + if (iters > 7) { + iters = 7; + } + if (cparams.speed_tier != SpeedTier::kTortoise) { + iters = 2; + } + for (int i = 0; i < iters + 1; ++i) { + if (FLAGS_dump_quant_state) { + printf("\nQuantization field:\n"); + for (size_t y = 0; y < quant_field.ysize(); ++y) { + for (size_t x = 0; x < quant_field.xsize(); ++x) { + printf(" %.5f", quant_field.Row(y)[x]); + } + printf("\n"); + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); + ImageBundle linear = RoundtripImage(opsin, enc_state, pool); + PROFILER_ZONE("enc Butteraugli"); + float score; + ImageF diffmap; + JXL_CHECK(comparator.CompareWith(linear, &diffmap, &score)); + if (!lower_is_better) { + score = -score; + diffmap = ScaleImage(-1.0f, diffmap); + } + tile_distmap = TileDistMap(diffmap, 8, 0, enc_state->shared.ac_strategy); + if (WantDebugOutput(aux_out)) { + 
aux_out->DumpImage(("dec" + ToString(i)).c_str(), *linear.color()); + DumpHeatmaps(aux_out, butteraugli_target, quant_field, tile_distmap, diffmap); + } + if (aux_out != nullptr) ++aux_out->num_butteraugli_iters; + if (FLAGS_log_search_state) { + float minval, maxval; + ImageMinMax(quant_field, &minval, &maxval); + printf("\nButteraugli iter: %d/%d\n", i, cparams.max_butteraugli_iters); + printf("Butteraugli distance: %f\n", score); + printf("quant range: %f ... %f DC quant: %f\n", minval, maxval, initial_quant_dc); + if (FLAGS_dump_quant_state) { + quantizer.DumpQuantizationMap(raw_quant_field); + } + } + + if (i == iters) break; + + double kPow[8] = { + 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + }; + double kPowMod[8] = { + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + }; + if (i == kOriginalComparisonRound) { + // Don't allow optimization to make the quant field a lot worse than + // what the initial guess was. This allows the AC field to have enough + // precision to reduce the oscillations due to the dc reconstruction. 
+ double kInitMul = 0.6; + const double kOneMinusInitMul = 1.0 - kInitMul; + for (size_t y = 0; y < quant_field.ysize(); ++y) { + float* const JXL_RESTRICT row_q = quant_field.Row(y); + const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x]; + if (row_q[x] < clamp) { + row_q[x] = clamp; + if (row_q[x] > qf_higher) row_q[x] = qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } + } + + double cur_pow = 0.0; + if (i < 7) { + cur_pow = kPow[i] + (butteraugli_target - 1.0) * kPowMod[i]; + if (cur_pow < 0) { + cur_pow = 0; + } + } + if (cur_pow == 0.0) { + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y); + float* const JXL_RESTRICT row_q = quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + const float diff = row_dist[x] / butteraugli_target; + if (diff > 1.0f) { + float old = row_q[x]; + row_q[x] *= diff; + int qf_old = old * quantizer.InvGlobalScale() + 0.5; + int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5; + if (qf_old == qf_new) { + row_q[x] = old + quantizer.Scale(); + } + } + if (row_q[x] > qf_higher) row_q[x] = qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } else { + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y); + float* const JXL_RESTRICT row_q = quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + const float diff = row_dist[x] / butteraugli_target; + if (diff <= 1.0f) { + row_q[x] *= std::pow(diff, cur_pow); + } else { + float old = row_q[x]; + row_q[x] *= diff; + int qf_old = old * quantizer.InvGlobalScale() + 0.5; + int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5; + if (qf_old == qf_new) { + row_q[x] = old + quantizer.Scale(); + } + } + if (row_q[x] > qf_higher) row_q[x] = 
qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); +} + +void FindBestQuantizationMaxError(const Image3F& opsin, + PassesEncoderState* enc_state, + ThreadPool* pool, + AuxOut* aux_out) { + // TODO(veluca): this only works if opsin is in XYB. The current encoder does + // not have code paths that produce non-XYB opsin here. + JXL_CHECK(enc_state->shared.frame_header.color_transform == ColorTransform::kXYB); + const CompressParams& cparams = enc_state->cparams; + Quantizer& quantizer = enc_state->shared.quantizer; + ImageI& raw_quant_field = enc_state->shared.raw_quant_field; + ImageF& quant_field = enc_state->initial_quant_field; + + // TODO(veluca): better choice of this value. + const float initial_quant_dc = 16 * std::sqrt(0.1f / cparams.butteraugli_distance); + AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field), &quant_field); + + const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0], 1.0f / enc_state->cparams.max_error[1], + 1.0f / enc_state->cparams.max_error[2]}; + + for (int i = 0; i < cparams.max_butteraugli_iters + 1; ++i) { + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); + if (aux_out) { + aux_out->DumpXybImage(("ops" + ToString(i)).c_str(), opsin); + } + ImageBundle decoded = RoundtripImage(opsin, enc_state, pool); + if (aux_out) { + aux_out->DumpXybImage(("dec" + ToString(i)).c_str(), *decoded.color()); + } + + for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) { + AcStrategyRow ac_strategy_row = enc_state->shared.ac_strategy.ConstRow(by); + for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) { + AcStrategy acs = ac_strategy_row[bx]; + if (!acs.IsFirstBlock()) continue; + float max_error = 0; + for (size_t c = 0; c < 3; c++) { + for (size_t y = by * kBlockDim; y < (by + acs.covered_blocks_y()) * kBlockDim; y++) { + if (y >= decoded.ysize()) continue; + 
const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y); + const float* JXL_RESTRICT dec_row = decoded.color()->ConstPlaneRow(c, y); + for (size_t x = bx * kBlockDim; x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) { + if (x >= decoded.xsize()) continue; + max_error = std::max(std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error); + } + } + } + // Target an error between max_error/2 and max_error. + // If the error in the varblock is above the target, increase the qf to + // compensate. If the error is below the target, decrease the qf. + // However, to avoid an excessive increase of the qf, only do so if the + // error is less than half the maximum allowed error. + const float qf_mul = (max_error < 0.5f) ? max_error * 2.0f : (max_error > 1.0f) ? max_error : 1.0f; + for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) { + float* JXL_RESTRICT quant_field_row = quant_field.Row(qy); + for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) { + quant_field_row[qx] *= qf_mul; + } + } + } + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); +} + +} // namespace + +void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect, ImageF* quant_field) { + // Replace the whole quant_field in non-8x8 blocks with the maximum of each + // 8x8 block. 
+ size_t stride = quant_field->PixelsPerRow(); + for (size_t y = 0; y < rect.ysize(); ++y) { + AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y); + float* JXL_RESTRICT quant_row = rect.Row(quant_field, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + AcStrategy acs = ac_strategy_row[x]; + if (!acs.IsFirstBlock()) continue; + JXL_ASSERT(x + acs.covered_blocks_x() <= quant_field->xsize()); + JXL_ASSERT(y + acs.covered_blocks_y() <= quant_field->ysize()); + float max = quant_row[x]; + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + max = std::max(quant_row[x + ix + iy * stride], max); + } + } + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + quant_row[x + ix + iy * stride] = max; + } + } + } + } +} + +float InitialQuantDC(float butteraugli_target) { + const float kDcMul = 2.9; // Butteraugli target where non-linearity kicks in. + const float butteraugli_target_dc = std::max( + 0.5f * butteraugli_target, + std::min(butteraugli_target, kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target, kDcQuantPow))); + // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc. + // The maximum DC value might not be in the kXybRange because of inverse + // gaborish, so we add some slack to the maximum theoretical quant obtained + // this way (64). 
+ return std::min(kDcQuant / butteraugli_target_dc, 50.f); +} + +ImageF InitialQuantField(const float butteraugli_target, + const Image3F& opsin, + const FrameDimensions& frame_dim, + ThreadPool* pool, + float rescale, + ImageF* mask) { + PROFILER_FUNC; + const float quant_ac = kAcQuant / butteraugli_target; + return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)(butteraugli_target, opsin, frame_dim, quant_ac * rescale, pool, + mask); +} + +void FindBestQuantizer(const ImageBundle* linear, + const Image3F& opsin, + PassesEncoderState* enc_state, + ThreadPool* pool, + AuxOut* aux_out, + double rescale) { + const CompressParams& cparams = enc_state->cparams; + if (cparams.max_error_mode) { + PROFILER_ZONE("enc find best maxerr"); + FindBestQuantizationMaxError(opsin, enc_state, pool, aux_out); + } else if (cparams.speed_tier <= SpeedTier::kKitten) { + // Normal encoding to a butteraugli score. + PROFILER_ZONE("enc find best2"); + FindBestQuantization(*linear, opsin, enc_state, pool, aux_out); + } +} + +ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state, ThreadPool* pool) { + PROFILER_ZONE("enc roundtrip"); + std::unique_ptr dec_state = jxl::make_unique(); + JXL_CHECK(dec_state->output_encoding_info.Set( + *enc_state->shared.metadata, ColorEncoding::LinearSRGB(enc_state->shared.metadata->m.color_encoding.IsGray()))); + dec_state->shared = &enc_state->shared; + JXL_ASSERT(opsin.ysize() % kBlockDim == 0); + + const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim); + const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim); + const size_t num_groups = xsize_groups * ysize_groups; + + size_t num_special_frames = enc_state->special_frames.size(); + + std::unique_ptr modular_frame_encoder = + jxl::make_unique(enc_state->shared.frame_header, enc_state->cparams); + /* InitializePassesEncoder(opsin, pool, enc_state, modular_frame_encoder.get(), + nullptr);*/ + JXL_CHECK(dec_state->Init()); + dec_state->InitForAC(pool); + + ImageBundle 
decoded(&enc_state->shared.metadata->m); + decoded.origin = enc_state->shared.frame_header.frame_origin; + decoded.SetFromImage(Image3F(opsin.xsize(), opsin.ysize()), dec_state->output_encoding_info.color_encoding); + + // Same as dec_state->shared->frame_header.nonserialized_metadata->m + const ImageMetadata& metadata = *decoded.metadata(); + if (!metadata.extra_channel_info.empty()) { + // Add dummy extra channels to the dec_state: FinalizeFrameDecoding moves + // these extra channels to the ImageBundle, and is required that the amount + // of extra channels matches its metadata()->extra_channel_info.size(). + // Normally we'd place these extra channels in the ImageBundle, but in this + // case FinalizeFrameDecoding is the one that does this. + std::vector extra_channels; + extra_channels.reserve(metadata.extra_channel_info.size()); + for (size_t i = 0; i < metadata.extra_channel_info.size(); i++) { + extra_channels.emplace_back(decoded.xsize(), decoded.ysize()); + // Must initialize the image with data to not affect blending with + // uninitialized memory. 
+ ZeroFillImage(&extra_channels.back()); + } + dec_state->extra_channels = std::move(extra_channels); + } + + hwy::AlignedUniquePtr group_dec_caches; + const auto allocate_storage = [&](size_t num_threads) { + dec_state->EnsureStorage(num_threads); + group_dec_caches = hwy::MakeUniqueAlignedArray(num_threads); + return true; + }; + const auto process_group = [&](const int group_index, const int thread) { + if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) { + ComputeSigma(dec_state->shared->BlockGroupRect(group_index), dec_state.get()); + } + JXL_CHECK(DecodeGroupForRoundtrip(enc_state->coeffs, group_index, dec_state.get(), &group_dec_caches[thread], + thread, &decoded, nullptr)); + }; + RunOnPool(pool, 0, num_groups, allocate_storage, process_group, "AQ loop"); + + // Fine to do a JXL_ASSERT instead of error handling, since this only happens + // on the encoder side where we can't be fed with invalid data. + JXL_CHECK(FinalizeFrameDecoding(&decoded, dec_state.get(), pool, + /*force_fir=*/false, /*skip_blending=*/true)); + // Ensure we don't create any new special frames. + enc_state->special_frames.resize(num_special_frames); + + return decoded; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/others/src/acc_enc_cache.cpp b/codec/L2/demos/jxlEnc/others/src/acc_enc_cache.cpp new file mode 100644 index 0000000000..75a87723b1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_enc_cache.cpp @@ -0,0 +1,204 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_cache.h" + +#include +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/enc_frame.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +void InitializePassesEncoder(const Image3F& opsin, + ThreadPool* pool, + PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + AuxOut* aux_out, + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32) { + PROFILER_FUNC; + + PassesSharedState& JXL_RESTRICT shared = enc_state->shared; + + enc_state->histogram_idx.resize(shared.frame_dim.num_groups); + + enc_state->x_qm_multiplier = std::pow(1.25f, shared.frame_header.x_qm_scale - 2.0f); + enc_state->b_qm_multiplier = std::pow(1.25f, shared.frame_header.b_qm_scale - 2.0f); + + if (enc_state->coeffs.size() < shared.frame_header.passes.num_passes) { + enc_state->coeffs.reserve(shared.frame_header.passes.num_passes); + for (size_t i = enc_state->coeffs.size(); i < shared.frame_header.passes.num_passes; i++) { + // Allocate enough coefficients for each group on every row. 
+ enc_state->coeffs.emplace_back( + make_unique >(kGroupDim * kGroupDim, shared.frame_dim.num_groups)); + } + } + while (enc_state->coeffs.size() > shared.frame_header.passes.num_passes) { + enc_state->coeffs.pop_back(); + } + + Image3F dc(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + /* RunOnPool( + pool, 0, shared.frame_dim.num_groups, ThreadPool::SkipInit(), + [&](size_t group_idx, size_t _) { + ComputeCoefficients(group_idx, enc_state, opsin, &dc,xsize, ysize, dctIDT, dct2x2, + dct4x4, dct8x8, dct16x16, dct32x32, dcIDT, dc2x2, dc4x4, + dc8x8, dc16x16, dc32x32); + }, + "Compute coeffs");*/ + + for (int i = 0; i < shared.frame_dim.num_groups; i++) { + ComputeCoefficients(i, enc_state, opsin, &dc, xsize, ysize, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, dct32x32, + dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } + + if (shared.frame_header.flags & FrameHeader::kUseDcFrame) { + CompressParams cparams = enc_state->cparams; + // Guess a distance that produces good initial results. + cparams.butteraugli_distance = + std::max(kMinButteraugliDistance, enc_state->cparams.butteraugli_distance * 0.1f); + cparams.dots = Override::kOff; + cparams.noise = Override::kOff; + cparams.patches = Override::kOff; + cparams.gaborish = Override::kOff; + cparams.epf = 0; + cparams.max_error_mode = true; + cparams.resampling = 1; + cparams.ec_resampling = 1; + for (size_t c = 0; c < 3; c++) { + cparams.max_error[c] = shared.quantizer.MulDC()[c]; + } + JXL_ASSERT(cparams.progressive_dc > 0); + cparams.progressive_dc--; + // The DC frame will have alpha=0. Don't erase its contents. + cparams.keep_invisible = Override::kOn; + // No EPF or Gaborish in DC frames. 
+ cparams.epf = 0; + cparams.gaborish = Override::kOff; + // Use kVarDCT in max_error_mode for intermediate progressive DC, + // and kModular for the smallest DC (first in the bitstream) + if (cparams.progressive_dc == 0) { + cparams.modular_mode = true; + cparams.quality_pair.first = cparams.quality_pair.second = + 99.f - enc_state->cparams.butteraugli_distance * 0.2f; + } + ImageBundle ib(&shared.metadata->m); + // This is a lie - dc is in XYB + // (but EncodeFrame will skip RGB->XYB conversion anyway) + ib.SetFromImage(std::move(dc), ColorEncoding::LinearSRGB(shared.metadata->m.color_encoding.IsGray())); + if (!ib.metadata()->extra_channel_info.empty()) { + // Add dummy extra channels to the patch image: dc_level frames do not yet + // support extra channels, but the codec expects that the amount of extra + // channels in frames matches that in the metadata of the codestream. + std::vector extra_channels; + extra_channels.reserve(ib.metadata()->extra_channel_info.size()); + for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) { + extra_channels.emplace_back(ib.xsize(), ib.ysize()); + // Must initialize the image with data to not affect blending with + // uninitialized memory. + // TODO(lode): dc_level must copy and use the real extra channels + // instead. + ZeroFillImage(&extra_channels.back()); + } + ib.SetExtraChannels(std::move(extra_channels)); + } + std::unique_ptr state = jxl::make_unique(); + + auto special_frame = std::unique_ptr(new BitWriter()); + FrameInfo dc_frame_info; + dc_frame_info.frame_type = FrameType::kDCFrame; + dc_frame_info.dc_level = shared.frame_header.dc_level + 1; + dc_frame_info.ib_needs_color_transform = false; + dc_frame_info.save_before_color_transform = true; // Implicitly true + // TODO(lode): the EncodeFrame / DecodeFrame pair here is likely broken in + // case of dc_level >= 3, since EncodeFrame may output multiple frames + // to the bitwriter, while DecodeFrame reads only one. 
+ JXL_CHECK( + EncodeFrame(cparams, dc_frame_info, shared.metadata, ib, state.get(), pool, special_frame.get(), nullptr)); + const Span encoded = special_frame->GetSpan(); + enc_state->special_frames.emplace_back(std::move(special_frame)); + + BitReader br(encoded); + ImageBundle decoded(&shared.metadata->m); + std::unique_ptr dec_state = jxl::make_unique(); + JXL_CHECK(dec_state->output_encoding_info.Set( + *shared.metadata, ColorEncoding::LinearSRGB(shared.metadata->m.color_encoding.IsGray()))); + JXL_CHECK(DecodeFrame({}, dec_state.get(), pool, &br, &decoded, *shared.metadata, /*constraints=*/nullptr)); + // TODO(lode): shared.frame_header.dc_level should be equal to + // dec_state.shared->frame_header.dc_level - 1 here, since above we set + // dc_frame_info.dc_level = shared.frame_header.dc_level + 1, and + // dc_frame_info.dc_level is used by EncodeFrame. However, if EncodeFrame + // outputs multiple frames, this assumption could be wrong. + shared.dc_storage = CopyImage(dec_state->shared->dc_frames[shared.frame_header.dc_level]); + ZeroFillImage(&shared.quant_dc); + shared.dc = &shared.dc_storage; + JXL_CHECK(br.Close()); + } else { + auto compute_dc_coeffs = [&](int group_index, int /* thread */) { + modular_frame_encoder->AddVarDCTDC(dc, group_index, enc_state->cparams.butteraugli_distance >= 2.0f && + enc_state->cparams.speed_tier < SpeedTier::kFalcon, + enc_state); + }; + RunOnPool(pool, 0, shared.frame_dim.num_dc_groups, ThreadPool::SkipInit(), compute_dc_coeffs, + "Compute DC coeffs"); + // TODO(veluca): this is only useful in tests and if inspection is enabled. 
+ if (!(shared.frame_header.flags & FrameHeader::kSkipAdaptiveDCSmoothing)) { + AdaptiveDCSmoothing(shared.quantizer.MulDC(), &shared.dc_storage, pool); + } + } + auto compute_ac_meta = [&](int group_index, int /* thread */) { + modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/false, enc_state); + }; + RunOnPool(pool, 0, shared.frame_dim.num_dc_groups, ThreadPool::SkipInit(), compute_ac_meta, "Compute AC Metadata"); + + if (aux_out != nullptr) { + aux_out->InspectImage3F("compressed_image:InitializeFrameEncCache:dc_dec", shared.dc_storage); + } +} + +void EncCache::InitOnce() { + PROFILER_FUNC; + + if (num_nzeroes.xsize() == 0) { + num_nzeroes = Image3I(kGroupDimInBlocks, kGroupDimInBlocks); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/others/src/acc_enc_chroma_from_luma.cpp b/codec/L2/demos/jxlEnc/others/src/acc_enc_chroma_from_luma.cpp new file mode 100644 index 0000000000..7fca240d0b --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_enc_chroma_from_luma.cpp @@ -0,0 +1,536 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "acc_enc_chroma_from_luma.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "xilinx/src/acc_enc_chroma_from_luma.cpp" +#include +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_transforms-inl.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/quantizer.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +static HWY_FULL(float) df; + +struct CFLFunction { + static constexpr float kCoeff = 1.f / 3; + static constexpr float kThres = 100.0f; + static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; + CFLFunction(const float* values_m, const float* values_s, size_t num, float base, float distance_mul) + : values_m(values_m), values_s(values_s), num(num), base(base), distance_mul(distance_mul) {} + + // Returns f'(x), where f is 1/3 * sum ((|color residual| + 1)^2-1) + + // distance_mul * x^2 * num. 
+ float Compute(float x, float eps, float* fpeps, float* fmeps) const { + float first_derivative = 2 * distance_mul * num * x; + float first_derivative_peps = 2 * distance_mul * num * (x + eps); + float first_derivative_meps = 2 * distance_mul * num * (x - eps); + + const auto inv_color_factor = Set(df, kInvColorFactor); + const auto thres = Set(df, kThres); + const auto coeffx2 = Set(df, kCoeff * 2.0f); + const auto one = Set(df, 1.0f); + const auto zero = Set(df, 0.0f); + const auto base_v = Set(df, base); + const auto x_v = Set(df, x); + const auto xpe_v = Set(df, x + eps); + const auto xme_v = Set(df, x - eps); + auto fd_v = Zero(df); + auto fdpe_v = Zero(df); + auto fdme_v = Zero(df); + JXL_ASSERT(num % Lanes(df) == 0); + + for (size_t i = 0; i < num; i += Lanes(df)) { + // color residual = ax + b + const auto a = inv_color_factor * Load(df, values_m + i); + const auto b = base_v * Load(df, values_m + i) - Load(df, values_s + i); + const auto v = a * x_v + b; + const auto vpe = a * xpe_v + b; + const auto vme = a * xme_v + b; + const auto av = Abs(v); + const auto avpe = Abs(vpe); + const auto avme = Abs(vme); + auto d = coeffx2 * (av + one) * a; + auto dpe = coeffx2 * (avpe + one) * a; + auto dme = coeffx2 * (avme + one) * a; + d = IfThenElse(v < zero, zero - d, d); + dpe = IfThenElse(vpe < zero, zero - dpe, dpe); + dme = IfThenElse(vme < zero, zero - dme, dme); + fd_v += IfThenElse(av >= thres, zero, d); + fdpe_v += IfThenElse(av >= thres, zero, dpe); + fdme_v += IfThenElse(av >= thres, zero, dme); + } + + *fpeps = first_derivative_peps + GetLane(SumOfLanes(fdpe_v)); + *fmeps = first_derivative_meps + GetLane(SumOfLanes(fdme_v)); + return first_derivative + GetLane(SumOfLanes(fd_v)); + } + + const float* JXL_RESTRICT values_m; + const float* JXL_RESTRICT values_s; + size_t num; + float base; + float distance_mul; +}; + +int32_t FindBestMultiplier( + const float* values_m, const float* values_s, size_t num, float base, float distance_mul, bool fast) { + if 
(num == 0) { + return 0; + } + float x; + if (fast) { + static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; + auto ca = Zero(df); + auto cb = Zero(df); + const auto inv_color_factor = Set(df, kInvColorFactor); + const auto base_v = Set(df, base); + for (size_t i = 0; i < num; i += Lanes(df)) { + // color residual = ax + b + const auto a = inv_color_factor * Load(df, values_m + i); + const auto b = base_v * Load(df, values_m + i) - Load(df, values_s + i); + ca = MulAdd(a, a, ca); + cb = MulAdd(a, b, cb); + } + // + distance_mul * x^2 * num + x = -GetLane(SumOfLanes(cb)) / (GetLane(SumOfLanes(ca)) + num * distance_mul * 0.5f); + } else { + constexpr float eps = 1; + constexpr float kClamp = 20.0f; + CFLFunction fn(values_m, values_s, num, base, distance_mul); + x = 0; + // Up to 20 Newton iterations, with approximate derivatives. + // Derivatives are approximate due to the high amount of noise in the exact + // derivatives. + for (size_t i = 0; i < 20; i++) { + float dfpeps, dfmeps; + float df = fn.Compute(x, eps, &dfpeps, &dfmeps); + float ddf = (dfpeps - dfmeps) / (2 * eps); + float step = df / ddf; + x -= std::min(kClamp, std::max(-kClamp, step)); + if (std::abs(step) < 3e-3) break; + } + } + return std::max(-128.0f, std::min(127.0f, roundf(x))); +} + +void InitDCStorage(size_t num_blocks, ImageF* dc_values) { + // First row: Y channel + // Second row: X channel + // Third row: Y channel + // Fourth row: B channel + *dc_values = ImageF(RoundUpTo(num_blocks, Lanes(df)), 4); + + JXL_ASSERT(dc_values->xsize() != 0); + // Zero-fill the last lanes + for (size_t y = 0; y < 4; y++) { + for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize(); x++) { + dc_values->Row(y)[x] = 0; + } + } +} + +void ComputeDC(const ImageF& dc_values, bool fast, int* dc_x, int* dc_b) { + constexpr float kDistanceMultiplierDC = 1e-5f; + const float* JXL_RESTRICT dc_values_yx = dc_values.Row(0); + const float* JXL_RESTRICT dc_values_x = dc_values.Row(1); + const 
float* JXL_RESTRICT dc_values_yb = dc_values.Row(2); + const float* JXL_RESTRICT dc_values_b = dc_values.Row(3); + *dc_x = FindBestMultiplier(dc_values_yx, dc_values_x, dc_values.xsize(), 0.0f, kDistanceMultiplierDC, fast); + *dc_b = FindBestMultiplier(dc_values_yb, dc_values_b, dc_values.xsize(), kYToBRatio, kDistanceMultiplierDC, fast); +} + +void ComputeTile(const Image3F& opsin, + const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, + const Quantizer* quantizer, + const Rect& r, + bool fast, + bool use_dct8, + ImageSB* map_x, + ImageSB* map_b, + ImageF* dc_values, + float* mem, + + //==========acc interface======== + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32 + //================================ + ) { + static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks, "Invalid color tile dim"); + size_t xsize_blocks = opsin.xsize() / kBlockDim; + constexpr float kDistanceMultiplierAC = 1e-3f; + + const size_t y0 = r.y0(); + const size_t x0 = r.x0(); + const size_t x1 = r.x0() + r.xsize(); + const size_t y1 = r.y0() + r.ysize(); + + int ty = y0 / kColorTileDimInBlocks; + int tx = x0 / kColorTileDimInBlocks; + + int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty); + int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty); + + float* JXL_RESTRICT dc_values_yx = dc_values->Row(0); + float* JXL_RESTRICT dc_values_x = dc_values->Row(1); + float* JXL_RESTRICT dc_values_yb = dc_values->Row(2); + float* JXL_RESTRICT dc_values_b = dc_values->Row(3); + + // All are aligned. 
+ float* HWY_RESTRICT block_y = mem; + float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT coeffs_yx = block_b + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT scratch_space = coeffs_b + kColorTileDim * kColorTileDim; + JXL_DASSERT(scratch_space + 2 * AcStrategy::kMaxCoeffArea == block_y + CfLHeuristics::kItemsPerThread); + + // Small (~256 bytes each) + HWY_ALIGN_MAX float dc_y[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + HWY_ALIGN_MAX float dc_x[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + HWY_ALIGN_MAX float dc_b[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + size_t num_ac = 0; + + for (size_t y = y0; y < y1; ++y) { + const float* JXL_RESTRICT row_y = opsin.ConstPlaneRow(1, y * kBlockDim); + const float* JXL_RESTRICT row_x = opsin.ConstPlaneRow(0, y * kBlockDim); + const float* JXL_RESTRICT row_b = opsin.ConstPlaneRow(2, y * kBlockDim); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = x0; x < x1; x++) { + AcStrategy acs = + use_dct8 ? 
AcStrategy::FromRawStrategy(AcStrategy::Type::DCT) : ac_strategy->ConstRow(y)[x]; + if (!acs.IsFirstBlock()) continue; + size_t xs = acs.covered_blocks_x(); + // TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride, + // block_y, scratch_space); + // DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs); + /* TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride, + block_x, scratch_space); + DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs);*/ + /* TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride, + block_b, scratch_space); + DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs);*/ + + //================color Y AC + size_t tile_xsize = (xsize + 63) / 64 * 64; + size_t tile_ysize = (ysize + 63) / 64 * 64; + for (int i = 0; i < 32 * 32; i++) { + if (acs.RawStrategy() == 0) { + if (i < 64) block_y[i] = dct8x8[1][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 1) { + if (i < 64) block_y[i] = dctIDT[1][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 2) { + if (i < 64) block_y[i] = dct2x2[1][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 3) { + if (i < 64) block_y[i] = dct4x4[1][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 4) { + if (i < 256) block_y[i] = dct16x16[1][16 * 16 * (y / 2 * (tile_xsize / 16) + x / 2) + i]; + } else if (acs.RawStrategy() == 5) { + block_y[i] = dct32x32[1][32 * 32 * (y / 4 * (tile_xsize / 32) + x / 4) + i]; + } else { + std::cout << "unsupported DCT" << std::endl; + } + } + + //================color Y DC + if (acs.RawStrategy() == 0) { + dc_y[0] = dc8x8[1][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 1) { + dc_y[0] = dcIDT[1][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 2) { + dc_y[0] = dc2x2[1][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 3) { + dc_y[0] = dc4x4[1][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 4) { + 
for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + dc_y[i * xs + j] = dc16x16[1][4 * (y / 2 * (tile_xsize / 16) + x / 2) + i * 2 + j]; + } + } + } else if (acs.RawStrategy() == 5) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + dc_y[i * xs + j] = dc32x32[1][16 * (y / 4 * (tile_xsize / 32) + x / 4) + i * 4 + j]; + } + } + } else { + std::cout << "unsupported DCFromLowFREQ" << std::endl; + } + + //================color X AC + for (int i = 0; i < 32 * 32; i++) { + if (acs.RawStrategy() == 0) { + if (i < 64) block_x[i] = dct8x8[0][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 1) { + if (i < 64) block_x[i] = dctIDT[0][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 2) { + if (i < 64) block_x[i] = dct2x2[0][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 3) { + if (i < 64) block_x[i] = dct4x4[0][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 4) { + if (i < 256) block_x[i] = dct16x16[0][16 * 16 * (y / 2 * (tile_xsize / 16) + x / 2) + i]; + } else if (acs.RawStrategy() == 5) { + block_x[i] = dct32x32[0][32 * 32 * (y / 4 * (tile_xsize / 32) + x / 4) + i]; + } else { + std::cout << "unsupported DCT" << std::endl; + } + } + + //================color X DC + if (acs.RawStrategy() == 0) { + dc_x[0] = dc8x8[0][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 1) { + dc_x[0] = dcIDT[0][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 2) { + dc_x[0] = dc2x2[0][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 3) { + dc_x[0] = dc4x4[0][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 4) { + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + dc_x[i * xs + j] = dc16x16[0][4 * (y / 2 * (tile_xsize / 16) + x / 2) + i * 2 + j]; + } + } + } else if (acs.RawStrategy() == 5) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + dc_x[i * xs + j] = dc32x32[0][16 * (y / 4 * 
(tile_xsize / 32) + x / 4) + i * 4 + j]; + } + } + } else { + std::cout << "unsupported DCFromLowFREQ" << std::endl; + } + + //================color B AC + for (int i = 0; i < 32 * 32; i++) { + if (acs.RawStrategy() == 0) { + if (i < 64) block_b[i] = dct8x8[2][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 1) { + if (i < 64) block_b[i] = dctIDT[2][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 2) { + if (i < 64) block_b[i] = dct2x2[2][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 3) { + if (i < 64) block_b[i] = dct4x4[2][64 * (y * (tile_xsize / 8) + x) + i]; + } else if (acs.RawStrategy() == 4) { + if (i < 256) block_b[i] = dct16x16[2][16 * 16 * (y / 2 * (tile_xsize / 16) + x / 2) + i]; + } else if (acs.RawStrategy() == 5) { + block_b[i] = dct32x32[2][32 * 32 * (y / 4 * (tile_xsize / 32) + x / 4) + i]; + } else { + std::cout << "unsupported DCT" << std::endl; + } + } + + //================color B DC + if (acs.RawStrategy() == 0) { + dc_b[0] = dc8x8[2][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 1) { + dc_b[0] = dcIDT[2][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 2) { + dc_b[0] = dc2x2[2][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 3) { + dc_b[0] = dc4x4[2][(y * (tile_xsize / 8) + x)]; + } else if (acs.RawStrategy() == 4) { + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + dc_b[i * xs + j] = dc16x16[2][4 * (y / 2 * (tile_xsize / 16) + x / 2) + i * 2 + j]; + } + } + } else if (acs.RawStrategy() == 5) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + dc_b[i * xs + j] = dc32x32[2][16 * (y / 4 * (tile_xsize / 32) + x / 4) + i * 4 + j]; + } + } + } else { + std::cout << "unsupported DCFromLowFREQ" << std::endl; + } + //=======DCT FINISH + + const float* const JXL_RESTRICT qm_x = dequant.InvMatrix(acs.Strategy(), 0); + const float* const JXL_RESTRICT qm_b = dequant.InvMatrix(acs.Strategy(), 2); + // Why 
does a constant seem to work better than + // raw_quant_field->Row(y)[x] ? + float q = use_dct8 ? 1 : quantizer->Scale() * 400.0f; + float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0); + float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2); + + // Copy DCs in dc_values. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < xs; ix++) { + dc_values_yx[(iy + y) * xsize_blocks + ix + x] = dc_y[iy * xs + ix] * q_dc_x; + dc_values_x[(iy + y) * xsize_blocks + ix + x] = dc_x[iy * xs + ix] * q_dc_x; + dc_values_yb[(iy + y) * xsize_blocks + ix + x] = dc_y[iy * xs + ix] * q_dc_b; + dc_values_b[(iy + y) * xsize_blocks + ix + x] = dc_b[iy * xs + ix] * q_dc_b; + } + } + + // Do not use this block for computing AC CfL. + if (acs.covered_blocks_x() + x0 > x1 || acs.covered_blocks_y() + y0 > y1) { + continue; + } + + // Copy AC coefficients in the local block. The order in which + // coefficients get stored does not matter. + size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + CoefficientLayout(&cy, &cx); + // Zero out LFs. This introduces terms in the optimization loop that + // don't affect the result, as they are all 0, but allow for simpler + // SIMDfication. 
+ for (size_t iy = 0; iy < cy; iy++) { + for (size_t ix = 0; ix < cx; ix++) { + block_y[cx * kBlockDim * iy + ix] = 0; + block_x[cx * kBlockDim * iy + ix] = 0; + block_b[cx * kBlockDim * iy + ix] = 0; + } + } + const auto qv = Set(df, q); + for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) { + const auto b_y = Load(df, block_y + i); + const auto b_x = Load(df, block_x + i); + const auto b_b = Load(df, block_b + i); + const auto qqm_x = qv * Load(df, qm_x + i); + const auto qqm_b = qv * Load(df, qm_b + i); + Store(b_y * qqm_x, df, coeffs_yx + num_ac); + Store(b_x * qqm_x, df, coeffs_x + num_ac); + Store(b_y * qqm_b, df, coeffs_yb + num_ac); + Store(b_b * qqm_b, df, coeffs_b + num_ac); + num_ac += Lanes(df); + } + } + } + JXL_CHECK(num_ac % Lanes(df) == 0); + row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f, kDistanceMultiplierAC, fast); + row_out_b[tx] = FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, kYToBRatio, kDistanceMultiplierAC, fast); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(InitDCStorage); +HWY_EXPORT(ComputeDC); +HWY_EXPORT(ComputeTile); + +void CfLHeuristics::Init(const Image3F& opsin) { + size_t xsize_blocks = opsin.xsize() / kBlockDim; + size_t ysize_blocks = opsin.ysize() / kBlockDim; + HWY_DYNAMIC_DISPATCH(InitDCStorage) + (xsize_blocks * ysize_blocks, &dc_values); +} + +void CfLHeuristics::ComputeTile(const Rect& r, + const Image3F& opsin, + const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, + const Quantizer* quantizer, + bool fast, + size_t thread, + ColorCorrelationMap* cmap, + + //==========acc interface======== + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& 
dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32 + //================================ + ) { + bool use_dct8 = ac_strategy == nullptr; + HWY_DYNAMIC_DISPATCH(ComputeTile) + (opsin, dequant, ac_strategy, quantizer, r, fast, use_dct8, &cmap->ytox_map, &cmap->ytob_map, &dc_values, + mem.get() + thread * kItemsPerThread, xsize, ysize, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, dct32x32, dcIDT, + dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); +} + +void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) { + int32_t ytob_dc = 0; + int32_t ytox_dc = 0; + HWY_DYNAMIC_DISPATCH(ComputeDC)(dc_values, fast, &ytox_dc, &ytob_dc); + cmap->SetYToBDC(ytob_dc); + cmap->SetYToXDC(ytox_dc); +} + +void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer, size_t layer, AuxOut* aux_out) { + float color_factor = map->GetColorFactor(); + float base_correlation_x = map->GetBaseCorrelationX(); + float base_correlation_b = map->GetBaseCorrelationB(); + int32_t ytox_dc = map->GetYToXDC(); + int32_t ytob_dc = map->GetYToBDC(); + + BitWriter::Allotment allotment(writer, 1 + 2 * kBitsPerByte + 12 + 32); + if (ytox_dc == 0 && ytob_dc == 0 && color_factor == kDefaultColorFactor && base_correlation_x == 0.0f && + base_correlation_b == kYToBRatio) { + writer->Write(1, 1); + ReclaimAndCharge(writer, &allotment, layer, aux_out); + return; + } + writer->Write(1, 0); + JXL_CHECK(U32Coder::Write(kColorFactorDist, color_factor, writer)); + JXL_CHECK(F16Coder::Write(base_correlation_x, writer)); + JXL_CHECK(F16Coder::Write(base_correlation_b, writer)); + writer->Write(kBitsPerByte, ytox_dc - std::numeric_limits::min()); + writer->Write(kBitsPerByte, ytob_dc - std::numeric_limits::min()); + ReclaimAndCharge(writer, &allotment, layer, aux_out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/others/src/acc_enc_cluster.cpp b/codec/L2/demos/jxlEnc/others/src/acc_enc_cluster.cpp new file mode 100644 index 
0000000000..6f31a84d63 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_enc_cluster.cpp @@ -0,0 +1,758 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "acc_enc_cluster.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "enc_cluster.cpp" +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/fast_math-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +template +V Entropy(V count, V inv_total, V total) { + const HWY_CAPPED(float, Histogram::kRounding) d; + const auto zero = Set(d, 0.0f); + return IfThenZeroElse(count == total, + zero - count * FastLog2f(d, count) + + count * FastLog2f(d, total)); // zero-count*FastLog2f(d, inv_total * count)); +} + +void HistogramEntropy(const Histogram& a) { + a.entropy_ = 0.0f; + if (a.total_count_ == 0) return; + + const HWY_CAPPED(float, Histogram::kRounding) df; + const HWY_CAPPED(int32_t, Histogram::kRounding) di; + + const auto inv_tot = Set(df, 1.0f / a.total_count_); + auto entropy_lanes = Zero(df); + auto total = Set(df, a.total_count_); + // printf("%s: %s: %d, a.data_.size=%d\n", __FILE__, __FUNCTION__, __LINE__, + // a.data_.size()); + for (size_t i = 0; i < a.data_.size(); i += Lanes(di)) { + const auto counts = LoadU(di, &a.data_[i]); + entropy_lanes += Entropy(ConvertTo(df, counts), inv_tot, total); + } + a.entropy_ += GetLane(SumOfLanes(entropy_lanes)); +} + +float HistogramDistance(const Histogram& a, const Histogram& b) { + if (a.total_count_ == 0 || b.total_count_ == 0) return 0; + + const HWY_CAPPED(float, Histogram::kRounding) df; + const HWY_CAPPED(int32_t, Histogram::kRounding) di; + + const auto inv_tot = Set(df, 1.0f / (a.total_count_ + b.total_count_)); + auto 
distance_lanes = Zero(df); + auto total = Set(df, a.total_count_ + b.total_count_); + + for (size_t i = 0; i < std::max(a.data_.size(), b.data_.size()); i += Lanes(di)) { + const auto a_counts = a.data_.size() > i ? LoadU(di, &a.data_[i]) : Zero(di); + const auto b_counts = b.data_.size() > i ? LoadU(di, &b.data_[i]) : Zero(di); + const auto counts = ConvertTo(df, a_counts + b_counts); + distance_lanes += Entropy(counts, inv_tot, total); + } + const float total_distance = GetLane(SumOfLanes(distance_lanes)); + return total_distance - a.entropy_ - b.entropy_; +} + +// First step of a k-means clustering with a fancy distance metric. +/*void FastClusterHistograms(const std::vector& in, + const size_t num_contexts_in, size_t max_histograms, + float min_distance, std::vector* out, + std::vector* histogram_symbols) { + PROFILER_FUNC; + size_t largest_idx = 0; + std::vector nonempty_histograms; + nonempty_histograms.reserve(in.size()); + int largest_count = 0; + printf("%s: %s: %d, num_contexts_in=%d\n", __FILE__, __FUNCTION__, __LINE__, +num_contexts_in); for (size_t i = 0; i < num_contexts_in; i++) { // get +position for largest total_count_ id in in if (in[i].total_count_ == 0) +continue; HistogramEntropy(in[i]); if (in[i].total_count_ > +in[largest_idx].total_count_) { largest_idx = i; largest_count = +in[i].total_count_; + } + nonempty_histograms.push_back(i); + } + // No symbols. 
+ if (nonempty_histograms.empty()) { + out->resize(1); + histogram_symbols->clear(); + histogram_symbols->resize(in.size(), 0); + return; + } + largest_idx = std::find(nonempty_histograms.begin(), + nonempty_histograms.end(), largest_idx) - + nonempty_histograms.begin(); // get position for largest +total_count_ id in nonempty_histograms size_t num_contexts = +nonempty_histograms.size(); printf("%s: %s: %d, num_contexts of non-empty=%d, +largest_idx=%d, largest_count=%d\n", __FILE__, __FUNCTION__, __LINE__, + num_contexts, largest_idx, largest_count); + out->clear(); + out->reserve(max_histograms); + std::vector dists(num_contexts, std::numeric_limits::max()); + histogram_symbols->resize(in.size(), max_histograms); + + int while_count = 0; + while (out->size() < max_histograms && out->size() < num_contexts) { + (*histogram_symbols)[nonempty_histograms[largest_idx]] = out->size(); + out->push_back(in[nonempty_histograms[largest_idx]]); + largest_idx = 0; + while_count++; + for (size_t i = 0; i < num_contexts; i++) { + dists[i] = std::min( + HistogramDistance(in[nonempty_histograms[i]], out->back()), dists[i]); + // Avoid repeating histograms + if ((*histogram_symbols)[nonempty_histograms[i]] != max_histograms) { + continue; + } + if (dists[i] > dists[largest_idx]) largest_idx = i; + } + if (dists[largest_idx] < min_distance) break; + } + + for (size_t i = 0; i < num_contexts_in; i++) { + if ((*histogram_symbols)[i] != max_histograms) continue; + if (in[i].total_count_ == 0) { + (*histogram_symbols)[i] = 0; + continue; + } + size_t best = 0; + float best_dist = HistogramDistance(in[i], (*out)[best]); + for (size_t j = 1; j < out->size(); j++) { + float dist = HistogramDistance(in[i], (*out)[j]); + if (dist < best_dist) { + best = j; + best_dist = dist; + } + } + (*out)[best].AddHistogram(in[i]); + HistogramEntropy((*out)[best]); + (*histogram_symbols)[i] = best; + } + + printf("%s: %s: %d, out size=%zu, FastClusterHistograms size=%zu, +while_count=%d\n", __FILE__, 
__FUNCTION__, __LINE__, out->size(), +histogram_symbols->size(), while_count); +}*/ + +float accHistogramDistanceEntropy(const Histogram& a, const Histogram& b, bool isEntropy) { + if (!isEntropy) { + if (a.total_count_ == 0 || b.total_count_ == 0) return 0; + } else { + a.entropy_ = 0.0f; + if (a.total_count_ == 0) return 0; + } + + float total; + if (!isEntropy) { + total = a.total_count_ + b.total_count_; + } else { + total = a.total_count_; + } + float totallog2 = total == 0 ? 0 : std::log2(total) /*acc::log2(total)*/; + float distance_lanes = 0; + size_t sum_count = 0; + float sum_dist = 0; + + size_t size; + if (!isEntropy) { + size = std::max(a.data_.size(), b.data_.size()); + } else { + size = a.data_.size(); + } + + for (size_t i = 0; i < size; i++) { + float counts; + if (!isEntropy) { + size_t a_counts = a.data_.size() > i ? a.data_[i] : 0; + size_t b_counts = b.data_.size() > i ? b.data_[i] : 0; + counts = a_counts + b_counts; + } else { + counts = a.data_[i]; + } + + float countlog2 = counts == 0 ? 0 : /*acc::log2(counts)*/ std::log2(counts); + + sum_count += counts == total ? 0 : counts; + sum_dist += counts == total ? 
0 : counts * countlog2; + } + distance_lanes = sum_count * totallog2 - sum_dist; + float result; + if (!isEntropy) { + result = distance_lanes - a.entropy_ - b.entropy_; + } else { + result = distance_lanes; + } + return result; +} + +// clang-format off +float accHistogramDistanceEntropy( +#ifndef __SYNTHESIS__ + bool isEntropy, + int32_t a_size, + int32_t a_total_count, + std::vector a_histo, + int32_t b_size, + int32_t b_total_count, + std::vector b_histo +#else + bool isEntropy, + int32_t a_size, + int32_t a_total_count, + a_histo[40], + int32_t b_size, + int32_t b_total_count, + b_histo[40] +#endif +) { + // clang-format on + if (!isEntropy) { + if (a_total_count == 0 || b_total_count == 0) return 0; + } else { + if (a_total_count == 0) return 0; + } + + float total; + if (!isEntropy) { + total = a_total_count + b_total_count; + } else { + total = a_total_count; + } + float totallog2 = total == 0 ? 0 : /*acc::log2(total)*/ std::log2(total); + float distance_lanes = 0; + size_t sum_count = 0; + float sum_dist = 0; + + size_t size; + if (!isEntropy) { + size = std::max(a_size, b_size); + } else { + size = a_size; + } + + for (size_t i = 0; i < size; i++) { + float counts; + if (!isEntropy) { + size_t a_counts = a_size > i ? a_histo[i] : 0; + size_t b_counts = b_size > i ? b_histo[i] : 0; + counts = a_counts + b_counts; + } else { + counts = a_histo[i]; + } + + float countlog2 = counts == 0 ? 0 : /*acc::log2(counts)*/ std::log2(counts); + + sum_count += counts == total ? 0 : counts; + sum_dist += counts == total ? 
0 : counts * countlog2; + } + distance_lanes = sum_count * totallog2 - sum_dist; + return distance_lanes; +} + +void acc_HistogramDistance(bool isEntropy, + size_t num_contexts, + size_t j, + const std::vector in, + std::vector nonempty_histograms, + Histogram& ref, + std::vector& dists, + std::vector& best, + size_t& largest_idx) { + largest_idx = 0; + for (size_t i = 0; i < num_contexts; i++) { + const Histogram a = in[nonempty_histograms[i]]; + float dist_std = accHistogramDistanceEntropy(isEntropy, a.data_.size(), a.total_count_, a.data_, + ref.data_.size(), ref.total_count_, ref.data_); + if (!isEntropy) { + if (dist_std - a.entropy_ - ref.entropy_ < dists[i]) { + best[i] = j; + dists[i] = dist_std - a.entropy_ - ref.entropy_; + } + } else { + dists[i] = dist_std; + } + if (dists[i] > dists[largest_idx]) largest_idx = i; + } +} + +// clang-format off +void acc_HistogramDistance( +#ifndef __SYNTHESIS__ + bool isEntropy, uint32_t num_contexts, uint32_t j, + + const std::vector acc_histoSize, + const std::vector > acc_uramHisto, + const std::vector > acc_hbmHisto, + const std::vector acc_totalcount, + const std::vector acc_entropy, + std::vector nonempty_histograms, + + uint32_t refSize, + std::vector ref_histo, + uint32_t ref_totalcount, + float ref_entropy, + + std::vector& dists, + std::vector& best, + uint32_t& largest_idx +#else + bool isEntropy, uint32_t num_contexts, uint32_t j, + + uint32_t acc_histoSize[8192], + int32_t acc_uramHisto[4096][40], + int32_t acc_hbmHisto[4096][40], + uint32_t acc_totalcount[8192], + float acc_entropy[8192], + uint32_t nonempty_histograms[8192], + + uint32_t refSize, + int32_t ref_histo[40], + uint32_t ref_totalcount, + float ref_entropy, + + float dists[1024], + uint32_t best[1024], + uint32_t& largest_idx +#endif +) { + // clang-format on + largest_idx = 0; + for (size_t i = 0; i < num_contexts; i++) { + int idx = nonempty_histograms[i]; + std::vector tmp_histo = idx < 4096 ? 
acc_uramHisto[idx] : acc_hbmHisto[idx - 4096]; + float dist_std = accHistogramDistanceEntropy(isEntropy, acc_histoSize[idx], acc_totalcount[idx], tmp_histo, + refSize, ref_totalcount, ref_histo); + if (!isEntropy) { + if (dist_std - acc_entropy[i] - ref_entropy < dists[i]) { + best[i] = j; + dists[i] = dist_std - acc_entropy[i] - ref_entropy; + } + } else { + dists[i] = dist_std; + } + if (dists[i] > dists[largest_idx]) largest_idx = i; + } +} + +void FastClusterHistograms(const std::vector& in, + const size_t num_contexts_in, + size_t max_histograms, + float min_distance, + std::vector* out, + std::vector* histogram_symbols) { + PROFILER_FUNC; + uint32_t largest_idx = 0; + std::vector nonempty_histograms; + nonempty_histograms.reserve(in.size()); + for (size_t i = 0; i < num_contexts_in; i++) { + if (in[i].total_count_ == 0) continue; + + if (in[i].total_count_ > in[largest_idx].total_count_) { + largest_idx = i; + } + nonempty_histograms.push_back(i); + } + + largest_idx = + std::find(nonempty_histograms.begin(), nonempty_histograms.end(), largest_idx) - nonempty_histograms.begin(); + + size_t num_contexts = nonempty_histograms.size(); + std::vector entropy(num_contexts); + // for(size_t i=0;i > acc_uramHisto(4096, std::vector(40, 0)); + std::vector > acc_hbmHisto(4096, std::vector(40, 0)); + std::vector acc_total_count(8192, 0); + std::vector acc_entropy(8192, 0); + std::vector acc_histoSize(8192, 0); + + for (int i = 0; i < in.size(); i++) { + acc_total_count[i] = in[i].total_count_; + acc_entropy[i] = in[i].entropy_; + acc_histoSize[i] = in[i].data_.size(); + for (int j = 0; j < in[i].data_.size(); j++) { + if (i < 4096) { + acc_uramHisto[i][j] = in[i].data_[j]; + } else if (i < 8192) { + acc_hbmHisto[i - 4096][j] = in[i].data_[j]; + } else { + std::cout << "Error Histogram too big!" 
<< std::endl; + } + } + } + + Histogram tmp0; + std::vector tmp1; + uint32_t tmp2; + acc_HistogramDistance(true, num_contexts, 0, acc_histoSize, acc_uramHisto, acc_hbmHisto, acc_total_count, + acc_entropy, nonempty_histograms, tmp0.data_.size(), tmp0.data_, tmp0.total_count_, + tmp0.entropy_, entropy, tmp1, tmp2); + + for (size_t i = 0; i < num_contexts; i++) { + in[nonempty_histograms[i]].entropy_ = entropy[i]; + acc_entropy[nonempty_histograms[i]] = entropy[i]; + } + + // No symbols. + if (nonempty_histograms.empty()) { + out->resize(1); + histogram_symbols->clear(); + histogram_symbols->resize(in.size(), 0); + return; + } + + out->clear(); + out->reserve(max_histograms); + std::vector dists(num_contexts, std::numeric_limits::max()); + std::vector best_tmp(num_contexts, 0); // no use + histogram_symbols->clear(); + histogram_symbols->resize(in.size(), 0); + + while (out->size() < max_histograms && out->size() < num_contexts) { + (*histogram_symbols)[nonempty_histograms[largest_idx]] = out->size(); + out->push_back(in[nonempty_histograms[largest_idx]]); + Histogram backhisto = out->back(); + acc_HistogramDistance(false, num_contexts, 0, acc_histoSize, acc_uramHisto, acc_hbmHisto, acc_total_count, + entropy, nonempty_histograms, backhisto.data_.size(), backhisto.data_, + backhisto.total_count_, backhisto.entropy_, dists, best_tmp, largest_idx); + if (dists[largest_idx] < min_distance) break; + } + + std::vector best_dist(num_contexts, std::numeric_limits::max()); + std::vector best(num_contexts, 0); + + for (size_t j = 0; j < out->size(); j++) { + Histogram outHisto = (*out)[j]; + acc_HistogramDistance(false, num_contexts, j, acc_histoSize, acc_uramHisto, acc_hbmHisto, acc_total_count, + entropy, nonempty_histograms, outHisto.data_.size(), outHisto.data_, + outHisto.total_count_, outHisto.entropy_, best_dist, best, largest_idx); + } + + for (size_t i = 0; i < num_contexts; i++) { + for (size_t j = 0; j < out->size(); j++) { + 
(*out)[best[i]].AddHistogram(in[nonempty_histograms[i]]); + (*histogram_symbols)[nonempty_histograms[i]] = best[i]; + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(FastClusterHistograms); // Local function +HWY_EXPORT(HistogramEntropy); // Local function + +float Histogram::ShannonEntropy() const { + HWY_DYNAMIC_DISPATCH(HistogramEntropy)(*this); + return entropy_; +} + +// Reorder histograms in *out so that the new symbols in *symbols come in +// increasing order. +void HistogramReindex(std::vector* out, std::vector* symbols) { + std::vector tmp(*out); + std::map new_index; + int next_index = 0; + for (uint32_t symbol : *symbols) { + if (new_index.find(symbol) == new_index.end()) { + new_index[symbol] = next_index; + (*out)[next_index] = tmp[symbol]; + ++next_index; + } + } + out->resize(next_index); + for (uint32_t& symbol : *symbols) { + symbol = new_index[symbol]; + } +} + +// Clusters similar histograms in 'in' together, the selected histograms are +// placed in 'out', and for each index in 'in', *histogram_symbols will +// indicate which of the 'out' histograms is the best approximation. 
+void ClusterHistograms(const HistogramParams params, + const std::vector& in, + const size_t num_contexts, + size_t max_histograms, + std::vector* out, + std::vector* histogram_symbols) { + constexpr float kMinDistanceForDistinctFast = 64.0f; + constexpr float kMinDistanceForDistinctBest = 16.0f; + max_histograms = std::min(max_histograms, params.max_histograms); + // printf("%s: %s: %d, max_histograms=%d\n", __FILE__, __FUNCTION__, __LINE__, + // max_histograms); + if (params.clustering == HistogramParams::ClusteringType::kFastest) { + HWY_DYNAMIC_DISPATCH(FastClusterHistograms) + (in, num_contexts, 4, kMinDistanceForDistinctFast, out, histogram_symbols); + } else if (params.clustering == HistogramParams::ClusteringType::kFast) { + HWY_DYNAMIC_DISPATCH(FastClusterHistograms) + (in, num_contexts, max_histograms, kMinDistanceForDistinctFast, out, histogram_symbols); + } else { + PROFILER_FUNC; + HWY_DYNAMIC_DISPATCH(FastClusterHistograms) + (in, num_contexts, max_histograms, kMinDistanceForDistinctBest, out, histogram_symbols); + + // printf("%s: %s: %d, FastClusterHistograms out->size=%d\n", __FILE__, + // __FUNCTION__, __LINE__, out->size()); + for (size_t i = 0; i < out->size(); i++) { + (*out)[i].entropy_ = ANSPopulationCost((*out)[i].data_.data(), (*out)[i].data_.size()); + } + uint32_t next_version = 2; + std::vector version(out->size(), 1); + std::vector renumbering(out->size()); + std::iota(renumbering.begin(), renumbering.end(), 0); + + // Try to pair up clusters if doing so reduces the total cost. + + struct HistogramPair { + // validity of a pair: p.version == max(version[i], version[j]) + float cost; + uint32_t first; + uint32_t second; + uint32_t version; + // We use > because priority queues sort in *decreasing* order, but we + // want lower cost elements to appear first. 
+ bool operator<(const HistogramPair& other) const { + return std::make_tuple(cost, first, second, version) > + std::make_tuple(other.cost, other.first, other.second, other.version); + } + }; + + // Create list of all pairs by increasing merging cost. + std::priority_queue pairs_to_merge; + for (uint32_t i = 0; i < out->size(); i++) { + for (uint32_t j = i + 1; j < out->size(); j++) { + Histogram histo; + histo.AddHistogram((*out)[i]); + histo.AddHistogram((*out)[j]); + float cost = + ANSPopulationCost(histo.data_.data(), histo.data_.size()) - (*out)[i].entropy_ - (*out)[j].entropy_; + // Avoid enqueueing pairs that are not advantageous to merge. + if (cost >= 0) continue; + pairs_to_merge.push(HistogramPair{cost, i, j, std::max(version[i], version[j])}); + } + } + + int merge_count = 0; + // Merge the best pair to merge, add new pairs that get formed as a + // consequence. + while (!pairs_to_merge.empty()) { + merge_count++; + uint32_t first = pairs_to_merge.top().first; + uint32_t second = pairs_to_merge.top().second; + uint32_t ver = pairs_to_merge.top().version; + pairs_to_merge.pop(); + if (ver != std::max(version[first], version[second]) || version[first] == 0 || version[second] == 0) { + continue; + } + (*out)[first].AddHistogram((*out)[second]); + (*out)[first].entropy_ = ANSPopulationCost((*out)[first].data_.data(), (*out)[first].data_.size()); + for (size_t i = 0; i < renumbering.size(); i++) { + if (renumbering[i] == second) { + renumbering[i] = first; + } + } + version[second] = 0; + version[first] = next_version++; + for (uint32_t j = 0; j < out->size(); j++) { + if (j == first) continue; + if (version[j] == 0) continue; + Histogram histo; + histo.AddHistogram((*out)[first]); + histo.AddHistogram((*out)[j]); + float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) - (*out)[first].entropy_ - + (*out)[j].entropy_; + // Avoid enqueueing pairs that are not advantageous to merge. 
+ if (cost >= 0) continue; + pairs_to_merge.push( + HistogramPair{cost, std::min(first, j), std::max(first, j), std::max(version[first], version[j])}); + } + } + std::vector reverse_renumbering(out->size(), -1); + size_t num_alive = 0; + for (size_t i = 0; i < out->size(); i++) { + if (version[i] == 0) continue; + (*out)[num_alive++] = (*out)[i]; + reverse_renumbering[i] = num_alive - 1; + } + out->resize(num_alive); + // printf( + // "%s: %s: %d, culster num_alive=%zu, histogram_symbols size=%zu, " + // "merge_count=%d\n", + // __FILE__, __FUNCTION__, __LINE__, num_alive, + // histogram_symbols->size(), merge_count); + for (size_t i = 0; i < histogram_symbols->size(); i++) { + (*histogram_symbols)[i] = reverse_renumbering[renumbering[(*histogram_symbols)[i]]]; + } + } + + // Convert the context map to a canonical form. + HistogramReindex(out, histogram_symbols); + // printf("%s: %s: %d, culster final out size=%zu, histogram_symbols + // size=%zu\n", + // __FILE__, __FUNCTION__, __LINE__, out->size(), + // histogram_symbols->size()); +} + +void acc_FastClusterHistograms(const std::vector& in, + std::vector nonempty_histograms, + uint32_t largest_idx_in, + const size_t num_contexts, + size_t max_histograms, + float min_distance, + std::vector* out, + std::vector* histogram_symbols) { + PROFILER_FUNC; + + uint32_t largest_idx = largest_idx_in; + std::vector entropy(num_contexts); + // for(size_t i=0;i > acc_uramHisto(4096, std::vector(40, 0)); + std::vector > acc_hbmHisto(4096, std::vector(40, 0)); + std::vector acc_total_count(8192, 0); + std::vector acc_entropy(8192, 0); + std::vector acc_histoSize(8192, 0); + + for (int i = 0; i < in.size(); i++) { + acc_total_count[i] = in[i].total_count_; + acc_entropy[i] = in[i].entropy_; + acc_histoSize[i] = in[i].data_.size(); + for (int j = 0; j < in[i].data_.size(); j++) { + if (i < 4096) { + acc_uramHisto[i][j] = in[i].data_[j]; + } else if (i < 8192) { + acc_hbmHisto[i - 4096][j] = in[i].data_[j]; + } else { + std::cout 
<< "Error Histogram too big!" << std::endl; + } + } + } + + Histogram tmp0; + std::vector tmp1; + uint32_t tmp2; + jxl::N_SCALAR::acc_HistogramDistance(true, num_contexts, 0, acc_histoSize, acc_uramHisto, acc_hbmHisto, + acc_total_count, acc_entropy, nonempty_histograms, tmp0.data_.size(), + tmp0.data_, tmp0.total_count_, tmp0.entropy_, entropy, tmp1, tmp2); + + for (size_t i = 0; i < num_contexts; i++) { + in[nonempty_histograms[i]].entropy_ = entropy[i]; + acc_entropy[nonempty_histograms[i]] = entropy[i]; + } + + // No symbols. + if (nonempty_histograms.empty()) { + out->resize(1); + histogram_symbols->clear(); + histogram_symbols->resize(in.size(), 0); + return; + } + + out->clear(); + out->reserve(max_histograms); + std::vector dists(num_contexts, std::numeric_limits::max()); + std::vector best_tmp(num_contexts, 0); // no use + histogram_symbols->clear(); + histogram_symbols->resize(in.size(), 0); + + while (out->size() < max_histograms && out->size() < num_contexts) { + (*histogram_symbols)[nonempty_histograms[largest_idx]] = out->size(); + out->push_back(in[nonempty_histograms[largest_idx]]); + Histogram backhisto = out->back(); + jxl::N_SCALAR::acc_HistogramDistance(false, num_contexts, 0, acc_histoSize, acc_uramHisto, acc_hbmHisto, + acc_total_count, entropy, nonempty_histograms, backhisto.data_.size(), + backhisto.data_, backhisto.total_count_, backhisto.entropy_, dists, + best_tmp, largest_idx); + if (dists[largest_idx] < min_distance) break; + } + + std::vector best_dist(num_contexts, std::numeric_limits::max()); + std::vector best(num_contexts, 0); + + for (size_t j = 0; j < out->size(); j++) { + Histogram outHisto = (*out)[j]; + jxl::N_SCALAR::acc_HistogramDistance(false, num_contexts, j, acc_histoSize, acc_uramHisto, acc_hbmHisto, + acc_total_count, entropy, nonempty_histograms, outHisto.data_.size(), + outHisto.data_, outHisto.total_count_, outHisto.entropy_, best_dist, best, + largest_idx); + } + + for (size_t i = 0; i < num_contexts; i++) { + for 
(size_t j = 0; j < out->size(); j++) { + (*out)[best[i]].AddHistogram(in[nonempty_histograms[i]]); + (*histogram_symbols)[nonempty_histograms[i]] = best[i]; + } + } +} + +void ClusterHistogramsNew(const HistogramParams params, + const std::vector& in, + const size_t num_contexts, + size_t max_histograms, + std::vector* out, + std::vector* histogram_symbols) { + constexpr float kMinDistanceForDistinctFast = 64.0f; + constexpr float kMinDistanceForDistinctBest = 16.0f; + max_histograms = std::min(max_histograms, params.max_histograms); + // printf("%s: %s: %d, max_histograms=%d\n", __FILE__, __FUNCTION__, __LINE__, + // max_histograms); + + uint32_t largest_idx = 0; + std::vector nonempty_histograms; + nonempty_histograms.reserve(in.size()); + for (size_t i = 0; i < num_contexts; i++) { + if (in[i].total_count_ == 0) continue; + + if (in[i].total_count_ > in[largest_idx].total_count_) { + largest_idx = i; + } + nonempty_histograms.push_back(i); + } + + largest_idx = + std::find(nonempty_histograms.begin(), nonempty_histograms.end(), largest_idx) - nonempty_histograms.begin(); + + acc_FastClusterHistograms(in, nonempty_histograms, largest_idx, nonempty_histograms.size(), max_histograms, + kMinDistanceForDistinctFast, out, histogram_symbols); + + // Convert the context map to a canonical form. + HistogramReindex(out, histogram_symbols); + // printf("%s: %s: %d, culster final out size=%zu, histogram_symbols + // size=%zu\n", + // __FILE__, __FUNCTION__, __LINE__, out->size(), + // histogram_symbols->size()); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp b/codec/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp new file mode 100644 index 0000000000..f96e3592e6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_enc_frame.cpp @@ -0,0 +1,584 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "acc_host.hpp" +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "acc_enc_chroma_from_luma.hpp" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "acc_enc_group.hpp" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { +namespace { + +uint64_t 
FrameFlagsFromParams(const CompressParams& cparams) { + uint64_t flags = 0; + + const float dist = cparams.butteraugli_distance; + + // We don't add noise at low butteraugli distances because the original + // noise is stored within the compressed image and adding noise makes things + // worse. + if (ApplyOverride(cparams.noise, dist >= kMinButteraugliForNoise) || cparams.photon_noise_iso > 0) { + flags |= FrameHeader::kNoise; + } + + if (cparams.progressive_dc > 0 && cparams.modular_mode == false) { + flags |= FrameHeader::kUseDcFrame; + } + + return flags; +} + +Status LoopFilterFromParams(const CompressParams& cparams, FrameHeader* JXL_RESTRICT frame_header) { + LoopFilter* loop_filter = &frame_header->loop_filter; + + // Gaborish defaults to enabled in Hare or slower. + loop_filter->gab = ApplyOverride(cparams.gaborish, cparams.speed_tier <= SpeedTier::kHare && + frame_header->encoding == FrameEncoding::kVarDCT && + cparams.decoding_speed_tier < 4); + + if (cparams.epf != -1) { + loop_filter->epf_iters = cparams.epf; + } else { + if (frame_header->encoding == FrameEncoding::kModular) { + loop_filter->epf_iters = 0; + } else { + constexpr float kThresholds[3] = {0.7, 1.5, 4.0}; + loop_filter->epf_iters = 0; + if (cparams.decoding_speed_tier < 3) { + for (size_t i = cparams.decoding_speed_tier == 2 ? 1 : 0; i < 3; i++) { + if (cparams.butteraugli_distance >= kThresholds[i]) { + loop_filter->epf_iters++; + } + } + } + } + } + // Strength of EPF in modular mode. + if (frame_header->encoding == FrameEncoding::kModular && cparams.quality_pair.first < 100) { + // TODO(veluca): this formula is nonsense. 
+ loop_filter->epf_sigma_for_modular = 20.0f * (1.0f - cparams.quality_pair.first / 100); + } + if (frame_header->encoding == FrameEncoding::kModular && cparams.lossy_palette) { + loop_filter->epf_sigma_for_modular = 1.0f; + } + + return true; +} + +Status MakeFrameHeader(const CompressParams& cparams, + const ProgressiveSplitter& progressive_splitter, + const FrameInfo& frame_info, + const ImageBundle& ib, + FrameHeader* JXL_RESTRICT frame_header) { + frame_header->nonserialized_is_preview = frame_info.is_preview; + frame_header->is_last = frame_info.is_last; + frame_header->save_before_color_transform = frame_info.save_before_color_transform; + frame_header->frame_type = frame_info.frame_type; + frame_header->name = ib.name; + + progressive_splitter.InitPasses(&frame_header->passes); + + if (cparams.modular_mode) { + frame_header->encoding = FrameEncoding::kModular; + frame_header->group_size_shift = cparams.modular_group_size_shift; + } + + frame_header->chroma_subsampling = ib.chroma_subsampling; + if (ib.IsJPEG()) { + // we are transcoding a JPEG, so we don't get to choose + frame_header->encoding = FrameEncoding::kVarDCT; + frame_header->color_transform = ib.color_transform; + } else { + frame_header->color_transform = cparams.color_transform; + if (!cparams.modular_mode && + (frame_header->chroma_subsampling.MaxHShift() != 0 || frame_header->chroma_subsampling.MaxVShift() != 0)) { + return JXL_FAILURE( + "Chroma subsampling is not supported in VarDCT mode when not " + "recompressing JPEGs"); + } + } + + frame_header->flags = FrameFlagsFromParams(cparams); + // Noise is not supported in the Modular encoder for now. 
+ if (frame_header->encoding != FrameEncoding::kVarDCT) { + frame_header->UpdateFlag(false, FrameHeader::Flags::kNoise); + } + + JXL_RETURN_IF_ERROR(LoopFilterFromParams(cparams, frame_header)); + + frame_header->dc_level = frame_info.dc_level; + if (frame_header->dc_level > 2) { + // With 3 or more progressive_dc frames, the implementation does not yet + // work, see enc_cache.cc. + return JXL_FAILURE("progressive_dc > 2 is not yet supported"); + } + if (cparams.progressive_dc > 0 && (cparams.ec_resampling != 1 || cparams.resampling != 1)) { + return JXL_FAILURE("Resampling not supported with DC frames"); + } + if (cparams.resampling != 1 && cparams.resampling != 2 && cparams.resampling != 4 && cparams.resampling != 8) { + return JXL_FAILURE("Invalid resampling factor"); + } + if (cparams.ec_resampling != 1 && cparams.ec_resampling != 2 && cparams.ec_resampling != 4 && + cparams.ec_resampling != 8) { + return JXL_FAILURE("Invalid ec_resampling factor"); + } + // Resized frames. + if (frame_info.frame_type != FrameType::kDCFrame) { + frame_header->frame_origin = ib.origin; + size_t ups = 1; + if (cparams.already_downsampled) ups = cparams.resampling; + frame_header->frame_size.xsize = ib.xsize() * ups; + frame_header->frame_size.ysize = ib.ysize() * ups; + if (ib.origin.x0 != 0 || ib.origin.y0 != 0 || frame_header->frame_size.xsize != frame_header->default_xsize() || + frame_header->frame_size.ysize != frame_header->default_ysize()) { + frame_header->custom_size_or_origin = true; + } + } + // Upsampling. + frame_header->upsampling = cparams.resampling; + const std::vector& extra_channels = frame_header->nonserialized_metadata->m.extra_channel_info; + frame_header->extra_channel_upsampling.clear(); + frame_header->extra_channel_upsampling.resize(extra_channels.size(), cparams.ec_resampling); + frame_header->save_as_reference = frame_info.save_as_reference; + + // Set blending-related information. 
+ if (ib.blend || frame_header->custom_size_or_origin) { + // Set blend_channel to the first alpha channel. These values are only + // encoded in case a blend mode involving alpha is used and there are more + // than one extra channels. + size_t index = 0; + if (extra_channels.size() > 1) { + for (size_t i = 0; i < extra_channels.size(); i++) { + if (extra_channels[i].type == ExtraChannel::kAlpha) { + index = i; + break; + } + } + } + frame_header->blending_info.alpha_channel = index; + frame_header->blending_info.mode = ib.blend ? ib.blendmode : BlendMode::kReplace; + // previous frames are saved with ID 1. + frame_header->blending_info.source = 1; + for (size_t i = 0; i < extra_channels.size(); i++) { + frame_header->extra_channel_blending_info[i].alpha_channel = index; + BlendMode default_blend = ib.blendmode; + if (extra_channels[i].type != ExtraChannel::kBlack && i != index) { + // K needs to be blended, spot colors and other stuff gets added + default_blend = BlendMode::kAdd; + } + frame_header->extra_channel_blending_info[i].mode = ib.blend ? default_blend : BlendMode::kReplace; + frame_header->extra_channel_blending_info[i].source = 1; + } + } + + frame_header->animation_frame.duration = ib.duration; + + // TODO(veluca): timecode. + + return true; +} + +} // namespace + +Status EncodeFrame(const CompressParams& cparams_orig, + const FrameInfo& frame_info, + const CodecMetadata* metadata, + const ImageBundle& ib, + PassesEncoderState* passes_enc_state, + ThreadPool* pool, + BitWriter* writer, + AuxOut* aux_out, + std::string xclbinPath) { + ib.VerifyMetadata(); + passes_enc_state->special_frames.clear(); + + CompressParams cparams = cparams_orig; + + if (cparams.progressive_dc < 0) { + if (cparams.progressive_dc != -1) { + return JXL_FAILURE("Invalid progressive DC setting value (%d)", cparams.progressive_dc); + } + cparams.progressive_dc = 0; + // Enable progressive_dc for lower qualities. 
+ if (cparams.butteraugli_distance >= kMinButteraugliDistanceForProgressiveDc) { + cparams.progressive_dc = 1; + } + } + if (cparams.ec_resampling < cparams.resampling) { + cparams.ec_resampling = cparams.resampling; + } + if (cparams.resampling > 1) cparams.progressive_dc = 0; + + if (frame_info.dc_level + cparams.progressive_dc > 4) { + return JXL_FAILURE("Too many levels of progressive DC"); + } + + if (cparams.butteraugli_distance != 0 && cparams.butteraugli_distance < kMinButteraugliDistance) { + return JXL_FAILURE("Butteraugli distance is too low (%f)", cparams.butteraugli_distance); + } + if (cparams.butteraugli_distance > 0.9f && cparams.modular_mode == false && cparams.quality_pair.first == 100) { + // in case the color image is lossy, make the alpha slightly lossy too + cparams.quality_pair.first = std::max(90.f, 99.f - 0.3f * cparams.butteraugli_distance); + } + + if (ib.IsJPEG()) { + cparams.gaborish = Override::kOff; + cparams.epf = 0; + cparams.modular_mode = false; + } + + if (ib.xsize() == 0 || ib.ysize() == 0) return JXL_FAILURE("Empty image"); + + // Assert that this metadata is correctly set up for the compression params, + // this should have been done by enc_file.cc + JXL_ASSERT(metadata->m.xyb_encoded == (cparams.color_transform == ColorTransform::kXYB)); + std::unique_ptr frame_header = jxl::make_unique(metadata); + JXL_RETURN_IF_ERROR( + MakeFrameHeader(cparams, passes_enc_state->progressive_splitter, frame_info, ib, frame_header.get())); + // Check that if the codestream header says xyb_encoded, the color_transform + // matches the requirement. This is checked from the cparams here, even though + // optimally we'd be able to check this against what has actually been written + // in the main codestream header, but since ib is a const object and the data + // written to the main codestream header is (in modified form) in ib, the + // encoder cannot indicate this fact in the ib's metadata. 
+ if (cparams_orig.color_transform == ColorTransform::kXYB) { + if (frame_header->color_transform != ColorTransform::kXYB) { + return JXL_FAILURE( + "The color transform of frames must be xyb if the codestream is xyb " + "encoded"); + } + } else { + if (frame_header->color_transform == ColorTransform::kXYB) { + return JXL_FAILURE( + "The color transform of frames cannot be xyb if the codestream is " + "not xyb encoded"); + } + } + + FrameDimensions frame_dim = frame_header->ToFrameDimensions(); + + const size_t num_groups = frame_dim.num_groups; + + Image3F opsin; + const ColorEncoding& c_linear = ColorEncoding::LinearSRGB(ib.IsGray()); + std::unique_ptr metadata_linear = jxl::make_unique(); + metadata_linear->xyb_encoded = (cparams.color_transform == ColorTransform::kXYB); + metadata_linear->color_encoding = c_linear; + ImageBundle linear_storage(metadata_linear.get()); + + std::vector aux_outs; + // LossyFrameEncoder stores a reference to a std::function + // so we need to keep the std::function being referenced + // alive while lossy_frame_encoder is used. We could make resize_aux_outs a + // lambda type by making LossyFrameEncoder a template instead, but this is + // simpler. + const std::function resize_aux_outs = [&aux_outs, aux_out](size_t num_threads) -> Status { + if (aux_out != nullptr) { + size_t old_size = aux_outs.size(); + for (size_t i = num_threads; i < old_size; i++) { + aux_out->Assimilate(aux_outs[i]); + } + aux_outs.resize(num_threads); + // Each thread needs these INPUTS. Don't copy the entire AuxOut + // because it may contain stats which would be Assimilated multiple + // times below. 
+ for (size_t i = old_size; i < aux_outs.size(); i++) { + aux_outs[i].dump_image = aux_out->dump_image; + aux_outs[i].debug_prefix = aux_out->debug_prefix; + } + } + return true; + }; + + LossyFrameEncoder lossy_frame_encoder(cparams, *frame_header, passes_enc_state, pool, aux_out); + std::unique_ptr modular_frame_encoder = + jxl::make_unique(*frame_header, cparams); + + const std::vector* extra_channels = &ib.extra_channels(); + std::vector extra_channels_storage; + const ImageBundle* JXL_RESTRICT ib_or_linear; + + if (ib.IsJPEG()) { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeJPEGTranscodingData(*ib.jpeg_data, modular_frame_encoder.get(), + frame_header.get())); + } else if (!lossy_frame_encoder.State()->heuristics->HandlesColorConversion(cparams, ib) || + frame_header->encoding != FrameEncoding::kVarDCT) { + acc_host(xclbinPath, opsin, lossy_frame_encoder, ib_or_linear, pool, modular_frame_encoder, writer, aux_out, + frame_header, frame_info, cparams, &ib.extra_channels(), passes_enc_state, frame_dim, num_groups, ib, + aux_outs, resize_aux_outs); + if (frame_header->encoding == FrameEncoding::kVarDCT) { + } else if (frame_header->upsampling != 1 && !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. 
+ DownsampleImage(&opsin, frame_header->upsampling); + } + + } else { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData(&ib, &opsin, pool, modular_frame_encoder.get(), + writer, frame_header.get())); + } + + if (!ib.IsJPEG() && (!lossy_frame_encoder.State()->heuristics->HandlesColorConversion(cparams, ib) || + frame_header->encoding != FrameEncoding::kVarDCT) && + frame_header->encoding == FrameEncoding::kVarDCT) { + } else { + if (cparams.ec_resampling != 1 && !cparams.already_downsampled) { + extra_channels = &extra_channels_storage; + for (size_t i = 0; i < ib.extra_channels().size(); i++) { + extra_channels_storage.emplace_back(CopyImage(ib.extra_channels()[i])); + DownsampleImage(&extra_channels_storage.back(), cparams.ec_resampling); + } + } + // needs to happen *AFTER* VarDCT-ComputeEncodingData. + JXL_RETURN_IF_ERROR(modular_frame_encoder->ComputeEncodingData( + *frame_header, *ib.metadata(), &opsin, *extra_channels, lossy_frame_encoder.State(), pool, aux_out, + /* do_color=*/frame_header->encoding == FrameEncoding::kModular)); + + writer->AppendByteAligned(lossy_frame_encoder.State()->special_frames); + frame_header->UpdateFlag(lossy_frame_encoder.State()->shared.image_features.patches.HasAny(), + FrameHeader::kPatches); + frame_header->UpdateFlag(lossy_frame_encoder.State()->shared.image_features.splines.HasAny(), + FrameHeader::kSplines); + JXL_RETURN_IF_ERROR(WriteFrameHeader(*frame_header, writer, aux_out)); + + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + + // DC global info + DC groups + AC global info + AC groups * + // num_passes. 
+ const bool has_ac_global = true; + std::vector group_codes( + NumTocEntries(frame_dim.num_groups, frame_dim.num_dc_groups, num_passes, has_ac_global)); + const size_t global_ac_index = frame_dim.num_dc_groups + 1; + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + const auto get_output = [&](const size_t index) { return &group_codes[is_small_image ? 0 : index]; }; + auto ac_group_code = [&](size_t pass, size_t group) { + return get_output(AcGroupIndex(pass, group, frame_dim.num_groups, frame_dim.num_dc_groups, has_ac_global)); + }; + + if (frame_header->flags & FrameHeader::kPatches) { + PatchDictionaryEncoder::Encode(lossy_frame_encoder.State()->shared.image_features.patches, get_output(0), + kLayerDictionary, aux_out); + } + + if (frame_header->flags & FrameHeader::kSplines) { + EncodeSplines(lossy_frame_encoder.State()->shared.image_features.splines, get_output(0), kLayerSplines, + HistogramParams(), aux_out); + } + + if (frame_header->flags & FrameHeader::kNoise) { + EncodeNoise(lossy_frame_encoder.State()->shared.image_features.noise_params, get_output(0), kLayerNoise, + aux_out); + } + + JXL_RETURN_IF_ERROR(DequantMatricesEncodeDC(&lossy_frame_encoder.State()->shared.matrices, get_output(0), + kLayerDequantTables, aux_out)); + if (frame_header->encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.EncodeGlobalDCInfo(*frame_header, get_output(0))); + } + JXL_RETURN_IF_ERROR(modular_frame_encoder->EncodeGlobalInfo(get_output(0), aux_out)); + JXL_RETURN_IF_ERROR(modular_frame_encoder->EncodeStream(get_output(0), aux_out, kLayerModularGlobal, + ModularStreamId::Global())); + + const auto process_dc_group = [&](const int group_index, const int thread) { + AuxOut* my_aux_out = aux_out ? 
&aux_outs[thread] : nullptr; + BitWriter* output = get_output(group_index + 1); + if (frame_header->encoding == FrameEncoding::kVarDCT && !(frame_header->flags & FrameHeader::kUseDcFrame)) { + BitWriter::Allotment allotment(output, 2); + output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]); + ReclaimAndCharge(output, &allotment, kLayerDC, my_aux_out); + JXL_CHECK(modular_frame_encoder->EncodeStream(output, my_aux_out, kLayerDC, + ModularStreamId::VarDCTDC(group_index))); + } + JXL_CHECK(modular_frame_encoder->EncodeStream(output, my_aux_out, kLayerModularDcGroup, + ModularStreamId::ModularDC(group_index))); + if (frame_header->encoding == FrameEncoding::kVarDCT) { + const Rect& rect = lossy_frame_encoder.State()->shared.DCGroupRect(group_index); + size_t nb_bits = CeilLog2Nonzero(rect.xsize() * rect.ysize()); + if (nb_bits != 0) { + BitWriter::Allotment allotment(output, nb_bits); + output->Write(nb_bits, modular_frame_encoder->ac_metadata_size[group_index] - 1); + ReclaimAndCharge(output, &allotment, kLayerControlFields, my_aux_out); + } + JXL_CHECK(modular_frame_encoder->EncodeStream(output, my_aux_out, kLayerControlFields, + ModularStreamId::ACMetadata(group_index))); + } + }; + RunOnPool(pool, 0, frame_dim.num_dc_groups, resize_aux_outs, process_dc_group, "EncodeDCGroup"); + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR( + lossy_frame_encoder.EncodeGlobalACInfo(get_output(global_ac_index), modular_frame_encoder.get())); + } + + std::atomic num_errors{0}; + const auto process_group = [&](const int group_index, const int thread) { + AuxOut* my_aux_out = aux_out ? 
&aux_outs[thread] : nullptr; + + for (size_t i = 0; i < num_passes; i++) { + if (frame_header->encoding == FrameEncoding::kVarDCT) { + if (!lossy_frame_encoder.EncodeACGroup(i, group_index, ac_group_code(i, group_index), my_aux_out)) { + num_errors.fetch_add(1, std::memory_order_relaxed); + return; + } + } + // Write all modular encoded data (color?, alpha, depth, extra channels) + if (!modular_frame_encoder->EncodeStream(ac_group_code(i, group_index), my_aux_out, + kLayerModularAcGroup, + ModularStreamId::ModularAC(group_index, i))) { + num_errors.fetch_add(1, std::memory_order_relaxed); + return; + } + } + }; + RunOnPool(pool, 0, num_groups, resize_aux_outs, process_group, "EncodeGroupCoefficients"); + + // Resizing aux_outs to 0 also Assimilates the array. + static_cast(resize_aux_outs(0)); + JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0); + + for (BitWriter& bw : group_codes) { + bw.ZeroPadToByte(); // end of group. + } + + std::vector* permutation_ptr = nullptr; + std::vector permutation; + if (cparams.centerfirst && !(num_passes == 1 && num_groups == 1)) { + permutation_ptr = &permutation; + // Don't permute global DC/AC or DC. + permutation.resize(global_ac_index + 1); + std::iota(permutation.begin(), permutation.end(), 0); + std::vector ac_group_order(num_groups); + std::iota(ac_group_order.begin(), ac_group_order.end(), 0); + size_t group_dim = frame_dim.group_dim; + + // The center of the image is either given by parameters or chosen + // to be the middle of the image by default if center_x, center_y resp. + // are not provided. 
+ + int64_t imag_cx; + if (cparams.center_x != static_cast(-1)) { + JXL_RETURN_IF_ERROR(cparams.center_x < ib.xsize()); + imag_cx = cparams.center_x; + } else { + imag_cx = ib.xsize() / 2; + } + + int64_t imag_cy; + if (cparams.center_y != static_cast(-1)) { + JXL_RETURN_IF_ERROR(cparams.center_y < ib.ysize()); + imag_cy = cparams.center_y; + } else { + imag_cy = ib.ysize() / 2; + } + + // The center of the group containing the center of the image. + int64_t cx = (imag_cx / group_dim) * group_dim + group_dim / 2; + int64_t cy = (imag_cy / group_dim) * group_dim + group_dim / 2; + // This identifies in what area of the central group the center of the + // image + // lies in. + double direction = -std::atan2(imag_cy - cy, imag_cx - cx); + // This identifies the side of the central group the center of the image + // lies closest to. This can take values 0, 1, 2, 3 corresponding to left, + // bottom, right, top. + int64_t side = std::fmod((direction + 5 * kPi / 4), 2 * kPi) * 2 / kPi; + auto get_distance_from_center = [&](size_t gid) { + Rect r = passes_enc_state->shared.GroupRect(gid); + int64_t gcx = r.x0() + group_dim / 2; + int64_t gcy = r.y0() + group_dim / 2; + int64_t dx = gcx - cx; + int64_t dy = gcy - cy; + // The angle is determined by taking atan2 and adding an appropriate + // starting point depending on the side we want to start on. + double angle = std::remainder(std::atan2(dy, dx) + kPi / 4 + side * (kPi / 2), 2 * kPi); + // Concentric squares in clockwise order. 
+ return std::make_pair(std::max(std::abs(dx), std::abs(dy)), angle); + }; + std::sort(ac_group_order.begin(), ac_group_order.end(), [&](coeff_order_t a, coeff_order_t b) { + return get_distance_from_center(a) < get_distance_from_center(b); + }); + std::vector inv_ac_group_order(ac_group_order.size(), 0); + for (size_t i = 0; i < ac_group_order.size(); i++) { + inv_ac_group_order[ac_group_order[i]] = i; + } + for (size_t i = 0; i < num_passes; i++) { + size_t pass_start = permutation.size(); + for (coeff_order_t v : inv_ac_group_order) { + permutation.push_back(pass_start + v); + } + } + std::vector new_group_codes(group_codes.size()); + for (size_t i = 0; i < permutation.size(); i++) { + new_group_codes[permutation[i]] = std::move(group_codes[i]); + } + group_codes = std::move(new_group_codes); + } + + JXL_RETURN_IF_ERROR(WriteGroupOffsets(group_codes, permutation_ptr, writer, aux_out)); + writer->AppendByteAligned(group_codes); + writer->ZeroPadToByte(); // end of frame. + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/others/src/acc_enc_group.cpp b/codec/L2/demos/jxlEnc/others/src/acc_enc_group.cpp new file mode 100644 index 0000000000..090e12c212 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_enc_group.cpp @@ -0,0 +1,525 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "acc_enc_group.hpp" + +#include +#include +#include + +#include "hwy/aligned_allocator.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "xilinx/src/acc_enc_group.cpp" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_transforms-inl.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quantizer-inl.h" +#include "lib/jxl/quantizer.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// NOTE: caller takes care of extracting quant from rect of RawQuantField. +void QuantizeBlockAC(const Quantizer& quantizer, + const bool error_diffusion, + size_t c, + int32_t quant, + float qm_multiplier, + size_t quant_kind, + size_t xsize, + size_t ysize, + const float* JXL_RESTRICT block_in, + int32_t* JXL_RESTRICT block_out) { + PROFILER_FUNC; + const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c); + const float qac = quantizer.Scale() * quant; + // Not SIMD-fied for now. 
+ float thres[4] = {0.5f, 0.6f, 0.6f, 0.65f}; + if (c != 1) { + for (int i = 1; i < 4; ++i) { + thres[i] = 0.75f; + } + } + + if (!error_diffusion) { + HWY_CAPPED(float, kBlockDim) df; + HWY_CAPPED(int32_t, kBlockDim) di; + HWY_CAPPED(uint32_t, kBlockDim) du; + const auto quant = Set(df, qac * qm_multiplier); + + for (size_t y = 0; y < ysize * kBlockDim; y++) { + size_t yfix = static_cast(y >= ysize * kBlockDim / 2) * 2; + const size_t off = y * kBlockDim * xsize; + for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) { + auto thr = Zero(df); + if (xsize == 1) { + HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u}; + const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x))); + thr = IfThenElse(mask, Set(df, thres[yfix + 1]), Set(df, thres[yfix])); + } else { + // Same for all lanes in the vector. + thr = Set(df, thres[yfix + static_cast(x >= xsize * kBlockDim / 2)]); + } + + const auto q = Load(df, qm + off + x) * quant; + const auto in = Load(df, block_in + off + x); + const auto val = q * in; + const auto nzero_mask = Abs(val) >= thr; + const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val))); + Store(v, di, block_out + off + x); + } + } + return; + } + +retry: + int hfNonZeros[4] = {}; + float hfError[4] = {}; + float hfMaxError[4] = {}; + size_t hfMaxErrorIx[4] = {}; + for (size_t y = 0; y < ysize * kBlockDim; y++) { + for (size_t x = 0; x < xsize * kBlockDim; x++) { + const size_t pos = y * kBlockDim * xsize + x; + if (x < xsize && y < ysize) { + // Ensure block is initialized + block_out[pos] = 0; + continue; + } + const size_t hfix = + (static_cast(y >= ysize * kBlockDim / 2) * 2 + static_cast(x >= xsize * kBlockDim / 2)); + const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier); + float v = (std::abs(val) < thres[hfix]) ? 
0 : rintf(val); + const float error = std::abs(val) - std::abs(v); + hfError[hfix] += error; + if (hfMaxError[hfix] < error) { + hfMaxError[hfix] = error; + hfMaxErrorIx[hfix] = pos; + } + if (v != 0.0f) { + hfNonZeros[hfix] += std::abs(v); + } + block_out[pos] = static_cast(rintf(v)); + } + } + if (c != 1) return; + // TODO(veluca): include AFV? + const size_t kPartialBlockKinds = (1 << AcStrategy::Type::IDENTITY) | (1 << AcStrategy::Type::DCT2X2) | + (1 << AcStrategy::Type::DCT4X4) | (1 << AcStrategy::Type::DCT4X8) | + (1 << AcStrategy::Type::DCT8X4); + if ((1 << quant_kind) & kPartialBlockKinds) return; + float hfErrorLimit = 0.1f * (xsize * ysize) * kDCTBlockSize * 0.25f; + bool goretry = false; + for (int i = 1; i < 4; ++i) { + if (hfError[i] >= hfErrorLimit && hfNonZeros[i] <= (xsize + ysize) * 0.25f) { + if (thres[i] >= 0.4f) { + thres[i] -= 0.01f; + goretry = true; + } + } + } + if (goretry) goto retry; + for (int i = 1; i < 4; ++i) { + if (hfError[i] >= hfErrorLimit && hfNonZeros[i] == 0) { + const size_t pos = hfMaxErrorIx[i]; + if (hfMaxError[i] >= 0.4f) { + block_out[pos] = block_in[pos] > 0.0f ? 1.0f : -1.0f; + } + } + } +} + +// NOTE: caller takes care of extracting quant from rect of RawQuantField. 
+void QuantizeRoundtripYBlockAC(const Quantizer& quantizer, + const bool error_diffusion, + int32_t quant, + size_t quant_kind, + size_t xsize, + size_t ysize, + const float* JXL_RESTRICT biases, + float* JXL_RESTRICT inout, + int32_t* JXL_RESTRICT quantized) { + QuantizeBlockAC(quantizer, error_diffusion, 1, quant, 1.0f, quant_kind, xsize, ysize, inout, quantized); + + PROFILER_ZONE("enc quant adjust bias"); + const float* JXL_RESTRICT dequant_matrix = quantizer.DequantMatrix(quant_kind, 1); + + HWY_CAPPED(float, kDCTBlockSize) df; + HWY_CAPPED(int32_t, kDCTBlockSize) di; + const auto inv_qac = Set(df, quantizer.inv_quant_ac(quant)); + for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) { + const auto quant = Load(di, quantized + k); + const auto adj_quant = AdjustQuantBias(di, 1, quant, biases); + const auto dequantm = Load(df, dequant_matrix + k); + Store(adj_quant * dequantm * inv_qac, df, inout + k); + } +} + +void ComputeCoefficients(size_t group_idx, + PassesEncoderState* enc_state, + const Image3F& opsin, + Image3F* dc, + + //==========acc interface======== + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32 + //================================ + ) { + PROFILER_FUNC; + const Rect block_group_rect = enc_state->shared.BlockGroupRect(group_idx); + const Rect group_rect = enc_state->shared.GroupRect(group_idx); + const Rect cmap_rect(block_group_rect.x0() / kColorTileDimInBlocks, block_group_rect.y0() / kColorTileDimInBlocks, + DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks), + DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks)); + + const size_t xsize_blocks = block_group_rect.xsize(); + const size_t ysize_blocks = block_group_rect.ysize(); + + const 
size_t dc_stride = static_cast(dc->PixelsPerRow()); + const size_t opsin_stride = static_cast(opsin.PixelsPerRow()); + + const ImageI& full_quant_field = enc_state->shared.raw_quant_field; + const CompressParams& cparams = enc_state->cparams; + + // TODO(veluca): consider strategies to reduce this memory. + auto mem = hwy::AllocateAligned(3 * AcStrategy::kMaxCoeffArea); + auto fmem = hwy::AllocateAligned(5 * AcStrategy::kMaxCoeffArea); + float* JXL_RESTRICT scratch_space = fmem.get() + 3 * AcStrategy::kMaxCoeffArea; + { + // Only use error diffusion in Squirrel mode or slower. + const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel; + constexpr HWY_CAPPED(float, kDCTBlockSize) d; + + int32_t* JXL_RESTRICT coeffs[kMaxNumPasses][3] = {}; + size_t num_passes = enc_state->progressive_splitter.GetNumPasses(); + JXL_DASSERT(num_passes > 0); + for (size_t i = 0; i < num_passes; i++) { + // TODO(veluca): 16-bit quantized coeffs are not implemented yet. + JXL_ASSERT(enc_state->coeffs[i]->Type() == ACType::k32); + for (size_t c = 0; c < 3; c++) { + coeffs[i][c] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32; + } + } + + HWY_ALIGN float* coeffs_in = fmem.get(); + HWY_ALIGN int32_t* quantized = mem.get(); + + size_t offset = 0; + + for (size_t by = 0; by < ysize_blocks; ++by) { + const int32_t* JXL_RESTRICT row_quant_ac = block_group_rect.ConstRow(full_quant_field, by); + size_t ty = by / kColorTileDimInBlocks; + const int8_t* JXL_RESTRICT row_cmap[3] = { + cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty), nullptr, + cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty), + }; + const float* JXL_RESTRICT opsin_rows[3] = { + group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim), group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim), + group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim), + }; + float* JXL_RESTRICT dc_rows[3] = { + block_group_rect.PlaneRow(dc, 0, by), block_group_rect.PlaneRow(dc, 1, by), + block_group_rect.PlaneRow(dc, 2, by), + }; + 
AcStrategyRow ac_strategy_row = enc_state->shared.ac_strategy.ConstRow(block_group_rect, by); + for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); tx++) { + const auto x_factor = Set(d, enc_state->shared.cmap.YtoXRatio(row_cmap[0][tx])); + const auto b_factor = Set(d, enc_state->shared.cmap.YtoBRatio(row_cmap[2][tx])); + for (size_t bx = tx * kColorTileDimInBlocks; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; + ++bx) { + const AcStrategy acs = ac_strategy_row[bx]; + if (!acs.IsFirstBlock()) continue; + + size_t xblocks = acs.covered_blocks_x(); + size_t yblocks = acs.covered_blocks_y(); + + CoefficientLayout(&yblocks, &xblocks); // QC: xblocks and yblocks are + // updated inside. Calculate + // how may horizontal 8x8 + // blocks (xblocks) covered by + // the ACstrategy and vertical + // 8x8 blocks (yblocks) + // covered by the acs. + + size_t size = kDCTBlockSize * xblocks * yblocks; + + // DCT Y channel, roundtrip-quantize it and set DC. + const int32_t quant_ac = row_quant_ac[bx]; + // TransformFromPixels(acs.Strategy(), opsin_rows[1] + bx * + // kBlockDim, + // opsin_stride, coeffs_in + size, + // scratch_space); + + // DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size, + // dc_rows[1] + bx, dc_stride); + + size_t tile_xsize = (xsize + 63) / 64 * 64; + size_t tile_ysize = (ysize + 63) / 64 * 64; + float* coef_dct = coeffs_in + size; + size_t block_cnt8x8 = (block_group_rect.y0() + by) * (tile_xsize / 8) + block_group_rect.x0() + bx; + size_t block_cnt16x16 = + (block_group_rect.y0() + by) / 2 * (tile_xsize / 16) + (block_group_rect.x0() + bx) / 2; + size_t block_cnt32x32 = + (block_group_rect.y0() + by) / 4 * (tile_xsize / 32) + (block_group_rect.x0() + bx) / 4; + +#ifdef XLNX_QC_DEBUG_ENC_GROUP + if (acs.RawStrategy() == 0) { + std::cout << "========================debug===================== " + "convered blocks: " + << acs.covered_blocks_x() << " tile_xsize: " << tile_xsize + << " bx: " << block_group_rect.x0() << " " 
<< bx << " by: " << block_group_rect.y0() + << " " << by << std::endl; + for (int i = 0; i < 64; i++) { + std::cout << std::setw(15) << coef_dct[i] << " "; + } + std::cout << std::endl; + for (int i = 0; i < 64; i++) { + std::cout << std::setw(15) << dct8x8[1][64 * block_cnt8x8 + i] << " "; + } + std::cout << std::endl; + for (int i = 0; i < 64; i++) { + if (coef_dct[i] != dct8x8[1][64 * block_cnt8x8 + i]) std::cout << "!!!"; + } + std::cout << std::endl; + } +#endif + + for (int i = 0; i < 32 * 32; i++) { + if (acs.RawStrategy() == 0) { + if (i < 64) coef_dct[i] = dct8x8[1][64 * block_cnt8x8 + i]; + } else if (acs.RawStrategy() == 1) { + if (i < 64) coef_dct[i] = dctIDT[1][64 * block_cnt8x8 + i]; + } else if (acs.RawStrategy() == 2) { + if (i < 64) coef_dct[i] = dct2x2[1][64 * block_cnt8x8 + i]; + } else if (acs.RawStrategy() == 3) { + if (i < 64) coef_dct[i] = dct4x4[1][64 * block_cnt8x8 + i]; + } else if (acs.RawStrategy() == 4) { + if (i < 256) coef_dct[i] = dct16x16[1][16 * 16 * block_cnt16x16 + i]; + } else if (acs.RawStrategy() == 5) { + coef_dct[i] = dct32x32[1][32 * 32 * block_cnt32x32 + i]; + } else { + std::cout << "unsupported DCT" << std::endl; + } + } + + float* coef_dc = dc_rows[1] + bx; + +#ifdef XLNX_QC_DEBUG_ENC_GROUP_DC + if (acs.RawStrategy() == 5) { + std::cout << "========================debug===================== " + "convered blocks: " + << acs.covered_blocks_x() << " tile_xsize: " << tile_xsize + << " bx: " << block_group_rect.x0() << " " << bx << " by: " << block_group_rect.y0() + << " " << by << " dc_stride: " << dc_stride << std::endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + std::cout << std::setw(15) << coef_dc[i * dc_stride + j] << " "; + } + } + std::cout << std::endl; + for (int i = 0; i < 16; i++) { + std::cout << std::setw(15) << dc32x32[1][16 * block_cnt32x32 + i] << " "; + } + std::cout << std::endl; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + if (coef_dc[i * dc_stride + j] != 
dc32x32[1][16 * block_cnt32x32 + i * 4 + j]) + std::cout << "!!!"; + } + } + std::cout << std::endl; + } +#endif + + if (acs.RawStrategy() == 0) { + coef_dc[0] = dc8x8[1][block_cnt8x8]; + } else if (acs.RawStrategy() == 1) { + coef_dc[0] = dcIDT[1][block_cnt8x8]; + } else if (acs.RawStrategy() == 2) { + coef_dc[0] = dc2x2[1][block_cnt8x8]; + } else if (acs.RawStrategy() == 3) { + coef_dc[0] = dc4x4[1][block_cnt8x8]; + } else if (acs.RawStrategy() == 4) { + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + coef_dc[i * dc_stride + j] = dc16x16[1][4 * block_cnt16x16 + i * 2 + j]; + } + } + } else if (acs.RawStrategy() == 5) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + coef_dc[i * dc_stride + j] = dc32x32[1][16 * block_cnt32x32 + i * 4 + j]; + } + } + } else { + std::cout << "unsupported DCFromLowFREQ" << std::endl; + } + + QuantizeRoundtripYBlockAC(enc_state->shared.quantizer, error_diffusion, quant_ac, acs.RawStrategy(), + xblocks, yblocks, kDefaultQuantBias, coeffs_in + size, quantized + size); + + // DCT X and B channels + for (size_t c : {0, 2}) { + // TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx + // * kBlockDim, + // opsin_stride, coeffs_in + c * + // size, scratch_space); + coef_dct = coeffs_in + c * size; + for (int i = 0; i < 32 * 32; i++) { + if (acs.RawStrategy() == 0) { + if (i < 64) coef_dct[i] = dct8x8[c][64 * block_cnt8x8 + i]; + } else if (acs.RawStrategy() == 1) { + if (i < 64) coef_dct[i] = dctIDT[c][64 * block_cnt8x8 + i]; + } else if (acs.RawStrategy() == 2) { + if (i < 64) coef_dct[i] = dct2x2[c][64 * block_cnt8x8 + i]; + } else if (acs.RawStrategy() == 3) { + if (i < 64) coef_dct[i] = dct4x4[c][64 * block_cnt8x8 + i]; + } else if (acs.RawStrategy() == 4) { + if (i < 256) coef_dct[i] = dct16x16[c][16 * 16 * block_cnt16x16 + i]; + } else if (acs.RawStrategy() == 5) { + coef_dct[i] = dct32x32[c][32 * 32 * block_cnt32x32 + i]; + } else { + std::cout << "unsupported DCT" << std::endl; + } + } + } + + // 
Unapply color correlation + for (size_t k = 0; k < size; k += Lanes(d)) { + const auto in_x = Load(d, coeffs_in + k); + const auto in_y = Load(d, coeffs_in + size + k); + const auto in_b = Load(d, coeffs_in + 2 * size + k); + const auto out_x = in_x - x_factor * in_y; + const auto out_b = in_b - b_factor * in_y; + Store(out_x, d, coeffs_in + k); + Store(out_b, d, coeffs_in + 2 * size + k); + } + + // Quantize X and B channels and set DC. + for (size_t c : {0, 2}) { + QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c, quant_ac, + c == 0 ? enc_state->x_qm_multiplier : enc_state->b_qm_multiplier, + acs.RawStrategy(), xblocks, yblocks, coeffs_in + c * size, + quantized + c * size); + /* DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * + size, dc_rows[c] + bx, dc_stride);*/ + coef_dc = dc_rows[c] + bx; + if (acs.RawStrategy() == 0) { + coef_dc[0] = dc8x8[c][block_cnt8x8]; + } else if (acs.RawStrategy() == 1) { + coef_dc[0] = dcIDT[c][block_cnt8x8]; + } else if (acs.RawStrategy() == 2) { + coef_dc[0] = dc2x2[c][block_cnt8x8]; + } else if (acs.RawStrategy() == 3) { + coef_dc[0] = dc4x4[c][block_cnt8x8]; + } else if (acs.RawStrategy() == 4) { + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + coef_dc[i * dc_stride + j] = dc16x16[c][4 * block_cnt16x16 + i * 2 + j]; + } + } + } else if (acs.RawStrategy() == 5) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + coef_dc[i * dc_stride + j] = dc32x32[c][16 * block_cnt32x32 + i * 4 + j]; + } + } + } else { + std::cout << "unsupported DCFromLowFREQ" << std::endl; + } + } + enc_state->progressive_splitter.SplitACCoefficients(quantized, size, acs, bx, by, offset, coeffs); + offset += size; + } + } + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ComputeCoefficients); +void ComputeCoefficients(size_t group_idx, + PassesEncoderState* enc_state, 
+ const Image3F& opsin, + Image3F* dc, + //==========acc interface======== + size_t xsize, + size_t ysize, + std::vector >& dctIDT, + std::vector >& dct2x2, + std::vector >& dct4x4, + std::vector >& dct8x8, + std::vector >& dct16x16, + std::vector >& dct32x32, + + std::vector >& dcIDT, + std::vector >& dc2x2, + std::vector >& dc4x4, + std::vector >& dc8x8, + std::vector >& dc16x16, + std::vector >& dc32x32 + //================================ + ) { + return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin, dc, xsize, ysize, dctIDT, dct2x2, + dct4x4, dct8x8, dct16x16, dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, + dc16x16, dc32x32); +} + +Status EncodeGroupTokenizedCoefficients(size_t group_idx, + size_t pass_idx, + size_t histogram_idx, + const PassesEncoderState& enc_state, + BitWriter* writer, + AuxOut* aux_out) { + // Select which histogram to use among those of the current pass. + const size_t num_histograms = enc_state.shared.num_histograms; + // num_histograms is 0 only for lossless. + JXL_ASSERT(num_histograms == 0 || histogram_idx < num_histograms); + size_t histo_selector_bits = CeilLog2Nonzero(num_histograms); + + if (histo_selector_bits != 0) { + BitWriter::Allotment allotment(writer, histo_selector_bits); + writer->Write(histo_selector_bits, histogram_idx); + ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out); + } + WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx], enc_state.passes[pass_idx].codes, + enc_state.passes[pass_idx].context_map, writer, kLayerACTokens, aux_out); + + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/others/src/acc_init_histogram.cpp b/codec/L2/demos/jxlEnc/others/src/acc_init_histogram.cpp new file mode 100644 index 0000000000..cf426cded0 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/acc_init_histogram.cpp @@ -0,0 +1,115 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef ACC_INIT_HISTOGRAM_CPP +#define ACC_INIT_HISTOGRAM_CPP + +#include "acc_init_histogram.hpp" + +namespace jxl { +bool acc_InitHistogram(std::vector& histograms, std::vector >& tokens) { + size_t total_tokens = 0; + HybridUintConfig uint_config; // Default config for clustering. + + for (size_t i = 0; i < tokens.size(); ++i) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token token = tokens[i][j]; + total_tokens++; + uint32_t tok, nbits, bits; + uint_config.Encode(token.value, &tok, &nbits, &bits); + tok += 0; + histograms[token.context].Add(tok); + } + } + bool use_prefix_code = total_tokens < 100; + return false; +} + +void acc_ANSinitHistogram(LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& frame_header, + + std::vector& params, + bool do_once[5], + + std::vector >& tokens0, + std::vector >& tokens1, + std::vector >& tokens2, + std::vector >& tokens3, + + char* do_prefix_out, + std::vector& largest_idx, + std::vector >& nonempty_histograms, + std::vector >& histograms_) { + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + PassesSharedState& shared = enc_state_->shared; + std::vector& group_caches_ = lossy_frame_encoder.get_group_cashes(); + + group_caches_.resize(1); + for (int group_index = 0; group_index < shared.frame_dim.num_groups; group_index++) { + // Tokenize coefficients. + const Rect rect = shared.BlockGroupRect(group_index); + for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); idx_pass++) { + JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32); + const int32_t* JXL_RESTRICT ac_rows[3] = { + enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32, + }; + // Ensure group cache is initialized. 
+ group_caches_[0].InitOnce(); + TokenizeCoefficients(&shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, ac_rows, + shared.ac_strategy, frame_header->chroma_subsampling, &group_caches_[0].num_nzeroes, + &enc_state_->passes[idx_pass].ac_tokens[group_index], enc_state_->shared.quant_dc, + enc_state_->shared.raw_quant_field, enc_state_->shared.block_ctx_map); + } + }; + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + + std::vector >& tokens = tokens0; + if (i == 0) { + tokens = tokens0; + } else if (i == 1) { + tokens = tokens1; + } else if (i == 2) { + tokens = tokens2; + } else if (i == 3) { + tokens = tokens3; + } else if (i == 4) { + tokens = enc_state_->passes[0].ac_tokens; + } + + bool use_prefix_code = acc_InitHistogram(histograms_[i], tokens); + + do_prefix_out[i] = (char)use_prefix_code; + + int count = 0; + for (int j = 0; j < histograms_[i].size(); j++) { + count += histograms_[i][j].data_.size(); + } + + if (histograms_[i].size() > 1) { + size_t max_histograms = std::min(kClustersLimit, params[i].max_histograms); + + largest_idx[i] = 0; + nonempty_histograms[i].reserve(histograms_[i].size()); + for (size_t j = 0; j < histograms_[i].size(); j++) { + if (histograms_[i][j].total_count_ == 0) continue; + + if (histograms_[i][j].total_count_ > histograms_[i][largest_idx[i]].total_count_) { + largest_idx[i] = j; + } + nonempty_histograms[i].push_back(j); + } + + largest_idx[i] = std::find(nonempty_histograms[i].begin(), nonempty_histograms[i].end(), largest_idx[i]) - + nonempty_histograms[i].begin(); + } + } +} +} // namespace jxl + +#endif diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_host.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_host.cpp new file mode 100644 index 0000000000..c4c5a60e2c --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_host.cpp @@ -0,0 +1,308 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "acc_host.hpp" + +namespace jxl { +void FindBestDequantMatrices(const CompressParams& cparams, + const Image3F& opsin, + ModularFrameEncoder* modular_frame_encoder, + DequantMatrices* dequant_matrices) { + // TODO(veluca): quant matrices for no-gaborish. + // TODO(veluca): heuristics for in-bitstream quant tables. + *dequant_matrices = DequantMatrices(); + if (cparams.max_error_mode) { + // Set numerators of all quantization matrices to constant values. + float weights[3][1] = { + {1.0f / cparams.max_error[0]}, {1.0f / cparams.max_error[1]}, {1.0f / cparams.max_error[2]}}; + DctQuantWeightParams dct_params(weights); + std::vector encodings(DequantMatrices::kNum, QuantEncoding::DCT(dct_params)); + DequantMatricesSetCustom(dequant_matrices, encodings, modular_frame_encoder); + float dc_weights[3] = {1.0f / cparams.max_error[0], 1.0f / cparams.max_error[1], 1.0f / cparams.max_error[2]}; + DequantMatricesSetCustomDC(dequant_matrices, dc_weights); + } +} + +bool DefaultEncoderHeuristics::HandlesColorConversion(const CompressParams& cparams, const ImageBundle& ib) { + return cparams.noise != Override::kOn && cparams.patches != Override::kOn && + cparams.speed_tier >= SpeedTier::kWombat && cparams.resampling == 1 && + cparams.color_transform == ColorTransform::kXYB && !cparams.modular_mode && !ib.HasAlpha(); +} + +Status acc_host(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + const ImageBundle* JXL_RESTRICT ib_or_linear, + ThreadPool* pool, + std::unique_ptr& modular_frame_encoder, + BitWriter* writer, + AuxOut* aux_out, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + CompressParams cparams, + const std::vector* extra_channels, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + const size_t num_groups, + const ImageBundle& ib, + std::vector& aux_outs, + const std::function& 
resize_aux_outs) { + acc_phase1(opsin, lossy_frame_encoder, cparams, frame_header, frame_info, ib_or_linear, ib, aux_out, pool); + + acc_phase2(xclbinPath, opsin, lossy_frame_encoder, modular_frame_encoder, cparams, frame_header, extra_channels, + ib_or_linear, ib, pool, aux_out); + + acc_phase3(xclbinPath, opsin, lossy_frame_encoder, modular_frame_encoder, cparams, frame_header, passes_enc_state, + frame_dim, writer, num_groups, aux_out, pool, aux_outs, ib, resize_aux_outs); + + return true; +} + +Status DefaultEncoderHeuristics::LossyFrameHeuristics(PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, + Image3F* opsin, + ThreadPool* pool, + AuxOut* aux_out) { + PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented"); + + CompressParams& cparams = enc_state->cparams; + PassesSharedState& shared = enc_state->shared; + + // Compute parameters for noise synthesis. + if (shared.frame_header.flags & FrameHeader::kNoise) { + PROFILER_ZONE("enc GetNoiseParam"); + if (cparams.photon_noise_iso > 0) { + shared.image_features.noise_params = + SimulatePhotonNoise(opsin->xsize(), opsin->ysize(), cparams.photon_noise_iso); + } else { + // Don't start at zero amplitude since adding noise is expensive -- it + // significantly slows down decoding, and this is unlikely to + // completely go away even with advanced optimizations. After the + // kNoiseModelingRampUpDistanceRange we have reached the full level, + // i.e. noise is no longer represented by the compressed image, so we + // can add full noise by the noise modeling itself. 
+ static const float kNoiseModelingRampUpDistanceRange = 0.6; + static const float kNoiseLevelAtStartOfRampUp = 0.25; + static const float kNoiseRampupStart = 1.0; + // TODO(user) test and properly select quality_coef with smooth + // filter + float quality_coef = 1.0f; + const float rampup = (cparams.butteraugli_distance - kNoiseRampupStart) / kNoiseModelingRampUpDistanceRange; + if (rampup < 1.0f) { + quality_coef = kNoiseLevelAtStartOfRampUp + (1.0f - kNoiseLevelAtStartOfRampUp) * rampup; + } + if (rampup < 0.0f) { + quality_coef = kNoiseRampupStart; + } + if (!GetNoiseParameter(*opsin, &shared.image_features.noise_params, quality_coef)) { + shared.frame_header.flags &= ~FrameHeader::kNoise; + } + } + } + if (enc_state->shared.frame_header.upsampling != 1 && !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. + DownsampleImage(opsin, cparams.resampling); + PadImageToBlockMultipleInPlace(opsin); + } + + const FrameDimensions& frame_dim = enc_state->shared.frame_dim; + size_t target_size = TargetSize(cparams, frame_dim); + size_t opsin_target_size = target_size; + if (cparams.target_size > 0 || cparams.target_bitrate > 0.0) { + cparams.target_size = opsin_target_size; + } else if (cparams.butteraugli_distance < 0) { + return JXL_FAILURE("Expected non-negative distance"); + } + +#ifndef XLNX_DISABLE_BLK_DICT + // Find and subtract splines. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + shared.image_features.splines = FindSplines(*opsin); + JXL_RETURN_IF_ERROR(shared.image_features.splines.SubtractFrom(opsin, shared.cmap)); + } + + // Find and subtract patches/dots. 
+ if (ApplyOverride(cparams.patches, cparams.speed_tier <= SpeedTier::kSquirrel)) { + FindBestPatchDictionary(*opsin, enc_state, pool, aux_out); + PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin); + } +#endif + + static const float kAcQuant = 0.79f; + const float quant_dc = InitialQuantDC(cparams.butteraugli_distance); + Quantizer& quantizer = enc_state->shared.quantizer; + // We don't know the quant field yet, but for computing the global scale + // assuming that it will be the same as for Falcon mode is good enough. + quantizer.ComputeGlobalScaleAndQuant(quant_dc, kAcQuant / cparams.butteraugli_distance, 0); + + // TODO(veluca): we can now run all the code from here to FindBestQuantizer + // (excluded) one rect at a time. Do that. + + // Dependency graph: + // + // input: either XYB or input image + // + // input image -> XYB [optional] + // XYB -> initial quant field + // XYB -> Gaborished XYB + // Gaborished XYB -> CfL1 + // initial quant field, Gaborished XYB, CfL1 -> ACS + // initial quant field, ACS, Gaborished XYB -> EPF control field + // initial quant field -> adjusted initial quant field + // adjusted initial quant field, ACS -> raw quant field + // raw quant field, ACS, Gaborished XYB -> CfL2 + // + // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field. + + ArControlFieldHeuristics ar_heuristics; + AcStrategyHeuristics acs_heuristics; + CfLHeuristics cfl_heuristics; + + if (!opsin->xsize()) { + JXL_ASSERT(HandlesColorConversion(cparams, *original_pixels)); + *opsin = Image3F(RoundUpToBlockDim(original_pixels->xsize()), RoundUpToBlockDim(original_pixels->ysize())); + opsin->ShrinkTo(original_pixels->xsize(), original_pixels->ysize()); + ToXYB(*original_pixels, pool, opsin, /*linear=*/nullptr); + PadImageToBlockMultipleInPlace(opsin); + } + + // Compute an initial estimate of the quantization field. + // Call InitialQuantField only in Hare mode or slower. 
Otherwise, rely + // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon + // mode. + if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) { + enc_state->initial_quant_field = ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + float q = cparams.uniform_quant > 0 ? cparams.uniform_quant : kAcQuant / cparams.butteraugli_distance; + FillImage(q, &enc_state->initial_quant_field); + } else { + // Call this here, as it relies on pre-gaborish values. + float butteraugli_distance_for_iqf = cparams.butteraugli_distance; + if (!shared.frame_header.loop_filter.gab) { + butteraugli_distance_for_iqf *= 0.73f; + } + enc_state->initial_quant_field = InitialQuantField(butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, + 1.0f, &enc_state->initial_quant_masking); + } + + // TODO(veluca): do something about animations. + + // Apply inverse-gaborish. + if (shared.frame_header.loop_filter.gab) { + GaborishInverse(opsin, 0.9908511000000001f, pool); + } + + cfl_heuristics.Init(*opsin); + acs_heuristics.Init(*opsin, enc_state); + ar_heuristics.PrepareForThreads(/*num_threads*/ 1); + cfl_heuristics.PrepareForThreads(/*num_threads*/ 1); + + // auto process_tile = [&](size_t tid, size_t thread) { + for (int tid = 0; tid < DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(enc_state->shared.frame_dim.ysize_blocks, kEncTileDimInBlocks); + tid++) { + size_t thread = 0; + size_t n_enc_tiles = DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, enc_state->shared.frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, enc_state->shared.frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + + // For speeds up to Wombat, we only compute 
the color correlation map + // once we know the transform type and the quantization map. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + // cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices, + // /*ac_strategy=*/nullptr, + // /*quantizer=*/nullptr, /*fast=*/false, thread, + // &enc_state->shared.cmap); + } + +// Choose block sizes. +// acs_heuristics.ProcessRect(r); + +// Choose amount of post-processing smoothing. +// TODO(veluca): should this go *after* AdjustQuantField? +#ifndef XLNX_DISABLE_ARC + ar_heuristics.RunRect(r, *opsin, enc_state, thread); +#else + ImageB* JXL_RESTRICT epf_sharpness = &enc_state->shared.epf_sharpness; + FillPlane(static_cast(4), epf_sharpness, r); +#endif + // Always set the initial quant field, so we can compute the CfL map with + // more accuracy. The initial quant field might change in slower modes, but + // adjusting the quant field with butteraugli when all the other encoding + // parameters are fixed is likely a more reliable choice anyway. + AdjustQuantField(enc_state->shared.ac_strategy, r, &enc_state->initial_quant_field); + quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r, &enc_state->shared.raw_quant_field); + +// Compute a non-default CfL map if we are at Hare speed, or slower. 
+#ifndef XLNX_DISABLE_2NDCMP + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeTile( + r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy, &enc_state->shared.quantizer, + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread, &enc_state->shared.cmap); + } +#endif + }; + /* RunOnPool(pool, 0, DivCeil(enc_state->shared.frame_dim.xsize_blocks, + kEncTileDimInBlocks) * + DivCeil(enc_state->shared.frame_dim.ysize_blocks, + kEncTileDimInBlocks), + [&](const size_t num_threads) { + ar_heuristics.PrepareForThreads(num_threads); + cfl_heuristics.PrepareForThreads(num_threads); + return true; + }, + process_tile, "Enc Heuristics");*/ + + acs_heuristics.Finalize(aux_out); + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat, &enc_state->shared.cmap); + } + + FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder, &enc_state->shared.matrices); + + // Refine quantization levels. + FindBestQuantizer(original_pixels, *opsin, enc_state, pool, aux_out); + + // Choose a context model that depends on the amount of quantization for AC. 
+ if (cparams.speed_tier < SpeedTier::kFalcon) { + FindBestBlockEntropyModel(*enc_state); + } + +#ifdef XLNX_DEBUG_CMAP + std::cout << "=========================================" << std::endl; + std::cout << "ColorMap info: " << std::endl; + ImageSB* JXL_RESTRICT tmp_map = &enc_state->shared.cmap.ytox_map; + int32_t dc = enc_state->shared.cmap.GetYToXDC(); + std::cout << "Y to X dc: " << dc << std::endl; + for (int i = 0; i < tmp_map->ysize(); i++) { + int8_t* JXL_RESTRICT row_out = tmp_map->Row(i); + for (int j = 0; j < tmp_map->xsize(); j++) { + std::cout << (int)row_out[j] << " "; + } + std::cout << std::endl; + } + + tmp_map = &enc_state->shared.cmap.ytox_map; + dc = enc_state->shared.cmap.GetYToBDC(); + std::cout << "Y to B dc: " << dc << std::endl; + for (int i = 0; i < tmp_map->ysize(); i++) { + int8_t* JXL_RESTRICT row_out = tmp_map->Row(i); + for (int j = 0; j < tmp_map->xsize(); j++) { + std::cout << (int)row_out[j] << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif + + return true; +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase1.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase1.cpp new file mode 100644 index 0000000000..a37f251c20 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase1.cpp @@ -0,0 +1,276 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef HLS_KERNEL1_CPP +#define HLS_KERNEL1_CPP + +#include "acc_phase1.hpp" + +namespace jxl { +namespace { +// Invisible (alpha = 0) pixels tend to be a mess in optimized PNGs. +// Since they have no visual impact whatsoever, we can replace them with +// something that compresses better and reduces artifacts near the edges. This +// does some kind of smooth stuff that seems to work. 
+// Replace invisible pixels with a weighted average of the pixel to the left,
+// the pixel to the topright, and non-invisible neighbours.
+// Produces downward-blurry smears, with in the upwards direction only a 1px
+// edge duplication but not more. It would probably be better to smear in all
+// directions. That requires an alpha-weighed convolution with a large enough
+// kernel though, which might be overkill...
+//
+// Details of the implementation below:
+//  - Only samples whose alpha is exactly 0 are touched; all three planes of
+//    `image` are processed independently, in place.
+//  - With `lossless` set, invisible samples are simply zeroed.
+//  - Otherwise the left sample always contributes weight 1 (plus 1 more when it
+//    is visible), the top-right sample always contributes weight 1, and each
+//    visible right / top-right / bottom-right / top / bottom neighbour
+//    contributes weight 2. The sum is divided by the total weight `d` only
+//    when d > 1.
+//  - Because rows are rewritten in place, the left and upper neighbours read
+//    here may already hold values produced by this function.
+void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) {
+    for (size_t c = 0; c < 3; ++c) {
+        for (size_t y = 0; y < image->ysize(); ++y) {
+            // prow/nrow and pa/na are the previous/next image and alpha rows;
+            // they are nullptr at the top/bottom border and guarded below.
+            float* JXL_RESTRICT row = image->PlaneRow(c, y);
+            const float* JXL_RESTRICT prow = (y > 0 ? image->PlaneRow(c, y - 1) : nullptr);
+            const float* JXL_RESTRICT nrow = (y + 1 < image->ysize() ? image->PlaneRow(c, y + 1) : nullptr);
+            const float* JXL_RESTRICT a = alpha.Row(y);
+            const float* JXL_RESTRICT pa = (y > 0 ? alpha.Row(y - 1) : nullptr);
+            const float* JXL_RESTRICT na = (y + 1 < image->ysize() ? alpha.Row(y + 1) : nullptr);
+            for (size_t x = 0; x < image->xsize(); ++x) {
+                if (a[x] == 0) {
+                    if (lossless) {
+                        row[x] = 0;
+                        continue;
+                    }
+                    // d accumulates the total weight; row[x] accumulates the weighted sum.
+                    float d = 0.f;
+                    row[x] = 0;
+                    if (x > 0) {
+                        row[x] += row[x - 1];
+                        d++;
+                        if (a[x - 1] > 0.f) {
+                            row[x] += row[x - 1];
+                            d++;
+                        }
+                    }
+                    if (x + 1 < image->xsize()) {
+                        if (y > 0) {
+                            row[x] += prow[x + 1];
+                            d++;
+                        }
+                        if (a[x + 1] > 0.f) {
+                            row[x] += 2.f * row[x + 1];
+                            d += 2.f;
+                        }
+                        if (y > 0 && pa[x + 1] > 0.f) {
+                            row[x] += 2.f * prow[x + 1];
+                            d += 2.f;
+                        }
+                        if (y + 1 < image->ysize() && na[x + 1] > 0.f) {
+                            row[x] += 2.f * nrow[x + 1];
+                            d += 2.f;
+                        }
+                    }
+                    if (y > 0 && pa[x] > 0.f) {
+                        row[x] += 2.f * prow[x];
+                        d += 2.f;
+                    }
+                    if (y + 1 < image->ysize() && na[x] > 0.f) {
+                        row[x] += 2.f * nrow[x];
+                        d += 2.f;
+                    }
+                    // A total weight of 0 or 1 needs no normalisation.
+                    if (d > 1.f) row[x] /= d;
+                }
+            }
+        }
+    }
+}
+} // namespace
+
+// Phase 1 of the host encoder flow: fills `opsin` from `ib` and, for VarDCT
+// frames, prepares the shared encoder state up to and including inverse
+// Gaborish.
+//
+// Steps, in order:
+//   1. Allocate `opsin` at block-rounded size and convert `ib` into it (ToXYB
+//      when the frame uses XYB and the input needs a colour transform,
+//      otherwise a plain copy of ib.color()).
+//   2. Optionally simplify invisible (alpha == 0) pixels.
+//   3. Hand `opsin` to aux_out for inspection when aux_out is non-null.
+//   4. VarDCT only: pad, pick x_qm_scale, estimate noise, optionally
+//      downsample, validate distance / target size, subtract splines and
+//      patches (unless XLNX_DISABLE_BLK_DICT), compute the global quant scale,
+//      build the initial quant field and apply inverse Gaborish.
+//
+// Returns a failure Status from InspectImage3F or spline subtraction, or when
+// neither target size nor bitrate is set and the distance is negative;
+// otherwise true.
+//
+// NOTE(review): `ib_or_linear` is a pointer passed by value and is reassigned
+// below, so the caller never observes the result of ToXYB. `cparams` is also a
+// by-value copy, so the write to cparams.target_size does not reach the caller
+// either. Confirm whether later phases are meant to see these values.
+Status acc_phase1(Image3F& opsin,
+                  LossyFrameEncoder& lossy_frame_encoder,
+                  CompressParams cparams,
+                  std::unique_ptr& frame_header,
+                  const FrameInfo& frame_info,
+                  const ImageBundle* JXL_RESTRICT ib_or_linear,
+                  const ImageBundle& ib,
+                  AuxOut* aux_out,
+                  ThreadPool* pool) {
+    // Scratch bundle (and its metadata) that ToXYB may fill with linear sRGB.
+    // Both are locals and are destroyed when this function returns.
+    const ColorEncoding& c_linear = ColorEncoding::LinearSRGB(ib.IsGray());
+    std::unique_ptr metadata_linear = jxl::make_unique();
+    metadata_linear->xyb_encoded = (cparams.color_transform == ColorTransform::kXYB);
+    metadata_linear->color_encoding = c_linear;
+    ImageBundle linear_storage(metadata_linear.get());
+
+    // Allocating a large enough image avoids a copy when padding.
+    opsin = Image3F(RoundUpToBlockDim(ib.xsize()), RoundUpToBlockDim(ib.ysize()));
+    opsin.ShrinkTo(ib.xsize(), ib.ysize());
+
+    const bool want_linear =
+        frame_header->encoding == FrameEncoding::kVarDCT && cparams.speed_tier <= SpeedTier::kKitten;
+    // Local reassignment only -- see the NOTE(review) on the function header.
+    ib_or_linear = &ib;
+
+    if (frame_header->color_transform == ColorTransform::kXYB && frame_info.ib_needs_color_transform) {
+        // linear_storage would only be used by the Butteraugli loop (passing
+        // linear sRGB avoids a color conversion there). Otherwise, don't
+        // fill it to reduce memory usage.
+        ib_or_linear = ToXYB(ib, pool, &opsin, want_linear ? &linear_storage : nullptr);
+    } else { // RGB or YCbCr: don't do anything (forward YCbCr is not
+             // implemented, this is only used when the input is already in
+             // YCbCr)
+        // If encoding a special DC or reference frame, don't do anything:
+        // input is already in XYB.
+        CopyImageTo(ib.color(), &opsin);
+    }
+    bool lossless = (frame_header->encoding == FrameEncoding::kModular && cparams.quality_pair.first == 100);
+    if (ib.HasAlpha() && !ib.AlphaIsPremultiplied() && !ApplyOverride(cparams.keep_invisible, lossless) &&
+        cparams.ec_resampling == cparams.resampling) {
+        // simplify invisible pixels
+        SimplifyInvisible(&opsin, ib.alpha(), lossless);
+        if (want_linear) {
+            // Casts away constness to rewrite the colour planes of whatever
+            // bundle ib_or_linear currently points at.
+            SimplifyInvisible(const_cast(&ib_or_linear->color()), ib.alpha(), lossless);
+        }
+    }
+    if (aux_out != nullptr) {
+        JXL_RETURN_IF_ERROR(aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin));
+    }
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+        PadImageToBlockMultipleInPlace(&opsin);
+        PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State();
+        // std::vector& group_caches_ =
+        // lossy_frame_encoder.get_group_cashes();
+
+        JXL_ASSERT((opsin.xsize() % kBlockDim) == 0 && (opsin.ysize() % kBlockDim) == 0);
+        PassesSharedState& shared = enc_state_->shared;
+
+        // x_qm_scale starts at 1 and grows by one for every threshold that the
+        // Butteraugli distance exceeds (skipped in max-error mode).
+        if (!enc_state_->cparams.max_error_mode) {
+            float x_qm_scale_steps[3] = {0.65f, 1.25f, 9.0f};
+            shared.frame_header.x_qm_scale = 1;
+            for (float x_qm_scale_step : x_qm_scale_steps) {
+                if (enc_state_->cparams.butteraugli_distance > x_qm_scale_step) {
+                    shared.frame_header.x_qm_scale++;
+                }
+            }
+        }
+
+        // Pointer alias so the code below, written against Image3F*, can be
+        // used unchanged.
+        Image3F* opsin_ = &opsin;
+        // CompressParams& cparams = enc_state->cparams;
+        // PassesSharedState& shared = enc_state->shared;
+
+        // Compute parameters for noise synthesis.
+        if (shared.frame_header.flags & FrameHeader::kNoise) {
+            PROFILER_ZONE("enc GetNoiseParam");
+            if (cparams.photon_noise_iso > 0) {
+                shared.image_features.noise_params =
+                    SimulatePhotonNoise(opsin_->xsize(), opsin_->ysize(), cparams.photon_noise_iso);
+            } else {
+                // Don't start at zero amplitude since adding noise is expensive -- it
+                // significantly slows down decoding, and this is unlikely to
+                // completely go away even with advanced optimizations. After the
+                // kNoiseModelingRampUpDistanceRange we have reached the full level,
+                // i.e. noise is no longer represented by the compressed image, so we
+                // can add full noise by the noise modeling itself.
+                static const float kNoiseModelingRampUpDistanceRange = 0.6;
+                static const float kNoiseLevelAtStartOfRampUp = 0.25;
+                static const float kNoiseRampupStart = 1.0;
+                // TODO(user) test and properly select quality_coef with smooth
+                // filter
+                float quality_coef = 1.0f;
+                const float rampup =
+                    (cparams.butteraugli_distance - kNoiseRampupStart) / kNoiseModelingRampUpDistanceRange;
+                if (rampup < 1.0f) {
+                    quality_coef = kNoiseLevelAtStartOfRampUp + (1.0f - kNoiseLevelAtStartOfRampUp) * rampup;
+                }
+                if (rampup < 0.0f) {
+                    quality_coef = kNoiseRampupStart;
+                }
+                // Drop the noise flag when no usable noise model could be estimated.
+                if (!GetNoiseParameter(*opsin_, &shared.image_features.noise_params, quality_coef)) {
+                    shared.frame_header.flags &= ~FrameHeader::kNoise;
+                }
+            }
+        }
+        if (enc_state_->shared.frame_header.upsampling != 1 && !cparams.already_downsampled) {
+            // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling
+            // after noise, if necessary.
+            DownsampleImage(opsin_, cparams.resampling);
+            PadImageToBlockMultipleInPlace(opsin_);
+        }
+
+        const FrameDimensions& frame_dim_ = enc_state_->shared.frame_dim;
+        size_t target_size = TargetSize(cparams, frame_dim_);
+        size_t opsin_target_size = target_size;
+        if (cparams.target_size > 0 || cparams.target_bitrate > 0.0) {
+            // Writes to the local copy of cparams only.
+            cparams.target_size = opsin_target_size;
+        } else if (cparams.butteraugli_distance < 0) {
+            return JXL_FAILURE("Expected non-negative distance");
+        }
+
+#ifndef XLNX_DISABLE_BLK_DICT
+        // Find and subtract splines.
+        if (cparams.speed_tier <= SpeedTier::kSquirrel) {
+            shared.image_features.splines = FindSplines(*opsin_);
+            JXL_RETURN_IF_ERROR(shared.image_features.splines.SubtractFrom(opsin_, shared.cmap));
+        }
+
+        // Find and subtract patches/dots.
+        if (ApplyOverride(cparams.patches, cparams.speed_tier <= SpeedTier::kSquirrel)) {
+            FindBestPatchDictionary(*opsin_, enc_state_, pool, aux_out);
+            PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin_);
+        }
+#endif
+
+        static const float kAcQuant = 0.79f;
+        const float quant_dc = InitialQuantDC(cparams.butteraugli_distance);
+        Quantizer& quantizer = enc_state_->shared.quantizer;
+        // We don't know the quant field yet, but for computing the global scale
+        // assuming that it will be the same as for Falcon mode is good enough.
+        quantizer.ComputeGlobalScaleAndQuant(quant_dc, kAcQuant / cparams.butteraugli_distance, 0);
+
+        // TODO(veluca): we can now run all the code from here to FindBestQuantizer
+        // (excluded) one rect at a time. Do that.
+
+        // Dependency graph:
+        //
+        // input: either XYB or input image
+        //
+        // input image -> XYB [optional]
+        // XYB -> initial quant field
+        // XYB -> Gaborished XYB
+        // Gaborished XYB -> CfL1
+        // initial quant field, Gaborished XYB, CfL1 -> ACS
+        // initial quant field, ACS, Gaborished XYB -> EPF control field
+        // initial quant field -> adjusted initial quant field
+        // adjusted initial quant field, ACS -> raw quant field
+        // raw quant field, ACS, Gaborished XYB -> CfL2
+        //
+        // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field.
+
+        // Only taken when opsin is still empty at this point; opsin was
+        // allocated from ib's dimensions at the top of this function.
+        if (!opsin_->xsize()) {
+            JXL_ASSERT(enc_state_->heuristics->HandlesColorConversion(cparams, *ib_or_linear));
+            *opsin_ = Image3F(RoundUpToBlockDim(ib_or_linear->xsize()), RoundUpToBlockDim(ib_or_linear->ysize()));
+            opsin_->ShrinkTo(ib_or_linear->xsize(), ib_or_linear->ysize());
+            ToXYB(*ib_or_linear, pool, opsin_, /*linear=*/nullptr);
+            PadImageToBlockMultipleInPlace(opsin_);
+        }
+
+        // Compute an initial estimate of the quantization field.
+        // Call InitialQuantField only in Hare mode or slower. Otherwise, rely
+        // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon
+        // mode.
+        if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) {
+            enc_state_->initial_quant_field = ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+            float q = cparams.uniform_quant > 0 ? cparams.uniform_quant : kAcQuant / cparams.butteraugli_distance;
+            FillImage(q, &enc_state_->initial_quant_field);
+        } else {
+            // Call this here, as it relies on pre-gaborish values.
+            float butteraugli_distance_for_iqf = cparams.butteraugli_distance;
+            if (!shared.frame_header.loop_filter.gab) {
+                butteraugli_distance_for_iqf *= 0.73f;
+            }
+            enc_state_->initial_quant_field = InitialQuantField(butteraugli_distance_for_iqf, *opsin_, shared.frame_dim,
+                                                                pool, 1.0f, &enc_state_->initial_quant_masking);
+        }
+
+        // TODO(veluca): do something about animations.
+
+        // Apply inverse-gaborish.
+        if (shared.frame_header.loop_filter.gab) {
+            GaborishInverse(opsin_, 0.9908511000000001f, pool);
+        }
+    }
+    return true;
+}
+} // namespace jxl
+#endif
\ No newline at end of file
diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase2.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase2.cpp
new file mode 100644
index 0000000000..f47dd76fde
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase2.cpp
@@ -0,0 +1,415 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +#ifndef HLS_KERNEL2_CPP +#define HLS_KERNEL2_CPP + +#include "acc_phase2.hpp" + +namespace jxl { + +Status acc_phase2(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const std::vector* extra_channels, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + ThreadPool* pool, + AuxOut* aux_out) { + if (frame_header->encoding == FrameEncoding::kVarDCT) { + std::vector& group_caches_ = lossy_frame_encoder.get_group_cashes(); + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + PassesSharedState& shared = enc_state_->shared; + Image3F* opsin_ = &opsin; + Quantizer& quantizer = enc_state_->shared.quantizer; + + size_t tile_xsize = (opsin.xsize() + 63) / 64 * 64; + size_t tile_ysize = (opsin.ysize() + 63) / 64 * 64; +#ifdef XLNX_QC_DEBUG_DCT +/*std::cout << std::endl + << "======================================== full origin pixel " + "==============================================" + << std::endl; +for (int c = 0; c < 3; c++) { + if (c == 0) { + std::cout << std::setw(15) << 0 << " "; + for (int m = 0; m < tile_xsize; m++) { + std::cout << std::setw(15) << m << " "; + } + std::cout << std::endl << std::endl; + + for (int y = 0; y < tile_ysize; y++) { + std::cout << std::setw(15) << y << " "; + const float* JXL_RESTRICT row_y = opsin.ConstPlaneRow(c, y); + for (int x = 0; x < tile_xsize; x++) { + std::cout << std::setw(15) << row_y[x] << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } +}*/ +#endif + + std::vector > dctIDT(3, std::vector(tile_xsize * tile_ysize)); + std::vector > dct2x2(3, std::vector(tile_xsize * tile_ysize)); + std::vector > dct4x4(3, std::vector(tile_xsize * tile_ysize)); + std::vector > dct8x8(3, std::vector(tile_xsize * tile_ysize)); + std::vector > dct16x16(3, std::vector(tile_xsize * tile_ysize)); + std::vector > dct32x32(3, 
std::vector(tile_xsize * tile_ysize)); + + std::vector > dcIDT(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc2x2(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc4x4(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc8x8(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc16x16(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc32x32(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 8) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 8) { + float* mem = (float*)calloc(8UL * 8UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::IDENTITY); + size_t xs = acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 64; m++) { + dctIDT[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + m] = mem[m]; + } + dcIDT[c][y / 8 * tile_xsize / 8 + x / 8] = dc_mem[0]; + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 8) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 8) { + float* mem = (float*)calloc(8UL * 8UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = 
AcStrategy::FromRawStrategy(AcStrategy::Type::DCT2X2); + size_t xs = acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 64; m++) { + dct2x2[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + m] = mem[m]; + } + dc2x2[c][y / 8 * tile_xsize / 8 + x / 8] = dc_mem[0]; + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 8) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 8) { + float* mem = (float*)calloc(8UL * 8UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::DCT4X4); + size_t xs = acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 64; m++) { + dct4x4[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + m] = mem[m]; + } + dc4x4[c][y / 8 * tile_xsize / 8 + x / 8] = dc_mem[0]; + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 8) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 8) { + float* mem = (float*)calloc(8UL * 8UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::DCT); + size_t xs = 
acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 64; m++) { + dct8x8[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + m] = mem[m]; + } + dc8x8[c][y / 8 * (tile_xsize / 8) + x / 8] = dc_mem[0]; + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 16) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 16) { + float* mem = (float*)calloc(16UL * 16UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::DCT16X16); + size_t xs = acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 16 * 16; m++) { + dct16x16[c][16 * 16 * (y / 16 * (tile_xsize / 16) + x / 16) + m] = mem[m]; + } + for (int m = 0; m < 4; m++) { + dc16x16[c][4 * (y / 16 * (tile_xsize / 16) + x / 16) + m] = dc_mem[m]; + } + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 32) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 32) { + float* mem = (float*)calloc(32UL * 32UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::DCT32X32); + size_t xs 
= acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 32 * 32; m++) { + dct32x32[c][32 * 32 * (y / 32 * (tile_xsize / 32) + x / 32) + m] = mem[m]; + } + for (int m = 0; m < 16; m++) { + dc32x32[c][16 * (y / 32 * (tile_xsize / 32) + x / 32) + m] = dc_mem[m]; + } + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + +#ifdef XLNX_QC_DEBUG_DCT + std::cout << std::endl + << "======================================== full coef " + "==============================================" + << std::endl; + for (int c = 0; c < 3; c++) { + if (c == 1) { + std::cout << std::setw(15) << 0 << " "; + for (int m = 0; m < tile_xsize; m++) { + std::cout << std::setw(15) << m << " "; + } + std::cout << std::endl << std::endl; + for (int y = 0; y < tile_ysize; y++) { + std::cout << std::setw(15) << y << " "; + for (int x = 0; x < tile_xsize; x++) { + std::cout << std::setw(15) << dct8x8[c][y * tile_xsize + x] << " "; + } + std::cout << std::endl; + } + } + } +#endif + +#ifdef XLNX_QC_DEBUG_DC + std::cout << std::endl + << "======================================== full DC " + "==============================================" + << std::endl; + for (int c = 0; c < 3; c++) { + if (c == 1) { + std::cout << std::setw(15) << 0 << " "; + for (int m = 0; m < tile_xsize / 8; m++) { + std::cout << std::setw(15) << m << " "; + } + std::cout << std::endl << std::endl; + for (int y = 0; y < tile_ysize / 8; y++) { + std::cout << std::setw(15) << y << " "; + for (int x = 0; x < tile_xsize / 8; x++) { + std::cout << std::setw(15) << dc32x32[c][y * tile_xsize / 8 + x] << " "; + } + std::cout << std::endl; + } + } + } +#endif + + ArControlFieldHeuristics ar_heuristics; + AcStrategyHeuristics acs_heuristics; + CfLHeuristics cfl_heuristics; + + cfl_heuristics.Init(*opsin_); + acs_heuristics.Init(*opsin_, enc_state_); + 
ar_heuristics.PrepareForThreads(/*num_threads*/ 1); + cfl_heuristics.PrepareForThreads(/*num_threads*/ 1); + + // auto process_tile = [&](size_t tid, size_t thread) { + for (int tid = 0; tid < DivCeil(enc_state_->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(enc_state_->shared.frame_dim.ysize_blocks, kEncTileDimInBlocks); + tid++) { + size_t thread = 0; + size_t n_enc_tiles = DivCeil(enc_state_->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, enc_state_->shared.frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, enc_state_->shared.frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + + // For speeds up to Wombat, we only compute the color correlation map + // once we know the transform type and the quantization map. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + cfl_heuristics.ComputeTile(r, *opsin_, enc_state_->shared.matrices, + /*ac_strategy=*/nullptr, + /*quantizer=*/nullptr, /*fast=*/false, thread, &enc_state_->shared.cmap, + opsin.xsize(), opsin.ysize(), dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, + dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } + + // Choose block sizes. + acs_heuristics.ProcessRect(r, opsin.xsize(), opsin.ysize(), dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, + dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + +// Choose amount of post-processing smoothing. +// TODO(veluca): should this go *after* AdjustQuantField? +#ifndef XLNX_DISABLE_ARC + ar_heuristics.RunRect(r, *opsin_, enc_state_, thread); +#else + ImageB* JXL_RESTRICT epf_sharpness = &enc_state_->shared.epf_sharpness; + FillPlane(static_cast(4), epf_sharpness, r); +#endif + // Always set the initial quant field, so we can compute the CfL map + // with more accuracy. 
The initial quant field might change in slower + // modes, but adjusting the quant field with butteraugli when all the + // other encoding parameters are fixed is likely a more reliable choice + // anyway. + AdjustQuantField(enc_state_->shared.ac_strategy, r, &enc_state_->initial_quant_field); + quantizer.SetQuantFieldRect(enc_state_->initial_quant_field, r, &enc_state_->shared.raw_quant_field); + +// Compute a non-default CfL map if we are at Hare speed, or slower. +#ifndef XLNX_DISABLE_2NDCMP + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeTile(r, *opsin_, enc_state_->shared.matrices, &enc_state_->shared.ac_strategy, + &enc_state_->shared.quantizer, + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread, + &enc_state_->shared.cmap, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, dct32x32, + dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } +#endif + }; + /* RunOnPool(pool, 0, DivCeil(enc_state_->shared.frame_dim.xsize_blocks, + kEncTileDimInBlocks) * + DivCeil(enc_state_->shared.frame_dim.ysize_blocks, + kEncTileDimInBlocks), + [&](const size_t num_threads) { + ar_heuristics.PrepareForThreads(num_threads); + cfl_heuristics.PrepareForThreads(num_threads); + return true; + }, + process_tile, "Enc Heuristics");*/ + + acs_heuristics.Finalize(aux_out); + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeDC( + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, &enc_state_->shared.cmap); + } + + FindBestDequantMatrices(cparams, *opsin_, modular_frame_encoder.get(), &enc_state_->shared.matrices); + + // Refine quantization levels. + FindBestQuantizer(ib_or_linear, *opsin_, enc_state_, pool, aux_out); + + // Choose a context model that depends on the amount of quantization for + // AC. 
+ if (cparams.speed_tier < SpeedTier::kFalcon) { + FindBestBlockEntropyModel(*enc_state_); + } + +#ifdef XLNX_DEBUG_CMAP + /* Debug dump of the colour-correlation maps: the Y-to-X DC value and map first, then the Y-to-B DC value and map. */ std::cout << "=========================================" << std::endl; + std::cout << "ColorMap info: " << std::endl; + ImageSB* JXL_RESTRICT tmp_map = &enc_state_->shared.cmap.ytox_map; + int32_t dc = enc_state_->shared.cmap.GetYToXDC(); + std::cout << "Y to X dc: " << dc << std::endl; + for (int i = 0; i < tmp_map->ysize(); i++) { + int8_t* JXL_RESTRICT row_out = tmp_map->Row(i); + for (int j = 0; j < tmp_map->xsize(); j++) { + std::cout << (int)row_out[j] << " "; + } + std::cout << std::endl; + } + + /* The second dump is labelled "Y to B" and uses GetYToBDC(), so it must read ytob_map; it previously re-read ytox_map and printed the Y-to-X map twice. */ tmp_map = &enc_state_->shared.cmap.ytob_map; + dc = enc_state_->shared.cmap.GetYToBDC(); + std::cout << "Y to B dc: " << dc << std::endl; + for (int i = 0; i < tmp_map->ysize(); i++) { + int8_t* JXL_RESTRICT row_out = tmp_map->Row(i); + for (int j = 0; j < tmp_map->xsize(); j++) { + std::cout << (int)row_out[j] << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif + + InitializePassesEncoder(opsin, pool, enc_state_, modular_frame_encoder.get(), aux_out, opsin.xsize(), + opsin.ysize(), dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, dct32x32, dcIDT, dc2x2, dc4x4, + dc8x8, dc16x16, dc32x32); + + enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses()); + for (PassesEncoderState::PassData& pass : enc_state_->passes) { + pass.ac_tokens.resize(shared.frame_dim.num_groups); + } + + lossy_frame_encoder.ComputeAllCoeffOrders(shared.frame_dim); + shared.num_histograms = 1; + + *frame_header = shared.frame_header; + + // needs to happen *AFTER* VarDCT-ComputeEncodingData. 
+ JXL_RETURN_IF_ERROR(modular_frame_encoder->ComputeEncodingData( + *frame_header, *ib.metadata(), &opsin, *extra_channels, lossy_frame_encoder.State(), pool, aux_out, + /* do_color=*/frame_header->encoding == FrameEncoding::kModular)); + } + return true; +} +} // namespace jxl + +#endif diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase3.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase3.cpp new file mode 100644 index 0000000000..225caa2097 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/host_acc_cluster_histogram/acc_phase3.cpp @@ -0,0 +1,1586 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef HLS_KERNEL3_CPP +#define HLS_KERNEL3_CPP + +#include "acc_phase3.hpp" + +#include /* NOTE(review): the include target is missing in this copy of the patch, and template arguments below (bare std::vector, static_cast) look stripped as well - restore from the original file. */ + +#include "acc_init_histogram.hpp" +#include "acc_store_encode_data.hpp" +#include "lib/jxl/lehmer_code.h" + +#ifndef HLS_TEST +#include "host_cluster_histogram.hpp" +#else +#include "hls_cluster_histogram.hpp" +#endif +// void test(int* in, int* out); + +// inline int tvdiff(struct timeval* tv0, struct timeval* tv1) { +// return (tv1->tv_sec - tv0->tv_sec) * 1000000 + (tv1->tv_usec - tv0->tv_usec); +//} + +namespace jxl { +namespace { +/* Returns the index of the first element of v equal to value, or v.size() when no element matches. */ size_t IndexOf(const std::vector& v, uint8_t value) { + size_t i = 0; + for (; i < v.size(); ++i) { + if (v[i] == value) return i; + } + return i; +} + +/* Moves (*v)[index] to position 0, shifting the elements previously at [0, index) up by one slot. index is not range-checked. */ void MoveToFront(std::vector* v, size_t index) { + uint8_t value = (*v)[index]; + for (size_t i = index; i != 0; --i) { + (*v)[i] = (*v)[i - 1]; + } + (*v)[0] = value; +} + +/* Move-to-front encodes v: the table mtf starts as 0..max(v); each output entry is the current position of the input value in mtf, and that table entry is moved to the front after every lookup. An empty input is returned unchanged. */ std::vector MoveToFrontTransform(const std::vector& v) { + if (v.empty()) return v; + uint8_t max_value = *std::max_element(v.begin(), v.end()); + std::vector mtf(max_value + 1); + for (size_t i = 0; i <= max_value; ++i) mtf[i] = i; + std::vector result(v.size()); + for (size_t i = 0; i < v.size(); ++i) { + size_t index = IndexOf(mtf, 
v[i]); + JXL_ASSERT(index < mtf.size()); + result[i] = static_cast(index); + MoveToFront(&mtf, index); + } + return result; +} +} // namespace + +namespace { + +/* Appends the tokens describing the permutation order[0..size) to *tokens. The code written by ComputeLehmerCode is trimmed of trailing zeros (never below skip); the first token is (CoeffOrderContext(size), end - skip), followed by one token per entry in [skip, end) whose context is CoeffOrderContext of the previous entry (0 for the first). */ void acc_TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, + size_t skip, + size_t size, + std::vector* tokens) { + std::vector lehmer(size); + std::vector temp(size + 1); + ComputeLehmerCode(order, temp.data(), size, lehmer.data()); + size_t end = size; + while (end > skip && lehmer[end - 1] == 0) { + --end; + } + tokens->emplace_back(CoeffOrderContext(size), end - skip); + uint32_t last = 0; + for (size_t i = skip; i < end; ++i) { + tokens->emplace_back(CoeffOrderContext(last), lehmer[i]); + last = lehmer[i]; + } +} + +} // namespace + +namespace { +/* Remaps order through acs.NaturalCoeffOrderLut() into order_zigzag and tokenizes the result with acc_TokenizePermutation, skipping the first llf = covered_blocks_x * covered_blocks_y entries. order_zigzag is scratch space and is written for kDCTBlockSize * llf entries, so it must be at least that large. */ void acc_EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, + AcStrategy acs, + std::vector* tokens, + coeff_order_t* order_zigzag) { + const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); + const size_t size = kDCTBlockSize * llf; + const coeff_order_t* natural_coeff_order_lut = acs.NaturalCoeffOrderLut(); + for (size_t i = 0; i < size; ++i) { + order_zigzag[i] = natural_coeff_order_lut[order[i]]; + } + acc_TokenizePermutation(order_zigzag, llf, size, tokens); +} +} // namespace + +/* Tokenizes the AC coefficients of every group and pass (TokenizeCoefficients dispatched through RunOnPool), then, for VarDCT frames, appends the coefficient-order tokens of each used order to coefOrders_tokens[0]. The only return statement returns true. */ Status acc_predictAndtoken(LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& frame_header, + std::vector >& coefOrders_tokens, + ThreadPool* pool) { + std::vector& group_caches_ = lossy_frame_encoder.get_group_cashes(); + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + PassesSharedState& shared = enc_state_->shared; + + const auto tokenize_group_init = [&](const size_t num_threads) { + group_caches_.resize(num_threads); + return true; + }; + const auto tokenize_group = [&](const int group_index, const int thread) { + // Tokenize coefficients. 
+ const Rect rect = shared.BlockGroupRect(group_index); + for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); idx_pass++) { + JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32); + const int32_t* JXL_RESTRICT ac_rows[3] = { + enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32, + }; + // Ensure group cache is initialized. + group_caches_[thread].InitOnce(); + TokenizeCoefficients(&shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, ac_rows, + shared.ac_strategy, frame_header->chroma_subsampling, + &group_caches_[thread].num_nzeroes, + &enc_state_->passes[idx_pass].ac_tokens[group_index], enc_state_->shared.quant_dc, + enc_state_->shared.raw_quant_field, enc_state_->shared.block_ctx_map); + } + }; + /* Dispatch tokenize_group over the range 0..shared.frame_dim.num_groups as passed to RunOnPool. NOTE(review): any status returned by RunOnPool is not checked here - confirm that is intended. */ RunOnPool(pool, 0, shared.frame_dim.num_groups, tokenize_group_init, tokenize_group, "TokenizeGroup"); + + const coeff_order_t* JXL_RESTRICT order = &enc_state_->shared.coeff_orders[0 * enc_state_->shared.coeff_order_size]; + auto mem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + uint16_t computed = 0; + uint16_t used_orders = enc_state_->used_orders[0]; + + /* computed records the kStrategyOrder values already visited so each one is handled once; only orders whose bit is set in used_orders are tokenized, for all three channels, with mem as scratch. NOTE(review): acc_histogram contains this same loop and also appends to coefOrders_tokens[0] - confirm the two are not both applied to the same vector. */ if (frame_header->encoding == FrameEncoding::kVarDCT) { + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + if ((used_orders & (1 << ord)) == 0) continue; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + for (size_t c = 0; c < 3; c++) { + acc_EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, &coefOrders_tokens[0], mem.get()); + } + } + } + return true; +} + +/* Returns a pointer to group_codes[0] when is_small_image is true, otherwise to group_codes[index]; no bounds checking is done. */ BitWriter* get_output(const size_t index, std::vector& group_codes, bool is_small_image) { + return &group_codes[is_small_image ? 
0 : index]; +} + +Status acc_histogram(std::string xclbinPath, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + std::unique_ptr& frame_header, + CompressParams cparams, + std::vector >& coefOrders_tokens, + BitWriter* group_codes_writer, + BitWriter* acInfo_writer, + size_t& ans_cost, + size_t& mtf_cost, + std::vector >& bcm_tokens, + std::vector >& bcm_mtf_tokens, + EntropyEncodingData& bcm_codes, + std::vector& bcm_dummy_context_map, + + EntropyEncodingData& modularFramTree_code, + std::vector& modularFramTree_ctxmap, + + EntropyEncodingData& coefOrders_codes, + std::vector& coefOrders_context_map, + + std::vector& aux_outs, + AuxOut* aux_out) { + std::vector& group_caches_ = lossy_frame_encoder.get_group_cashes(); + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + PassesSharedState& shared = enc_state_->shared; + + const coeff_order_t* JXL_RESTRICT order = &enc_state_->shared.coeff_orders[0 * enc_state_->shared.coeff_order_size]; + auto mem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + uint16_t computed = 0; + uint16_t used_orders = enc_state_->used_orders[0]; + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + if ((used_orders & (1 << ord)) == 0) continue; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + for (size_t c = 0; c < 3; c++) { + acc_EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, &coefOrders_tokens[0], mem.get()); + } + } + } + + HistogramParams params0; + params0.clustering = HistogramParams::ClusteringType::kFast; + params0.uint_method = HistogramParams::HybridUintMethod::kNone; + params0.lz77_method = HistogramParams::LZ77Method::kNone; + HistogramParams params1; + params1.clustering = HistogramParams::ClusteringType::kFast; + 
params1.uint_method = HistogramParams::HybridUintMethod::kNone; + params1.lz77_method = HistogramParams::LZ77Method::kNone; + HistogramParams params2; + params2.clustering = HistogramParams::ClusteringType::kFast; + params2.uint_method = HistogramParams::HybridUintMethod::kNone; + params2.lz77_method = HistogramParams::LZ77Method::kNone; + HistogramParams params3; + params3.clustering = HistogramParams::ClusteringType::kFast; + params3.uint_method = HistogramParams::HybridUintMethod::kNone; + params3.lz77_method = HistogramParams::LZ77Method::kNone; + HistogramParams params4(enc_state_->cparams.speed_tier, enc_state_->shared.block_ctx_map.NumACContexts()); + if (enc_state_->cparams.decoding_speed_tier >= 1) { + params4.max_histograms = 6; + } + + params4.clustering = HistogramParams::ClusteringType::kFast; + params4.uint_method = HistogramParams::HybridUintMethod::kNone; + params4.lz77_method = HistogramParams::LZ77Method::kNone; + std::vector context_map0; + std::vector context_map1; + std::vector context_map2; + std::vector context_map3; + std::vector context_map4; + std::vector context_map_c0; + std::vector context_map_c1; + std::vector context_map_c2; + std::vector context_map_c3; + std::vector context_map_c4; + std::vector > tokens0(1); + std::vector > tokens1(1); + std::vector > tokens2(1); + std::vector > tokens3(1); + std::vector > tokens4(1); + std::vector > tokens_c0(1); + std::vector > tokens_c1(1); + std::vector > tokens_c2(1); + std::vector > tokens_c3(1); + std::vector > tokens_c4(1); + EntropyEncodingData codes0; + EntropyEncodingData codes1; + EntropyEncodingData codes2; + EntropyEncodingData codes3; + EntropyEncodingData codes4; + EntropyEncodingData codes_c0; + EntropyEncodingData codes_c1; + EntropyEncodingData codes_c2; + EntropyEncodingData codes_c3; + EntropyEncodingData codes_c4; + std::vector clustered_histograms0; + std::vector clustered_histograms1; + std::vector clustered_histograms2; + std::vector clustered_histograms3; + std::vector 
clustered_histograms4; + std::vector clustered_histograms_c0; + std::vector clustered_histograms_c1; + std::vector clustered_histograms_c2; + std::vector clustered_histograms_c3; + std::vector clustered_histograms_c4; + BitWriter* writer0 = nullptr; + BitWriter* writer1 = nullptr; + BitWriter* writer2 = nullptr; + BitWriter* writer3 = nullptr; + BitWriter* writer4 = nullptr; + size_t layer0 = 0; + size_t layer1 = 0; + size_t layer2 = 0; + size_t layer3 = 0; + size_t layer4 = 0; + size_t num_contexts0 = 1; + size_t num_contexts1 = 1; + size_t num_contexts2 = 1; + size_t num_contexts3 = 1; + size_t num_contexts4 = 1; + bool do_once[5] = {0, 0, 0, 0, 0}; + char* do_inner = (char*)malloc(sizeof(char) * 8); + for (int i = 0; i < 5; i++) do_inner[i] = 0; + char* do_prefix_in = (char*)malloc(sizeof(char) * 8); + for (int i = 0; i < 5; i++) do_prefix_in[i] = 0; + char* do_prefix_out = (char*)malloc(sizeof(char) * 8); + for (int i = 0; i < 5; i++) do_prefix_out[i] = 0; + + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + + if (!is_small_image) { + group_codes_writer->init(200); + group_codes_writer->update_part(0); + } else { + group_codes_writer->init(200); + group_codes_writer->update_part(0); + } + + bool all_default = true; + const float* dc_quant = (lossy_frame_encoder.State()->shared.matrices).DCQuants(); + for (size_t c = 0; c < 3; c++) { + if (dc_quant[c] != kDCQuant[c]) { + all_default = false; + } + } + BitWriter::Allotment allotment(group_codes_writer, 1 + sizeof(float) * kBitsPerByte * 3); + group_codes_writer->Write(1, all_default); + if (!all_default) { + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR(F16Coder::Write(dc_quant[c] * 128.0f, group_codes_writer)); + } + } + ReclaimAndCharge(group_codes_writer, &allotment, kLayerDequantTables, aux_out); + + auto& dct = enc_state_->shared.block_ctx_map.dc_thresholds; + auto& qft = 
enc_state_->shared.block_ctx_map.qf_thresholds; + auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map; + if (frame_header->encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR(enc_state_->shared.quantizer.Encode(group_codes_writer, kLayerQuant, aux_out)); + //============Encode GlobalDCInfo: Block Context Map========= + if (dct[0].empty() && dct[1].empty() && dct[2].empty() && qft.empty() && ctx_map.size() == 21 && + std::equal(ctx_map.begin(), ctx_map.end(), jxl::kDefaultCtxMap)) { + group_codes_writer->Write(1, 1); // default + } else { + group_codes_writer->Write(1, 0); + for (int j : {0, 1, 2}) { + group_codes_writer->Write(4, dct[j].size()); + for (int i : dct[j]) { + JXL_CHECK(U32Coder::Write(kDCThresholdDist, PackSigned(i), group_codes_writer)); + } + } + group_codes_writer->Write(4, qft.size()); + for (uint32_t i : qft) { + JXL_CHECK(U32Coder::Write(kQFThresholdDist, i - 1, group_codes_writer)); + } + for (size_t i = 0; i < ctx_map.size(); i++) { + bcm_tokens[0].emplace_back(0, ctx_map[i]); + } + + { + std::vector context_map = ctx_map; + BitWriter* writer = group_codes_writer; + writer0 = group_codes_writer; + size_t num_histograms = enc_state_->shared.block_ctx_map.num_ctxs; + if (num_histograms == 1) { + // Simple code + writer->Write(1, 1); + // 0 bits per entry. 
+ writer->Write(2, 0); + } else { + std::vector > tokens(1); + for (size_t i = 0; i < context_map.size(); i++) { + tokens[0].emplace_back(0, context_map[i]); + } + + size_t entry_bits = CeilLog2Nonzero(num_histograms); + size_t simple_cost = entry_bits * context_map.size(); + if (entry_bits < 4) { + writer->Write(1, 1); + writer->Write(2, entry_bits); + for (size_t i = 0; i < context_map.size(); i++) { + writer->Write(entry_bits, context_map[i]); + } + } else { + writer->Write(1, 0); + writer->Write(1, 0); + EntropyEncodingData context_codes0; + std::vector > context_tokens0(1); + do_once[0] = true; + num_contexts0 = 1; + tokens0 = tokens; + codes0 = bcm_codes; + context_map0 = bcm_dummy_context_map; + // codes_c0 = context_codes0; + // writer0 = writer; + layer0 = 0; + + // BuildAndEncodeHistogramsNew0 + // ========================================================= + } + } + } + } + //============================= + //============Encode GlobalDCInfo: Color Correlation Map========= + if (!is_small_image) { + group_codes_writer->update_part(20); + } else { + group_codes_writer->update_part(20); + } + ColorCorrelationMapEncodeDC(&enc_state_->shared.cmap, group_codes_writer, kLayerDC, aux_out); + //============================= + } + + if (!is_small_image) { + group_codes_writer->update_part(30); + } else { + group_codes_writer->update_part(30); + } + + writer1 = group_codes_writer; + writer2 = group_codes_writer; + BitWriter::Allotment allotmentGlobalInfo(group_codes_writer, 1); + // If we are using brotli, or not using modular mode. 
+ if (modular_frame_encoder->tree_tokens.empty() || modular_frame_encoder->tree_tokens[0].empty()) { + group_codes_writer->Write(1, 0); + ReclaimAndCharge(group_codes_writer, &allotmentGlobalInfo, kLayerModularTree, aux_out); + } else { + group_codes_writer->Write(1, 1); + ReclaimAndCharge(group_codes_writer, &allotmentGlobalInfo, kLayerModularTree, aux_out); + // Write tree + if (cparams.speed_tier > SpeedTier::kKitten) { + params1.ans_histogram_strategy = HistogramParams::ANSHistogramStrategy::kApproximate; + params2.ans_histogram_strategy = HistogramParams::ANSHistogramStrategy::kApproximate; + } + + if (cparams.decoding_speed_tier >= 1) { + params1.max_histograms = 12; + params2.max_histograms = 12; + } + + EntropyEncodingData context_codes1; + std::vector > context_tokens1(1); + std::vector dummy_context_map1; + + do_once[1] = true; + num_contexts1 = kNumTreeContexts; + tokens1 = modular_frame_encoder->tree_tokens; + codes1 = modularFramTree_code; + context_map1 = modularFramTree_ctxmap; + ////codes_c0 = context_codes0; + ////writer0 = writer; + layer1 = kLayerModularTree; + + // BuildAndEncodeHistogramsNew1 + + if (!is_small_image) { + group_codes_writer->update_part(50); + } else { + group_codes_writer->update_part(50); + } + params2.image_widths = modular_frame_encoder->image_widths; + // Write histograms. 
+ EntropyEncodingData context_codes2; + std::vector > context_tokens2(1); + std::vector dummy_context_map2; + + do_once[2] = true; + num_contexts2 = (modular_frame_encoder->tree.size() + 1) / 2; + tokens2 = modular_frame_encoder->tokens; + codes2 = modular_frame_encoder->code; + context_map2 = modular_frame_encoder->context_map; + ////codes_c0 = context_codes0; + ////writer0 = writer; + layer2 = kLayerModularGlobal; + + // BuildAndEncodeHistogramsNew2 + } + + //============================= Encode Global ACInfo ============= + if (!is_small_image) { + acInfo_writer->init(200); + acInfo_writer->update_part(0); + } else { + acInfo_writer->update_part(80); + } + writer3 = acInfo_writer; + writer4 = acInfo_writer; + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + bool all_default = true; + const std::vector& encodings = (enc_state_->shared.matrices).encodings(); + + for (size_t i = 0; i < encodings.size(); i++) { + if (encodings[i].mode != QuantEncoding::kQuantModeLibrary || encodings[i].predefined != 0) { + all_default = false; + } + } + // TODO(janwas): better bound + BitWriter::Allotment allotment(acInfo_writer, 512 * 1024); + acInfo_writer->Write(1, all_default); + ReclaimAndCharge(acInfo_writer, &allotment, kLayerDequantTables, aux_out); + + size_t num_histo_bits = CeilLog2Nonzero(enc_state_->shared.frame_dim.num_groups); + if (num_histo_bits != 0) { + BitWriter::Allotment allotment(acInfo_writer, num_histo_bits); + acInfo_writer->Write(num_histo_bits, enc_state_->shared.num_histograms - 1); + ReclaimAndCharge(acInfo_writer, &allotment, kLayerAC, aux_out); + } + + //============= encode coef orders======== + // Encode coefficient orders. 
+ uint16_t used_orders = enc_state_->used_orders[0]; + size_t order_bits = 0; + JXL_RETURN_IF_ERROR(U32Coder::CanEncode(kOrderEnc, enc_state_->used_orders[0], &order_bits)); + BitWriter::Allotment allotmentCoef(acInfo_writer, order_bits); + JXL_CHECK(U32Coder::Write(kOrderEnc, enc_state_->used_orders[0], acInfo_writer)); + ReclaimAndCharge(acInfo_writer, &allotmentCoef, kLayerOrder, aux_out); + + // Do not write anything if no order is used. + EntropyEncodingData context_codes3; + std::vector > context_tokens3(1); + std::vector dummy_context_map3; + do_once[3] = true; + num_contexts3 = kPermutationContexts; + tokens3 = coefOrders_tokens; + codes3 = coefOrders_codes; + context_map3 = coefOrders_context_map; + ////codes_c0 = context_codes0; + ////writer0 = writer; + layer3 = kLayerOrder; + // BuildAndEncodeHistogramsNew3 + + if (!is_small_image) { + acInfo_writer->update_part(20); + } else { + acInfo_writer->update_part(100); + } + } + + std::vector > histograms_(5); + histograms_[0].resize(num_contexts0); + histograms_[1].resize(num_contexts1); + histograms_[2].resize(num_contexts2); + histograms_[3].resize(num_contexts3); + histograms_[4].resize(enc_state_->shared.num_histograms * enc_state_->shared.block_ctx_map.NumACContexts()); + + std::vector params(5); + std::vector num_contexts(5); + std::vector layer(5); + std::vector codes(5); + std::vector*> context_map(5); + std::vector codes_c(5); + std::vector writer(5); + writer[0] = writer0; + writer[1] = writer1; + writer[2] = writer2; + writer[3] = writer3; + writer[4] = writer4; + + std::vector > nonempty_histograms(5); + std::vector largest_idx(5); + + std::vector > clustered_histograms(5); + + std::vector > clustered_histogramsin(5); + std::vector > > tokensin(5, std::vector >(1)); + std::vector codesin(5); + std::vector > context_map_in(5); + + constexpr float kMinDistanceForDistinctFast = 64.0f; + constexpr float kMinDistanceForDistinctBest = 16.0f; + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + 
do_once[4] = true; + } + + // Build histograms. + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + if (i == 0) { + params[0] = params0; + num_contexts[0] = num_contexts0; + layer[0] = layer0; + codes[0] = &codes0; + context_map[0] = &context_map0; + codes_c[0] = &codes_c0; + } else if (i == 1) { + params[1] = params1; + num_contexts[1] = num_contexts1; + layer[1] = layer1; + codes[1] = &codes1; + context_map[1] = &context_map1; + codes_c[1] = &codes_c1; + } else if (i == 2) { + params[2] = params2; + num_contexts[2] = num_contexts2; + layer[2] = layer2; + codes[2] = &codes2; + context_map[2] = &context_map2; + codes_c[2] = &codes_c2; + } else if (i == 3) { + params[3] = params3; + num_contexts[3] = num_contexts3; + layer[3] = layer3; + codes[3] = &codes3; + context_map[3] = &context_map3; + codes_c[3] = &codes_c3; + } else if (i == 4) { + params[4] = params4; + num_contexts[4] = num_contexts4; + layer[4] = kLayerAC; + codes[4] = &enc_state_->passes[0].codes; + context_map[4] = &enc_state_->passes[0].context_map; + codes_c[4] = &codes_c4; + } + } + + acc_ANSinitHistogram(lossy_frame_encoder, frame_header, params, do_once, tokens0, tokens1, tokens2, tokens3, + do_prefix_out, largest_idx, nonempty_histograms, histograms_); + + uint32_t numHisto[5]; + uint32_t numCtx[5]; + + std::vector histograms_ptr(5); + std::vector histo_totalcnt_ptr(5); + std::vector histo_size_ptr(5); + std::vector nonempty_histo_ptr(5); + + for (int i = 0; i < 5; i++) { + numHisto[i] = histograms_[i].size(); + numCtx[i] = num_contexts[i]; + histograms_ptr[i] = (int32_t*)malloc(4096 * 40 * sizeof(int32_t)); + memset(histograms_ptr[i], 0, 4096 * 40 * sizeof(int32_t)); + histo_totalcnt_ptr[i] = (uint32_t*)malloc(4096 * sizeof(uint32_t)); + memset(histo_totalcnt_ptr[i], 0, 4096 * sizeof(int32_t)); + histo_size_ptr[i] = (uint32_t*)malloc(4096 * sizeof(int32_t)); + memset(histo_size_ptr[i], 0, 4096 * sizeof(uint32_t)); + nonempty_histo_ptr[i] = (uint32_t*)malloc(4096 * sizeof(uint32_t)); + 
memset(nonempty_histo_ptr[i], 0, 4096 * sizeof(uint32_t)); + for (int j = 0; j < histograms_[i].size(); j++) { + histo_totalcnt_ptr[i][j] = histograms_[i][j].total_count_; + histo_size_ptr[i][j] = histograms_[i][j].data_.size(); + for (int k = 0; k < histograms_[i][j].data_.size(); k++) { + histograms_ptr[i][j * 40 + k] = histograms_[i][j].data_[k]; + } + } + + for (int j = 0; j < nonempty_histograms[i].size(); j++) { + nonempty_histo_ptr[i][j] = nonempty_histograms[i][j]; + } + } + + uint32_t numHisto_clusd[5]; + uint32_t histo_size_clusdin[5] = {0, 0, 0, 0, 0}; + + std::vector ctx_map_ptr(5); + std::vector histograms_clusd_ptr(5); + std::vector histo_size_clusd_ptr(5); + std::vector histograms_clusdin_ptr(5); + for (int i = 0; i < 5; i++) { + ctx_map_ptr[i] = (uint8_t*)malloc(4096 * sizeof(uint8_t)); + memset(ctx_map_ptr[i], 0, 4096 * sizeof(uint8_t)); + histograms_clusd_ptr[i] = (int32_t*)malloc(128 * 40 * sizeof(int32_t)); + memset(histograms_clusd_ptr[i], 0, 128 * 40 * sizeof(int32_t)); + histo_size_clusd_ptr[i] = (uint32_t*)malloc(128 * sizeof(uint32_t)); + memset(histo_size_clusd_ptr[i], 0, 128 * sizeof(uint32_t)); + histograms_clusdin_ptr[i] = (int32_t*)malloc(4096 * sizeof(int32_t)); + memset(histograms_clusdin_ptr[i], 0, 4096 * sizeof(int32_t)); + } + + uint32_t* config = (uint32_t*)malloc(35 * sizeof(uint32_t)); + memset(config, 0, 35 * sizeof(uint32_t)); + + config[0] = histograms_[0].size(); + config[1] = histograms_[1].size(); + config[2] = histograms_[2].size(); + config[3] = histograms_[3].size(); + config[4] = histograms_[4].size(); + config[5] = nonempty_histograms[0].size(); + config[6] = nonempty_histograms[1].size(); + config[7] = nonempty_histograms[2].size(); + config[8] = nonempty_histograms[3].size(); + config[9] = nonempty_histograms[4].size(); + config[10] = largest_idx[0]; + config[11] = largest_idx[1]; + config[12] = largest_idx[2]; + config[13] = largest_idx[3]; + config[14] = largest_idx[4]; + + config[25] = do_once[0]; + config[26] = 
do_once[1]; + config[27] = do_once[2]; + config[28] = do_once[3]; + config[29] = do_once[4]; + +// clang-format off +#ifndef HLS_TEST + hls_ANSclusterHistogram_wrapper( + xclbinPath, + config, + //======= + histograms_ptr[0], + histo_totalcnt_ptr[0], + histo_size_ptr[0], + nonempty_histo_ptr[0], + ctx_map_ptr[0], + histograms_clusd_ptr[0], + histo_size_clusd_ptr[0], + histograms_clusdin_ptr[0], + //======== + histograms_ptr[1], + histo_totalcnt_ptr[1], + histo_size_ptr[1], + nonempty_histo_ptr[1], + ctx_map_ptr[1], + histograms_clusd_ptr[1], + histo_size_clusd_ptr[1], + histograms_clusdin_ptr[1], + //======= + histograms_ptr[2], + histo_totalcnt_ptr[2], + histo_size_ptr[2], + nonempty_histo_ptr[2], + ctx_map_ptr[2], + histograms_clusd_ptr[2], + histo_size_clusd_ptr[2], + histograms_clusdin_ptr[2], + //======= + histograms_ptr[3], + histo_totalcnt_ptr[3], + histo_size_ptr[3], + nonempty_histo_ptr[3], + ctx_map_ptr[3], + histograms_clusd_ptr[3], + histo_size_clusd_ptr[3], + histograms_clusdin_ptr[3], + //====== + histograms_ptr[4], + histo_totalcnt_ptr[4], + histo_size_ptr[4], + nonempty_histo_ptr[4], + ctx_map_ptr[4], + histograms_clusd_ptr[4], + histo_size_clusd_ptr[4], + histograms_clusdin_ptr[4] +); +#else + acc_ANSclusterHistogram(config, + histograms_ptr[0], + histo_totalcnt_ptr[0], + histo_size_ptr[0], + + nonempty_histo_ptr[0], + + ctx_map_ptr[0], + + histograms_clusd_ptr[0], + histo_size_clusd_ptr[0], + + histograms_clusdin_ptr[0], + //======== + histograms_ptr[1], + histo_totalcnt_ptr[1], + histo_size_ptr[1], + + nonempty_histo_ptr[1], + + ctx_map_ptr[1], + + histograms_clusd_ptr[1], + histo_size_clusd_ptr[1], + + histograms_clusdin_ptr[1], + //======= + histograms_ptr[2], + histo_totalcnt_ptr[2], + histo_size_ptr[2], + + nonempty_histo_ptr[2], + + ctx_map_ptr[2], + + histograms_clusd_ptr[2], + histo_size_clusd_ptr[2], + + histograms_clusdin_ptr[2], + //======= + histograms_ptr[3], + histo_totalcnt_ptr[3], + histo_size_ptr[3], + + nonempty_histo_ptr[3], + + 
ctx_map_ptr[3], + + histograms_clusd_ptr[3], + histo_size_clusd_ptr[3], + + histograms_clusdin_ptr[3], + //====== + histograms_ptr[4], + histo_totalcnt_ptr[4], + histo_size_ptr[4], + + nonempty_histo_ptr[4], + + ctx_map_ptr[4], + + histograms_clusd_ptr[4], + histo_size_clusd_ptr[4], + + histograms_clusdin_ptr[4] +); +#endif + // clang-format on + + numHisto_clusd[0] = config[15]; + numHisto_clusd[1] = config[16]; + numHisto_clusd[2] = config[17]; + numHisto_clusd[3] = config[18]; + numHisto_clusd[4] = config[19]; + histo_size_clusdin[0] = config[20]; + histo_size_clusdin[1] = config[21]; + histo_size_clusdin[2] = config[22]; + histo_size_clusdin[3] = config[23]; + histo_size_clusdin[4] = config[24]; + + for (int i = 0; i < 5; i++) { + do_inner[i] = 0; + if (histograms_[i].size() > 1) { + if (numHisto_clusd[i] == 1) { + } else { + size_t entry_bits = CeilLog2Nonzero(numHisto_clusd[i]); + if (entry_bits < 4) { + } else { + do_inner[i] = 1; + } + } + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + + if (do_inner[i]) { + clustered_histogramsin[i].resize(1); + clustered_histogramsin[i][0].data_.resize(histo_size_clusdin[i]); + for (int j = 0; j < histo_size_clusdin[i]; j++) { + clustered_histogramsin[i][0].data_[j] = histograms_clusdin_ptr[i][j]; + } + context_map_in[i].resize(histo_size_clusdin[i]); + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + size_t histograms_size = numHisto[i]; + if (histograms_size > 1) { + if (writer[i] != nullptr) { + size_t num_histograms = numHisto_clusd[i]; + if (num_histograms == 1) { + } else { + for (size_t j = 0; j < numHisto[i]; j++) { + tokensin[i][0].emplace_back(0, ctx_map_ptr[i][j]); + } + } + } + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + if (numHisto[i] > 1) { + clustered_histograms[i].resize(numHisto_clusd[i]); + for (int j = 0; j < numHisto_clusd[i]; j++) { + clustered_histograms[i][j].data_.resize(histo_size_clusd_ptr[i][j]); + for (int k = 0; k < 
histo_size_clusd_ptr[i][j]; k++) { + clustered_histograms[i][j].data_[k] = histograms_clusd_ptr[i][j * 40 + k]; + } + } + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + + codes[i]->lz77.nonserialized_distance_context = num_contexts[i]; + codes[i]->lz77.enabled = false; + codes[i]->lz77.min_symbol = 224; + codes[i]->encoding_info.clear(); + if (do_inner[i]) { + codesin[i].lz77.nonserialized_distance_context = 1; + codesin[i].lz77.enabled = false; + codesin[i].lz77.min_symbol = 224; + codesin[i].encoding_info.clear(); + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + context_map[i]->resize(numHisto[i]); + if (numHisto[i] > 1) { + for (size_t c = 0; c < numHisto[i]; ++c) { + (*context_map[i])[c] = ctx_map_ptr[i][c]; + } + } + } + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + if (i == 0) { + tokens_c0 = tokensin[i]; + codes_c0 = codesin[i]; + context_map_c0 = context_map_in[i]; + clustered_histograms0 = clustered_histograms[i]; + clustered_histograms_c0 = clustered_histogramsin[i]; + } else if (i == 1) { + tokens_c1 = tokensin[i]; + codes_c1 = codesin[i]; + context_map_c1 = context_map_in[i]; + clustered_histograms1 = clustered_histograms[i]; + clustered_histograms_c1 = clustered_histogramsin[i]; + } else if (i == 2) { + tokens_c2 = tokensin[i]; + codes_c2 = codesin[i]; + context_map_c2 = context_map_in[i]; + clustered_histograms2 = clustered_histograms[i]; + clustered_histograms_c2 = clustered_histogramsin[i]; + } else if (i == 3) { + tokens_c3 = tokensin[i]; + codes_c3 = codesin[i]; + context_map_c3 = context_map_in[i]; + clustered_histograms3 = clustered_histograms[i]; + clustered_histograms_c3 = clustered_histogramsin[i]; + } else if (i == 4) { + tokens_c4 = tokensin[i]; + codes_c4 = codesin[i]; + context_map_c4 = context_map_in[i]; + clustered_histograms4 = clustered_histograms[i]; + clustered_histograms_c4 = clustered_histogramsin[i]; + } + do_prefix_in[i] = 0; + } + + for (int i = 0; i < 5; i++) { + 
if (!do_once[i]) continue; + + if (i == 0) { + if (!is_small_image) { + writer[0]->update_part(1); + } else { + writer[0]->update_part(1); + } + + } else if (i == 1) { + if (!is_small_image) { + writer[1]->update_part(31); + } else { + writer[1]->update_part(31); + } + } else if (i == 2) { + if (!is_small_image) { + writer[2]->update_part(51); + } else { + writer[2]->update_part(51); + } + } else if (i == 3) { + if (!is_small_image) { + writer[3]->update_part(1); + } else { + writer[3]->update_part(81); + } + } else if (i == 4) { + if (!is_small_image) { + writer[4]->update_part(21); + } else { + writer[4]->update_part(101); + } + } + + size_t histograms_size = numHisto[i]; + + const size_t max_contexts = std::min((size_t)numCtx[i], kClustersLimit); + BitWriter::Allotment allotment(writer[i], 128 + numCtx[i] * 40 + max_contexts * 96); + if (writer[i]) { + LZ77Params lz77; + lz77.nonserialized_distance_context = numCtx[i]; + lz77.enabled = false; + lz77.min_symbol = 224; + JXL_CHECK(Bundle::Write(lz77 /*codes[i]->lz77*/, writer[i], layer[i], nullptr)); + } + + if (histograms_size > 1) { + size_t num_histograms = numHisto_clusd[i]; + if (writer[i] != nullptr) { + if (num_histograms == 1) { + writer[i]->Write(1, 1); + writer[i]->Write(2, 0); + } else { + size_t entry_bits = CeilLog2Nonzero(num_histograms); + if (entry_bits < 4) { + writer[i]->Write(1, 1); + writer[i]->Write(2, entry_bits); + for (size_t j = 0; j < numHisto[i]; j++) { + writer[i]->Write(entry_bits, ctx_map_ptr[i][j]); + } + } else { + writer[i]->Write(1, 0); + writer[i]->Write(1, 0); + } + } + } + } + // StoreEntropyCodesNew + allotment.FinishedHistogram(writer[i]); + ReclaimAndCharge(writer[i], &allotment, layer[i], nullptr); + + if (do_inner[i]) { + // do inner ontext map = true + BitWriter::Allotment allotment(writer[i], 128 + 1 * 40 + 96); + LZ77Params lz77; + lz77.nonserialized_distance_context = 1; + lz77.enabled = false; + lz77.min_symbol = 224; + JXL_CHECK(Bundle::Write(lz77 
/*codesin[i].lz77*/, writer[i], 0, nullptr)); + + // StoreEntropyCodesNew + // WriteToken + allotment.FinishedHistogram(writer[i]); + ReclaimAndCharge(writer[i], &allotment, 0, nullptr); + } + } + + // ============================================== + // Do StoreEntropyCodes for outer histogram + // ============================================== + // printf("do_prefix_out = %d, %d, %d, %d, %d\n", do_prefix_out[0], + // do_prefix_out[1], do_prefix_out[2], do_prefix_out[3], do_prefix_out[4]); + + if (do_once[0]) { + if (!is_small_image) { + writer0->update_part(4); + } else { + writer0->update_part(4); + } + StoreEntropyCodesNew(params0, tokens0, &codes0, do_prefix_out[0], writer0, layer0, nullptr, + clustered_histograms0); + bcm_codes = codes0; + bcm_dummy_context_map = context_map0; + } + if (do_once[1]) { + if (!is_small_image) { + writer1->update_part(34); + } else { + writer1->update_part(34); + } + StoreEntropyCodesNew(params1, tokens1, &codes1, do_prefix_out[1], writer1, layer1, nullptr, + clustered_histograms1); + modularFramTree_code = codes1; + modularFramTree_ctxmap = context_map1; + } + if (do_once[2]) { + if (!is_small_image) { + writer2->update_part(54); + } else { + writer2->update_part(54); + } + StoreEntropyCodesNew(params2, tokens2, &codes2, do_prefix_out[2], writer2, layer2, nullptr, + clustered_histograms2); + modular_frame_encoder->code = codes2; + modular_frame_encoder->context_map = context_map2; + } + if (do_once[3]) { + if (!is_small_image) { + writer3->update_part(4); + } else { + writer3->update_part(84); + } + StoreEntropyCodesNew(params3, tokens3, &codes3, do_prefix_out[3], writer3, layer3, nullptr, + clustered_histograms3); + coefOrders_codes = codes3; + coefOrders_context_map = context_map3; + } + if (do_once[4]) { + if (!is_small_image) { + writer4->update_part(24); + } else { + writer4->update_part(104); + } + StoreEntropyCodesNew(params4, tokens4, &codes4, do_prefix_out[4], writer4, layer4, nullptr, + clustered_histograms4); + 
enc_state_->passes[0].codes = codes4; + enc_state_->passes[0].context_map = context_map4; + } + + // ============================================== + // Do StoreEntropyCodes for inner histogram + // ============================================== + // printf("do_prefix_in = %d, %d, %d, %d, %d\n", do_prefix_in[0], + // do_prefix_in[1], do_prefix_in[2], do_prefix_in[3], do_prefix_in[4]); + + if (do_inner[0]) { + if (!is_small_image) { + writer0->update_part(2); + } else { + writer0->update_part(2); + } + StoreEntropyCodesNew(params0, tokens_c0, &codes_c0, do_prefix_in[0], writer0, 0, nullptr, + clustered_histograms_c0); + } + if (do_inner[1]) { + if (!is_small_image) { + writer1->update_part(32); + } else { + writer1->update_part(32); + } + StoreEntropyCodesNew(params1, tokens_c1, &codes_c1, do_prefix_in[1], writer1, 0, nullptr, + clustered_histograms_c1); + } + if (do_inner[2]) { + if (!is_small_image) { + writer2->update_part(52); + } else { + writer2->update_part(52); + } + StoreEntropyCodesNew(params2, tokens_c2, &codes_c2, do_prefix_in[2], writer2, 0, nullptr, + clustered_histograms_c2); + } + if (do_inner[3]) { + if (!is_small_image) { + writer3->update_part(2); + } else { + writer3->update_part(82); + } + StoreEntropyCodesNew(params3, tokens_c3, &codes_c3, do_prefix_in[3], writer3, 0, nullptr, + clustered_histograms_c3); + } + if (do_inner[4]) { + if (!is_small_image) { + writer4->update_part(22); + } else { + writer4->update_part(102); + } + StoreEntropyCodesNew(params4, tokens_c4, &codes_c4, do_prefix_in[4], writer4, 0, nullptr, + clustered_histograms_c4); + } + + // ============================================== + // Do WriteTokens for inner histogram + // ============================================== + // printf("do_inner = %d, %d, %d, %d, %d\n", do_inner[0], do_inner[1], + // do_inner[2], do_inner[3], do_inner[4]); + if (do_inner[0]) { + if (!is_small_image) { + writer0->update_part(3); + } else { + writer0->update_part(3); + } + // printf("%s: %s: %d, 
WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c0[0], codes_c0, context_map_c0, writer0); + } + if (do_inner[1]) { + if (!is_small_image) { + writer1->update_part(33); + } else { + writer1->update_part(33); + } + // printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu, context_map.size=%d\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c1[0], codes_c1, context_map_c1, writer1); + } + if (do_inner[2]) { + if (!is_small_image) { + writer2->update_part(53); + } else { + writer2->update_part(53); + } + // printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu, context_map.size=%d\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c2[0], codes_c2, context_map_c2, writer2); + } + if (do_inner[3]) { + if (!is_small_image) { + writer3->update_part(3); + } else { + writer3->update_part(83); + } + // printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu, context_map.size=%d\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c3[0], codes_c3, context_map_c3, writer3); + } + if (do_inner[4]) { + if (!is_small_image) { + writer4->update_part(23); + } else { + writer4->update_part(103); + } + // printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu, context_map.size=%d\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c4[0], codes_c4, context_map_c4, writer4); + } + return true; +} + +Status 
acc_ANS_tokens(LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + const size_t num_groups, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + std::unique_ptr& frame_header, + std::vector >& coefOrders_tokens, + std::vector& group_codes, + BitWriter* group_codes_writer, + BitWriter* acInfo_writer, + std::vector& dc_group_writers, + std::vector& acGroupWriters, + size_t& ans_cost, + size_t& mtf_cost, + std::vector >& bcm_tokens, + std::vector >& bcm_mtf_tokens, + EntropyEncodingData& bcm_codes, + std::vector& bcm_dummy_context_map, + + EntropyEncodingData& modularFramTree_code, + std::vector& modularFramTree_ctxmap, + + EntropyEncodingData& coefOrders_codes, + std::vector& coefOrders_context_map, + std::vector& aux_outs, + AuxOut* aux_out) { + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + PassesSharedState& shared = enc_state_->shared; + const size_t global_ac_index = frame_dim.num_dc_groups + 1; + + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + + const bool has_ac_global = true; + + auto& dct = enc_state_->shared.block_ctx_map.dc_thresholds; + auto& qft = enc_state_->shared.block_ctx_map.qf_thresholds; + auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map; + + //============ANSWriteTokens Encode GlobalDCInfo: Block Context Map========= + if (frame_header->encoding == FrameEncoding::kVarDCT) { + if (dct[0].empty() && dct[1].empty() && dct[2].empty() && qft.empty() && ctx_map.size() == 21 && + std::equal(ctx_map.begin(), ctx_map.end(), jxl::kDefaultCtxMap)) { + } else { + if (enc_state_->shared.block_ctx_map.num_ctxs == 1) { + } else { + size_t entry_bits = CeilLog2Nonzero(enc_state_->shared.block_ctx_map.num_ctxs); + size_t simple_cost = entry_bits * ctx_map.size(); + if (entry_bits < 4 /* && simple_cost < ans_cost && + simple_cost < mtf_cost*/) { + } else { + if 
(!is_small_image) { + group_codes_writer->update_part(10); + } else { + group_codes_writer->update_part(10); + } + WriteTokens(bcm_tokens[0], bcm_codes, bcm_dummy_context_map, group_codes_writer); + } + } + BitWriter::Allotment allotmentGlobalDCInfoBCM( + group_codes_writer, (dct[0].size() + dct[1].size() + dct[2].size() + qft.size()) * 34 + 1 + 4 + 4 + + ctx_map.size() * 10 + 1024); + ReclaimAndCharge(group_codes_writer, &allotmentGlobalDCInfoBCM, kLayerAC, aux_out); + } + } + + //============ANSWriteTokens Encode GlobalDCInfo: modular frame tree========= + if (modular_frame_encoder->tree_tokens.empty() || modular_frame_encoder->tree_tokens[0].empty()) { + } else { + if (!is_small_image) { + group_codes_writer->update_part(40); + } else { + group_codes_writer->update_part(40); + } + WriteTokens(modular_frame_encoder->tree_tokens[0], modularFramTree_code, modularFramTree_ctxmap, + group_codes_writer, kLayerModularTree, aux_out); + } + + //============ANSWriteTokens Encode GlobalDCInfo: modular frame token========= + if (!is_small_image) { + group_codes_writer->update_part(60); + } else { + group_codes_writer->update_part(60); + } + size_t stream_id = ModularStreamId::Global().ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. 
+ } else { + JXL_RETURN_IF_ERROR(Bundle::Write(modular_frame_encoder->stream_headers[stream_id], group_codes_writer, + kLayerModularGlobal, aux_out)); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, group_codes_writer, kLayerModularGlobal, aux_out); + } + + //============================= + + //============================= ANSWriteTokens DC group============= + for (int group_index = 0; group_index < frame_dim.num_dc_groups; group_index++) { + BitWriter* tmp = get_output(group_index + 1, group_codes, is_small_image); + dc_group_writers.emplace_back(tmp); + if (!is_small_image) { + tmp->init(200); + tmp->update_part(0); + } else { + tmp->update_part(70); + } + } + + for (int group_index = 0; group_index < frame_dim.num_dc_groups; group_index++) { + AuxOut* my_aux_out = aux_out ? &aux_outs[0] : nullptr; + BitWriter* output = dc_group_writers[group_index]; + if (frame_header->encoding == FrameEncoding::kVarDCT && !(frame_header->flags & FrameHeader::kUseDcFrame)) { + BitWriter::Allotment allotment(output, 2); + output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]); + ReclaimAndCharge(output, &allotment, kLayerDC, my_aux_out); + size_t stream_id = ModularStreamId::VarDCTDC(group_index).ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. + } else { + Bundle::Write(modular_frame_encoder->stream_headers[stream_id], output, kLayerDC, aux_out); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, output, kLayerDC, my_aux_out); + } + } + + size_t stream_id = ModularStreamId::ModularDC(group_index).ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. 
+ } else { + Bundle::Write(modular_frame_encoder->stream_headers[stream_id], output, kLayerModularDcGroup, aux_out); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, output, kLayerModularDcGroup, my_aux_out); + } + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + const Rect& rect = lossy_frame_encoder.State()->shared.DCGroupRect(group_index); + size_t nb_bits = CeilLog2Nonzero(rect.xsize() * rect.ysize()); + if (nb_bits != 0) { + BitWriter::Allotment allotment(output, nb_bits); + output->Write(nb_bits, modular_frame_encoder->ac_metadata_size[group_index] - 1); + ReclaimAndCharge(output, &allotment, kLayerControlFields, my_aux_out); + } + size_t stream_id = ModularStreamId::ACMetadata(group_index).ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. + } else { + Bundle::Write(modular_frame_encoder->stream_headers[stream_id], output, kLayerControlFields, aux_out); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, output, kLayerControlFields, my_aux_out); + } + } + }; + + //============================= ANSWriteTokens AC Info============= + for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses(); i++) { + uint16_t used_orders = enc_state_->used_orders[i]; + if (used_orders != 0) { + if (!is_small_image) { + acInfo_writer->update_part(19); + } else { + acInfo_writer->update_part(90); + } + WriteTokens(coefOrders_tokens[0], coefOrders_codes, coefOrders_context_map, acInfo_writer, kLayerOrder, + aux_out); + } + } + + //========================================== + if (!is_small_image) { + acInfo_writer->update_part(29); + } else { + acInfo_writer->update_part(109); + } + //=============== + + //========================Encode AC Group============= + for (int group_index = 0; group_index < num_groups; 
group_index++) { + for (size_t i = 0; i < num_passes; i++) { + BitWriter* tmp = + get_output(AcGroupIndex(i, group_index, frame_dim.num_groups, frame_dim.num_dc_groups, has_ac_global), + group_codes, is_small_image); + acGroupWriters.emplace_back(tmp); + } + } + + int sum = 0; + for (int group_index = 0; group_index < num_groups; group_index++) { + AuxOut* my_aux_out = aux_out ? &aux_outs[0] : nullptr; + for (size_t i = 0; i < num_passes; i++) { + BitWriter* acGroupWriter = acGroupWriters[group_index * num_passes + i]; + if (frame_header->encoding == FrameEncoding::kVarDCT) { + // Select which histogram to use among those of the current pass. + const size_t num_histograms = enc_state_->shared.num_histograms; + // num_histograms is 0 only for lossless. + JXL_ASSERT(num_histograms == 0 || enc_state_->histogram_idx[group_index] < num_histograms); + size_t histo_selector_bits = CeilLog2Nonzero(num_histograms); + + if (histo_selector_bits != 0) { + BitWriter::Allotment allotment(acGroupWriter, histo_selector_bits); + acGroupWriter->Write(histo_selector_bits, enc_state_->histogram_idx[group_index]); + ReclaimAndCharge(acGroupWriter, &allotment, kLayerAC, aux_out); + } + sum = sum + enc_state_->passes[i].ac_tokens[group_index].size(); + WriteTokens(enc_state_->passes[i].ac_tokens[group_index], enc_state_->passes[i].codes, + enc_state_->passes[i].context_map, acGroupWriter, kLayerACTokens, aux_out); + } + + size_t stream_id = ModularStreamId::ModularAC(group_index, i).ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. 
+ } else { + Bundle::Write(modular_frame_encoder->stream_headers[stream_id], acGroupWriter, kLayerModularAcGroup, + aux_out); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, acGroupWriter, kLayerModularAcGroup, aux_out); + } + } + } + //===================== + + return true; +} + +Status acc_writeout(LossyFrameEncoder& lossy_frame_encoder, + const size_t num_groups, + PassesEncoderState* passes_enc_state, + std::unique_ptr& frame_header, + FrameDimensions frame_dim, + std::vector& group_codes, + BitWriter* writer, + BitWriter* group_codes_writer, + BitWriter* acInfo_writer, + std::vector& dc_group_writers, + std::vector& acGroupWriters, + AuxOut* aux_out, + const std::function& resize_aux_outs) { + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + + writer->AppendByteAligned(lossy_frame_encoder.State()->special_frames); + frame_header->UpdateFlag(lossy_frame_encoder.State()->shared.image_features.patches.HasAny(), + FrameHeader::kPatches); + frame_header->UpdateFlag(lossy_frame_encoder.State()->shared.image_features.splines.HasAny(), + FrameHeader::kSplines); + JXL_RETURN_IF_ERROR(WriteFrameHeader(*frame_header, writer, aux_out)); + + // Resizing aux_outs to 0 also Assimilates the array. + std::atomic num_errors{0}; + static_cast(resize_aux_outs(0)); + JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0); + + for (BitWriter& bw : group_codes) { + bw.ZeroPadToByte(); // end of group. 
+ } + + if (is_small_image) { + std::vector group_codes_seq{0, 1, 2, 3, 4, 10, 19, 20, 29, 30, 31, 32, 33, 34, 40, 50, 51, + 52, 53, 54, 60, 70, 80, 81, 82, 83, 84, 90, 100, 101, 102, 103, 104, 109}; + group_codes_writer->Finalize(group_codes_seq); + // group_codes_writer->Finalize(); + } else { + // std::cout << "===============Group Codes writer Final==================" + // << std::endl; + std::vector group_codes_seq{0, 1, 2, 3, 4, 10, 19, 20, 29, 30, 31, 32, 33, 34, 40, 50, 51, 52, 53, 54, 60}; + group_codes_writer->Finalize(group_codes_seq); + // group_codes_writer->Finalize(); + std::vector dc_group_seq{0}; + for (int group_index = 0; group_index < frame_dim.num_dc_groups; group_index++) { + dc_group_writers[group_index]->Finalize(dc_group_seq); + // dc_group_writers[group_index]->Finalize(); + } + // std::cout << "===============AC Info writer Final==================" + // << std::endl; + std::vector acInfo_seq{0, 1, 2, 3, 4, 10, 19, 20, 21, 22, 23, 24, 29}; + acInfo_writer->Finalize(acInfo_seq); + // acInfo_writer->Finalize(); + std::vector acGroup_seq{0}; + for (int group_index = 0; group_index < num_groups; group_index++) { + for (size_t i = 0; i < num_passes; i++) { + acGroupWriters[group_index * num_passes + i]->Finalize(acGroup_seq); + // acGroupWriters[group_index * num_passes + i]->Finalize(); + } + } + } + // std::cout << "===============Others writer Final==================" + // << std::endl; + BitWriter::Allotment allotmentGrpOffset(writer, MaxBits(group_codes.size())); + writer->Write(1, 0); // no permutation + std::vector write_seq{0}; + // writer->Finalize(write_seq); + writer->Finalize(); + // } + writer->ZeroPadToByte(); // before TOC entries + + for (size_t i = 0; i < group_codes.size(); i++) { + JXL_ASSERT(group_codes[i].BitsWritten() % kBitsPerByte == 0); + const size_t group_size = group_codes[i].BitsWritten() / kBitsPerByte; + JXL_RETURN_IF_ERROR(U32Coder::Write(kTocDist, group_size, writer)); + } + // writer->Finalize(write_seq); + 
writer->Finalize(); + writer->ZeroPadToByte(); // before first group + ReclaimAndCharge(writer, &allotmentGrpOffset, kLayerTOC, aux_out); + + writer->AppendByteAligned(group_codes); + writer->ZeroPadToByte(); // end of frame. + + return true; +} + +Status acc_phase3(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + BitWriter* writer, + const size_t num_groups, + AuxOut* aux_out, + ThreadPool* pool, + std::vector& aux_outs, + const ImageBundle& ib, + const std::function& resize_aux_outs) { + // std::cout << "===========acc_kernel3 start================" << std::endl; + std::vector > coefOrders_tokens(1); + + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + + // DC global info + DC groups + AC global info + AC groups * + // num_passes. + const bool has_ac_global = true; + std::vector group_codes( + NumTocEntries(frame_dim.num_groups, frame_dim.num_dc_groups, num_passes, has_ac_global)); + const size_t global_ac_index = frame_dim.num_dc_groups + 1; + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + + BitWriter* group_codes_writer = get_output(0, group_codes, is_small_image); + BitWriter* acInfo_writer = get_output(global_ac_index, group_codes, is_small_image); + + std::vector > bcm_tokens(1), bcm_mtf_tokens(1); + EntropyEncodingData bcm_codes; + std::vector bcm_dummy_context_map; + size_t ans_cost, mtf_cost; + + EntropyEncodingData modularFramTree_code; + std::vector modularFramTree_ctxmap; + + EntropyEncodingData coefOrders_codes; + std::vector coefOrders_context_map; + + std::vector dc_group_writers; + std::vector acGroupWriters; + struct timeval start_time, token_time, hist_time, ans_time; + gettimeofday(&start_time, 0); + // acc_predictAndtoken(lossy_frame_encoder, frame_header, coefOrders_tokens, + // 
pool); + + gettimeofday(&token_time, 0); + acc_histogram(xclbinPath, lossy_frame_encoder, modular_frame_encoder, passes_enc_state, frame_dim, frame_header, + cparams, coefOrders_tokens, group_codes_writer, acInfo_writer, ans_cost, mtf_cost, bcm_tokens, + bcm_mtf_tokens, bcm_codes, bcm_dummy_context_map, + + modularFramTree_code, modularFramTree_ctxmap, + + coefOrders_codes, coefOrders_context_map, + + aux_outs, aux_out); + gettimeofday(&hist_time, 0); + acc_ANS_tokens(lossy_frame_encoder, modular_frame_encoder, num_groups, passes_enc_state, frame_dim, frame_header, + coefOrders_tokens, group_codes, group_codes_writer, acInfo_writer, dc_group_writers, acGroupWriters, + ans_cost, mtf_cost, bcm_tokens, bcm_mtf_tokens, bcm_codes, bcm_dummy_context_map, + + modularFramTree_code, modularFramTree_ctxmap, + + coefOrders_codes, coefOrders_context_map, aux_outs, aux_out); + + acc_writeout(lossy_frame_encoder, num_groups, passes_enc_state, frame_header, frame_dim, group_codes, writer, + group_codes_writer, acInfo_writer, dc_group_writers, acGroupWriters, aux_out, resize_aux_outs); + gettimeofday(&ans_time, 0); + + return true; +} +} // namespace jxl + +#endif diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_host.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_host.cpp new file mode 100644 index 0000000000..c4c5a60e2c --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_host.cpp @@ -0,0 +1,308 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "acc_host.hpp" + +namespace jxl { +void FindBestDequantMatrices(const CompressParams& cparams, + const Image3F& opsin, + ModularFrameEncoder* modular_frame_encoder, + DequantMatrices* dequant_matrices) { + // TODO(veluca): quant matrices for no-gaborish. + // TODO(veluca): heuristics for in-bitstream quant tables. 
+ *dequant_matrices = DequantMatrices(); + if (cparams.max_error_mode) { + // Set numerators of all quantization matrices to constant values. + float weights[3][1] = { + {1.0f / cparams.max_error[0]}, {1.0f / cparams.max_error[1]}, {1.0f / cparams.max_error[2]}}; + DctQuantWeightParams dct_params(weights); + std::vector encodings(DequantMatrices::kNum, QuantEncoding::DCT(dct_params)); + DequantMatricesSetCustom(dequant_matrices, encodings, modular_frame_encoder); + float dc_weights[3] = {1.0f / cparams.max_error[0], 1.0f / cparams.max_error[1], 1.0f / cparams.max_error[2]}; + DequantMatricesSetCustomDC(dequant_matrices, dc_weights); + } +} + +bool DefaultEncoderHeuristics::HandlesColorConversion(const CompressParams& cparams, const ImageBundle& ib) { + return cparams.noise != Override::kOn && cparams.patches != Override::kOn && + cparams.speed_tier >= SpeedTier::kWombat && cparams.resampling == 1 && + cparams.color_transform == ColorTransform::kXYB && !cparams.modular_mode && !ib.HasAlpha(); +} + +Status acc_host(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + const ImageBundle* JXL_RESTRICT ib_or_linear, + ThreadPool* pool, + std::unique_ptr& modular_frame_encoder, + BitWriter* writer, + AuxOut* aux_out, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + CompressParams cparams, + const std::vector* extra_channels, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + const size_t num_groups, + const ImageBundle& ib, + std::vector& aux_outs, + const std::function& resize_aux_outs) { + acc_phase1(opsin, lossy_frame_encoder, cparams, frame_header, frame_info, ib_or_linear, ib, aux_out, pool); + + acc_phase2(xclbinPath, opsin, lossy_frame_encoder, modular_frame_encoder, cparams, frame_header, extra_channels, + ib_or_linear, ib, pool, aux_out); + + acc_phase3(xclbinPath, opsin, lossy_frame_encoder, modular_frame_encoder, cparams, frame_header, passes_enc_state, + frame_dim, writer, num_groups, 
aux_out, pool, aux_outs, ib, resize_aux_outs); + + return true; +} + +Status DefaultEncoderHeuristics::LossyFrameHeuristics(PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, + Image3F* opsin, + ThreadPool* pool, + AuxOut* aux_out) { + PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented"); + + CompressParams& cparams = enc_state->cparams; + PassesSharedState& shared = enc_state->shared; + + // Compute parameters for noise synthesis. + if (shared.frame_header.flags & FrameHeader::kNoise) { + PROFILER_ZONE("enc GetNoiseParam"); + if (cparams.photon_noise_iso > 0) { + shared.image_features.noise_params = + SimulatePhotonNoise(opsin->xsize(), opsin->ysize(), cparams.photon_noise_iso); + } else { + // Don't start at zero amplitude since adding noise is expensive -- it + // significantly slows down decoding, and this is unlikely to + // completely go away even with advanced optimizations. After the + // kNoiseModelingRampUpDistanceRange we have reached the full level, + // i.e. noise is no longer represented by the compressed image, so we + // can add full noise by the noise modeling itself. 
+ static const float kNoiseModelingRampUpDistanceRange = 0.6; + static const float kNoiseLevelAtStartOfRampUp = 0.25; + static const float kNoiseRampupStart = 1.0; + // TODO(user) test and properly select quality_coef with smooth + // filter + float quality_coef = 1.0f; + const float rampup = (cparams.butteraugli_distance - kNoiseRampupStart) / kNoiseModelingRampUpDistanceRange; + if (rampup < 1.0f) { + quality_coef = kNoiseLevelAtStartOfRampUp + (1.0f - kNoiseLevelAtStartOfRampUp) * rampup; + } + if (rampup < 0.0f) { + quality_coef = kNoiseRampupStart; + } + if (!GetNoiseParameter(*opsin, &shared.image_features.noise_params, quality_coef)) { + shared.frame_header.flags &= ~FrameHeader::kNoise; + } + } + } + if (enc_state->shared.frame_header.upsampling != 1 && !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. + DownsampleImage(opsin, cparams.resampling); + PadImageToBlockMultipleInPlace(opsin); + } + + const FrameDimensions& frame_dim = enc_state->shared.frame_dim; + size_t target_size = TargetSize(cparams, frame_dim); + size_t opsin_target_size = target_size; + if (cparams.target_size > 0 || cparams.target_bitrate > 0.0) { + cparams.target_size = opsin_target_size; + } else if (cparams.butteraugli_distance < 0) { + return JXL_FAILURE("Expected non-negative distance"); + } + +#ifndef XLNX_DISABLE_BLK_DICT + // Find and subtract splines. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + shared.image_features.splines = FindSplines(*opsin); + JXL_RETURN_IF_ERROR(shared.image_features.splines.SubtractFrom(opsin, shared.cmap)); + } + + // Find and subtract patches/dots. 
+ if (ApplyOverride(cparams.patches, cparams.speed_tier <= SpeedTier::kSquirrel)) { + FindBestPatchDictionary(*opsin, enc_state, pool, aux_out); + PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin); + } +#endif + + static const float kAcQuant = 0.79f; + const float quant_dc = InitialQuantDC(cparams.butteraugli_distance); + Quantizer& quantizer = enc_state->shared.quantizer; + // We don't know the quant field yet, but for computing the global scale + // assuming that it will be the same as for Falcon mode is good enough. + quantizer.ComputeGlobalScaleAndQuant(quant_dc, kAcQuant / cparams.butteraugli_distance, 0); + + // TODO(veluca): we can now run all the code from here to FindBestQuantizer + // (excluded) one rect at a time. Do that. + + // Dependency graph: + // + // input: either XYB or input image + // + // input image -> XYB [optional] + // XYB -> initial quant field + // XYB -> Gaborished XYB + // Gaborished XYB -> CfL1 + // initial quant field, Gaborished XYB, CfL1 -> ACS + // initial quant field, ACS, Gaborished XYB -> EPF control field + // initial quant field -> adjusted initial quant field + // adjusted initial quant field, ACS -> raw quant field + // raw quant field, ACS, Gaborished XYB -> CfL2 + // + // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field. + + ArControlFieldHeuristics ar_heuristics; + AcStrategyHeuristics acs_heuristics; + CfLHeuristics cfl_heuristics; + + if (!opsin->xsize()) { + JXL_ASSERT(HandlesColorConversion(cparams, *original_pixels)); + *opsin = Image3F(RoundUpToBlockDim(original_pixels->xsize()), RoundUpToBlockDim(original_pixels->ysize())); + opsin->ShrinkTo(original_pixels->xsize(), original_pixels->ysize()); + ToXYB(*original_pixels, pool, opsin, /*linear=*/nullptr); + PadImageToBlockMultipleInPlace(opsin); + } + + // Compute an initial estimate of the quantization field. + // Call InitialQuantField only in Hare mode or slower. 
Otherwise, rely + // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon + // mode. + if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) { + enc_state->initial_quant_field = ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + float q = cparams.uniform_quant > 0 ? cparams.uniform_quant : kAcQuant / cparams.butteraugli_distance; + FillImage(q, &enc_state->initial_quant_field); + } else { + // Call this here, as it relies on pre-gaborish values. + float butteraugli_distance_for_iqf = cparams.butteraugli_distance; + if (!shared.frame_header.loop_filter.gab) { + butteraugli_distance_for_iqf *= 0.73f; + } + enc_state->initial_quant_field = InitialQuantField(butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, + 1.0f, &enc_state->initial_quant_masking); + } + + // TODO(veluca): do something about animations. + + // Apply inverse-gaborish. + if (shared.frame_header.loop_filter.gab) { + GaborishInverse(opsin, 0.9908511000000001f, pool); + } + + cfl_heuristics.Init(*opsin); + acs_heuristics.Init(*opsin, enc_state); + ar_heuristics.PrepareForThreads(/*num_threads*/ 1); + cfl_heuristics.PrepareForThreads(/*num_threads*/ 1); + + // auto process_tile = [&](size_t tid, size_t thread) { + for (int tid = 0; tid < DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(enc_state->shared.frame_dim.ysize_blocks, kEncTileDimInBlocks); + tid++) { + size_t thread = 0; + size_t n_enc_tiles = DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, enc_state->shared.frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, enc_state->shared.frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + + // For speeds up to Wombat, we only compute 
the color correlation map + // once we know the transform type and the quantization map. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + // cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices, + // /*ac_strategy=*/nullptr, + // /*quantizer=*/nullptr, /*fast=*/false, thread, + // &enc_state->shared.cmap); + } + +// Choose block sizes. +// acs_heuristics.ProcessRect(r); + +// Choose amount of post-processing smoothing. +// TODO(veluca): should this go *after* AdjustQuantField? +#ifndef XLNX_DISABLE_ARC + ar_heuristics.RunRect(r, *opsin, enc_state, thread); +#else + ImageB* JXL_RESTRICT epf_sharpness = &enc_state->shared.epf_sharpness; + FillPlane(static_cast(4), epf_sharpness, r); +#endif + // Always set the initial quant field, so we can compute the CfL map with + // more accuracy. The initial quant field might change in slower modes, but + // adjusting the quant field with butteraugli when all the other encoding + // parameters are fixed is likely a more reliable choice anyway. + AdjustQuantField(enc_state->shared.ac_strategy, r, &enc_state->initial_quant_field); + quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r, &enc_state->shared.raw_quant_field); + +// Compute a non-default CfL map if we are at Hare speed, or slower. 
+#ifndef XLNX_DISABLE_2NDCMP + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeTile( + r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy, &enc_state->shared.quantizer, + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread, &enc_state->shared.cmap); + } +#endif + }; + /* RunOnPool(pool, 0, DivCeil(enc_state->shared.frame_dim.xsize_blocks, + kEncTileDimInBlocks) * + DivCeil(enc_state->shared.frame_dim.ysize_blocks, + kEncTileDimInBlocks), + [&](const size_t num_threads) { + ar_heuristics.PrepareForThreads(num_threads); + cfl_heuristics.PrepareForThreads(num_threads); + return true; + }, + process_tile, "Enc Heuristics");*/ + + acs_heuristics.Finalize(aux_out); + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat, &enc_state->shared.cmap); + } + + FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder, &enc_state->shared.matrices); + + // Refine quantization levels. + FindBestQuantizer(original_pixels, *opsin, enc_state, pool, aux_out); + + // Choose a context model that depends on the amount of quantization for AC. 
+ if (cparams.speed_tier < SpeedTier::kFalcon) { + FindBestBlockEntropyModel(*enc_state); + } + +#ifdef XLNX_DEBUG_CMAP + // Debug-only dump of the colour-correlation maps: the DC value followed by every + // map entry, one map row per output line. + std::cout << "=========================================" << std::endl; + std::cout << "ColorMap info: " << std::endl; + ImageSB* JXL_RESTRICT tmp_map = &enc_state->shared.cmap.ytox_map; + int32_t dc = enc_state->shared.cmap.GetYToXDC(); + std::cout << "Y to X dc: " << dc << std::endl; + for (int i = 0; i < tmp_map->ysize(); i++) { + int8_t* JXL_RESTRICT row_out = tmp_map->Row(i); + for (int j = 0; j < tmp_map->xsize(); j++) { + std::cout << (int)row_out[j] << " "; + } + std::cout << std::endl; + } + + // Walk ytob_map here so the entries belong to the same map as the GetYToBDC() + // value printed under the "Y to B" label. + tmp_map = &enc_state->shared.cmap.ytob_map; + dc = enc_state->shared.cmap.GetYToBDC(); + std::cout << "Y to B dc: " << dc << std::endl; + for (int i = 0; i < tmp_map->ysize(); i++) { + int8_t* JXL_RESTRICT row_out = tmp_map->Row(i); + for (int j = 0; j < tmp_map->xsize(); j++) { + std::cout << (int)row_out[j] << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif + + return true; +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase1.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase1.cpp new file mode 100644 index 0000000000..a37f251c20 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase1.cpp @@ -0,0 +1,276 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef HLS_KERNEL1_CPP +#define HLS_KERNEL1_CPP + +#include "acc_phase1.hpp" + +namespace jxl { +namespace { +// Invisible (alpha = 0) pixels tend to be a mess in optimized PNGs. +// Since they have no visual impact whatsoever, we can replace them with +// something that compresses better and reduces artifacts near the edges. This +// does some kind of smooth stuff that seems to work.
+// Replace invisible pixels with a weighted average of the pixel to the left, +// the pixel to the topright, and non-invisible neighbours. +// Produces downward-blurry smears, with in the upwards direction only a 1px +// edge duplication but not more. It would probably be better to smear in all +// directions. That requires an alpha-weighed convolution with a large enough +// kernel though, which might be overkill... +void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + float* JXL_RESTRICT row = image->PlaneRow(c, y); + const float* JXL_RESTRICT prow = (y > 0 ? image->PlaneRow(c, y - 1) : nullptr); + const float* JXL_RESTRICT nrow = (y + 1 < image->ysize() ? image->PlaneRow(c, y + 1) : nullptr); + const float* JXL_RESTRICT a = alpha.Row(y); + const float* JXL_RESTRICT pa = (y > 0 ? alpha.Row(y - 1) : nullptr); + const float* JXL_RESTRICT na = (y + 1 < image->ysize() ? alpha.Row(y + 1) : nullptr); + for (size_t x = 0; x < image->xsize(); ++x) { + if (a[x] == 0) { + if (lossless) { + row[x] = 0; + continue; + } + float d = 0.f; + row[x] = 0; + if (x > 0) { + row[x] += row[x - 1]; + d++; + if (a[x - 1] > 0.f) { + row[x] += row[x - 1]; + d++; + } + } + if (x + 1 < image->xsize()) { + if (y > 0) { + row[x] += prow[x + 1]; + d++; + } + if (a[x + 1] > 0.f) { + row[x] += 2.f * row[x + 1]; + d += 2.f; + } + if (y > 0 && pa[x + 1] > 0.f) { + row[x] += 2.f * prow[x + 1]; + d += 2.f; + } + if (y + 1 < image->ysize() && na[x + 1] > 0.f) { + row[x] += 2.f * nrow[x + 1]; + d += 2.f; + } + } + if (y > 0 && pa[x] > 0.f) { + row[x] += 2.f * prow[x]; + d += 2.f; + } + if (y + 1 < image->ysize() && na[x] > 0.f) { + row[x] += 2.f * nrow[x]; + d += 2.f; + } + if (d > 1.f) row[x] /= d; + } + } + } + } +} +} // namespace + +Status acc_phase1(Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const FrameInfo& 
frame_info, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + AuxOut* aux_out, + ThreadPool* pool) { + const ColorEncoding& c_linear = ColorEncoding::LinearSRGB(ib.IsGray()); + std::unique_ptr metadata_linear = jxl::make_unique(); + metadata_linear->xyb_encoded = (cparams.color_transform == ColorTransform::kXYB); + metadata_linear->color_encoding = c_linear; + ImageBundle linear_storage(metadata_linear.get()); + + // Allocating a large enough image avoids a copy when padding. + opsin = Image3F(RoundUpToBlockDim(ib.xsize()), RoundUpToBlockDim(ib.ysize())); + opsin.ShrinkTo(ib.xsize(), ib.ysize()); + + const bool want_linear = + frame_header->encoding == FrameEncoding::kVarDCT && cparams.speed_tier <= SpeedTier::kKitten; + ib_or_linear = &ib; + + if (frame_header->color_transform == ColorTransform::kXYB && frame_info.ib_needs_color_transform) { + // linear_storage would only be used by the Butteraugli loop (passing + // linear sRGB avoids a color conversion there). Otherwise, don't + // fill it to reduce memory usage. + ib_or_linear = ToXYB(ib, pool, &opsin, want_linear ? &linear_storage : nullptr); + } else { // RGB or YCbCr: don't do anything (forward YCbCr is not + // implemented, this is only used when the input is already in + // YCbCr) + // If encoding a special DC or reference frame, don't do anything: + // input is already in XYB. 
+ CopyImageTo(ib.color(), &opsin); + } + bool lossless = (frame_header->encoding == FrameEncoding::kModular && cparams.quality_pair.first == 100); + if (ib.HasAlpha() && !ib.AlphaIsPremultiplied() && !ApplyOverride(cparams.keep_invisible, lossless) && + cparams.ec_resampling == cparams.resampling) { + // simplify invisible pixels + SimplifyInvisible(&opsin, ib.alpha(), lossless); + if (want_linear) { + SimplifyInvisible(const_cast(&ib_or_linear->color()), ib.alpha(), lossless); + } + } + if (aux_out != nullptr) { + JXL_RETURN_IF_ERROR(aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin)); + } + if (frame_header->encoding == FrameEncoding::kVarDCT) { + PadImageToBlockMultipleInPlace(&opsin); + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + // std::vector& group_caches_ = + // lossy_frame_encoder.get_group_cashes(); + + JXL_ASSERT((opsin.xsize() % kBlockDim) == 0 && (opsin.ysize() % kBlockDim) == 0); + PassesSharedState& shared = enc_state_->shared; + + if (!enc_state_->cparams.max_error_mode) { + float x_qm_scale_steps[3] = {0.65f, 1.25f, 9.0f}; + shared.frame_header.x_qm_scale = 1; + for (float x_qm_scale_step : x_qm_scale_steps) { + if (enc_state_->cparams.butteraugli_distance > x_qm_scale_step) { + shared.frame_header.x_qm_scale++; + } + } + } + + Image3F* opsin_ = &opsin; + // CompressParams& cparams = enc_state->cparams; + // PassesSharedState& shared = enc_state->shared; + + // Compute parameters for noise synthesis. + if (shared.frame_header.flags & FrameHeader::kNoise) { + PROFILER_ZONE("enc GetNoiseParam"); + if (cparams.photon_noise_iso > 0) { + shared.image_features.noise_params = + SimulatePhotonNoise(opsin_->xsize(), opsin_->ysize(), cparams.photon_noise_iso); + } else { + // Don't start at zero amplitude since adding noise is expensive -- it + // significantly slows down decoding, and this is unlikely to + // completely go away even with advanced optimizations. 
After the + // kNoiseModelingRampUpDistanceRange we have reached the full level, + // i.e. noise is no longer represented by the compressed image, so we + // can add full noise by the noise modeling itself. + static const float kNoiseModelingRampUpDistanceRange = 0.6; + static const float kNoiseLevelAtStartOfRampUp = 0.25; + static const float kNoiseRampupStart = 1.0; + // TODO(user) test and properly select quality_coef with smooth + // filter + float quality_coef = 1.0f; + const float rampup = + (cparams.butteraugli_distance - kNoiseRampupStart) / kNoiseModelingRampUpDistanceRange; + if (rampup < 1.0f) { + quality_coef = kNoiseLevelAtStartOfRampUp + (1.0f - kNoiseLevelAtStartOfRampUp) * rampup; + } + if (rampup < 0.0f) { + quality_coef = kNoiseRampupStart; + } + if (!GetNoiseParameter(*opsin_, &shared.image_features.noise_params, quality_coef)) { + shared.frame_header.flags &= ~FrameHeader::kNoise; + } + } + } + if (enc_state_->shared.frame_header.upsampling != 1 && !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. + DownsampleImage(opsin_, cparams.resampling); + PadImageToBlockMultipleInPlace(opsin_); + } + + const FrameDimensions& frame_dim_ = enc_state_->shared.frame_dim; + size_t target_size = TargetSize(cparams, frame_dim_); + size_t opsin_target_size = target_size; + if (cparams.target_size > 0 || cparams.target_bitrate > 0.0) { + cparams.target_size = opsin_target_size; + } else if (cparams.butteraugli_distance < 0) { + return JXL_FAILURE("Expected non-negative distance"); + } + +#ifndef XLNX_DISABLE_BLK_DICT + // Find and subtract splines. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + shared.image_features.splines = FindSplines(*opsin_); + JXL_RETURN_IF_ERROR(shared.image_features.splines.SubtractFrom(opsin_, shared.cmap)); + } + + // Find and subtract patches/dots. 
+ if (ApplyOverride(cparams.patches, cparams.speed_tier <= SpeedTier::kSquirrel)) { + FindBestPatchDictionary(*opsin_, enc_state_, pool, aux_out); + PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin_); + } +#endif + + static const float kAcQuant = 0.79f; + const float quant_dc = InitialQuantDC(cparams.butteraugli_distance); + Quantizer& quantizer = enc_state_->shared.quantizer; + // We don't know the quant field yet, but for computing the global scale + // assuming that it will be the same as for Falcon mode is good enough. + quantizer.ComputeGlobalScaleAndQuant(quant_dc, kAcQuant / cparams.butteraugli_distance, 0); + + // TODO(veluca): we can now run all the code from here to FindBestQuantizer + // (excluded) one rect at a time. Do that. + + // Dependency graph: + // + // input: either XYB or input image + // + // input image -> XYB [optional] + // XYB -> initial quant field + // XYB -> Gaborished XYB + // Gaborished XYB -> CfL1 + // initial quant field, Gaborished XYB, CfL1 -> ACS + // initial quant field, ACS, Gaborished XYB -> EPF control field + // initial quant field -> adjusted initial quant field + // adjusted initial quant field, ACS -> raw quant field + // raw quant field, ACS, Gaborished XYB -> CfL2 + // + // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field. + + if (!opsin_->xsize()) { + JXL_ASSERT(enc_state_->heuristics->HandlesColorConversion(cparams, *ib_or_linear)); + *opsin_ = Image3F(RoundUpToBlockDim(ib_or_linear->xsize()), RoundUpToBlockDim(ib_or_linear->ysize())); + opsin_->ShrinkTo(ib_or_linear->xsize(), ib_or_linear->ysize()); + ToXYB(*ib_or_linear, pool, opsin_, /*linear=*/nullptr); + PadImageToBlockMultipleInPlace(opsin_); + } + + // Compute an initial estimate of the quantization field. + // Call InitialQuantField only in Hare mode or slower. Otherwise, rely + // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon + // mode. 
+ if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) { + enc_state_->initial_quant_field = ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + float q = cparams.uniform_quant > 0 ? cparams.uniform_quant : kAcQuant / cparams.butteraugli_distance; + FillImage(q, &enc_state_->initial_quant_field); + } else { + // Call this here, as it relies on pre-gaborish values. + float butteraugli_distance_for_iqf = cparams.butteraugli_distance; + if (!shared.frame_header.loop_filter.gab) { + butteraugli_distance_for_iqf *= 0.73f; + } + enc_state_->initial_quant_field = InitialQuantField(butteraugli_distance_for_iqf, *opsin_, shared.frame_dim, + pool, 1.0f, &enc_state_->initial_quant_masking); + } + + // TODO(veluca): do something about animations. + + // Apply inverse-gaborish. + if (shared.frame_header.loop_filter.gab) { + GaborishInverse(opsin_, 0.9908511000000001f, pool); + } + } + return true; +} +} // namespace jxl +#endif \ No newline at end of file diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase2.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase2.cpp new file mode 100644 index 0000000000..322d6e5003 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase2.cpp @@ -0,0 +1,587 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef HLS_KERNEL2_CPP +#define HLS_KERNEL2_CPP + +#include "acc_phase2.hpp" +#include "host_lossy_enc_compute.hpp" + +#include +#include +#include + +namespace jxl { + +void collect_dc(PassesEncoderState* enc_state, + Image3F* dc, + size_t xsize, + size_t ysize, + float* hls_dc8x8, + float* hls_dc16x16, + float* hls_dc32x32) { + for (int i = 0; i < enc_state->shared.frame_dim.num_groups; i++) { + const Rect block_group_rect = enc_state->shared.BlockGroupRect(i); + const size_t xsize_blocks = block_group_rect.xsize(); + const size_t ysize_blocks = block_group_rect.ysize(); + + const size_t dc_stride = static_cast(dc->PixelsPerRow()); + + { + size_t offset = 0; + + for (size_t by = 0; by < ysize_blocks; ++by) { + size_t ty = by / kColorTileDimInBlocks; + float* JXL_RESTRICT dc_rows[3] = { + block_group_rect.PlaneRow(dc, 0, by), block_group_rect.PlaneRow(dc, 1, by), + block_group_rect.PlaneRow(dc, 2, by), + }; + AcStrategyRow ac_strategy_row = enc_state->shared.ac_strategy.ConstRow(block_group_rect, by); + for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); tx++) { + for (size_t bx = tx * kColorTileDimInBlocks; + bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) { + const AcStrategy acs = ac_strategy_row[bx]; + if (!acs.IsFirstBlock()) continue; + + size_t xblocks = acs.covered_blocks_x(); + size_t yblocks = acs.covered_blocks_y(); + + size_t size = kDCTBlockSize * xblocks * yblocks; + + size_t tile_xsize = (xsize + 63) / 64 * 64; + size_t tile_ysize = (ysize + 63) / 64 * 64; + + size_t block_cnt8x8 = + (block_group_rect.y0() + by) * (tile_xsize / 8) + block_group_rect.x0() + bx; + size_t block_cnt16x16 = + (block_group_rect.y0() + by) / 2 * (tile_xsize / 16) + (block_group_rect.x0() + bx) / 2; + size_t block_cnt32x32 = + (block_group_rect.y0() + by) / 4 * (tile_xsize / 32) + (block_group_rect.x0() + bx) / 4; + + for (size_t c : {0, 1, 2}) { + float* coef_dc = dc_rows[c] + bx; + if (acs.RawStrategy() == 0) { + coef_dc[0] = 
hls_dc8x8[c * tile_xsize * tile_ysize + block_cnt8x8]; + } else if (acs.RawStrategy() == 4) { + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + coef_dc[i * dc_stride + j] = + hls_dc16x16[c * tile_xsize * tile_ysize + 4 * block_cnt16x16 + i * 2 + j]; + } + } + } else if (acs.RawStrategy() == 5) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + coef_dc[i * dc_stride + j] = + hls_dc32x32[c * tile_ysize * tile_xsize + 16 * block_cnt32x32 + i * 4 + j]; + } + } + } else { + std::cout << "unsupported DCFromLowFREQ" << std::endl; + } + } + offset += size; + } + } + } + } + } +} + +Status acc_phase2(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const std::vector* extra_channels, + const ImageBundle* JXL_RESTRICT ib_or_linear, + const ImageBundle& ib, + ThreadPool* pool, + AuxOut* aux_out) { + if (frame_header->encoding == FrameEncoding::kVarDCT) { + //===================================================================================================// + // kernel-2 CPU part, pre-processing + //===================================================================================================// + + // pointer define + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + PassesSharedState& shared = enc_state_->shared; + + // define sizes + uint32_t tile_xsize = (opsin.xsize() + 63) / 64 * 64; + uint32_t tile_ysize = (opsin.ysize() + 63) / 64 * 64; + uint32_t ysize64 = tile_ysize / 64; + uint32_t xsize64 = tile_xsize / 64; + int xsize_blocks = enc_state_->shared.frame_dim.xsize_blocks; + int ysize_blocks = enc_state_->shared.frame_dim.ysize_blocks; + int xnum_tile = (opsin.xsize() + 63) / 64; + int ynum_tile = (opsin.ysize() + 63) / 64; + unsigned xsize_8alg = (opsin.xsize() + 7) / 8 * 8; + unsigned ysize_8alg = (opsin.ysize() + 7) / 8 * 8; + int num_tile = xnum_tile * ynum_tile; + 
+ Image3F* opsin_ = &opsin; + Quantizer& quantizer = enc_state_->shared.quantizer; + enc_state_->shared.matrices = DequantMatrices(); + enc_state_->histogram_idx.resize(shared.frame_dim.num_groups); + enc_state_->x_qm_multiplier = std::pow(1.25f, shared.frame_header.x_qm_scale - 2.0f); + enc_state_->b_qm_multiplier = std::pow(1.25f, shared.frame_header.b_qm_scale - 2.0f); + + if (enc_state_->coeffs.size() < shared.frame_header.passes.num_passes) { + enc_state_->coeffs.reserve(shared.frame_header.passes.num_passes); + for (size_t i = enc_state_->coeffs.size(); i < shared.frame_header.passes.num_passes; i++) { + // Allocate enough coefficients for each group on every row. + enc_state_->coeffs.emplace_back( + make_unique >(kGroupDim * kGroupDim, shared.frame_dim.num_groups)); + } + } + + while (enc_state_->coeffs.size() > shared.frame_header.passes.num_passes) { + enc_state_->coeffs.pop_back(); + } + + Image3F dc(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + + AcStrategyHeuristics acs_heuristics; + CfLHeuristics cfl_heuristics; + cfl_heuristics.Init(*opsin_); + cfl_heuristics.PrepareForThreads(1); + acs_heuristics.Init(*opsin_, enc_state_); + + //========================================================================// + // host interface + //========================================================================// + int config[MAX_NUM_CONFIG]; + float config_fl[MAX_NUM_CONFIG]; + float* hls_opsin_1 = (float*)malloc(ALL_PIXEL * sizeof(float)); + float* hls_opsin_2 = (float*)malloc(ALL_PIXEL * sizeof(float)); + float* hls_opsin_3 = (float*)malloc(ALL_PIXEL * sizeof(float)); + float* hls_quant_field = (float*)malloc(BLOCK8_H * BLOCK8_W * sizeof(float)); + float* hls_masking_field = (float*)malloc(BLOCK8_H * BLOCK8_W * sizeof(float)); + float* aq_map_f = (float*)malloc(BLOCK8_H * BLOCK8_W * sizeof(float)); + int8_t* cmap_axi = (int8_t*)malloc(TILE_W * TILE_H * 2 * sizeof(int8_t)); + int* ac_coef_axiout = (int*)malloc(ALL_PIXEL * sizeof(int)); + 
// One strategy byte per 8x8 block, so size the buffer by the element type, not by a pointer. + uint8_t* strategy_all = (uint8_t*)malloc(sizeof(uint8_t) * BLOCK8_H * BLOCK8_W); + int* raw_quant_field_i = (int*)malloc(BLOCK8_H * BLOCK8_W * sizeof(int)); + uint32_t hls_order[MAX_ORDER]; + float* hls_dc8x8 = (float*)malloc(ALL_PIXEL * sizeof(float)); + float* hls_dc16x16 = (float*)malloc(ALL_PIXEL * sizeof(float)); + float* hls_dc32x32 = (float*)malloc(ALL_PIXEL * sizeof(float)); + + float* Image_reorder_dct8 = (float*)malloc(ALL_PIXEL * sizeof(float)); + float* Image_reorder_dct16 = (float*)malloc(ALL_PIXEL * sizeof(float)); + float* Image_reorder_dct32 = (float*)malloc(ALL_PIXEL * sizeof(float)); + + config[0] = opsin.ysize(); + config[1] = opsin.xsize(); + config[2] = acs_heuristics.config.masking_field_stride; + config[3] = acs_heuristics.config.quant_field_stride; + config_fl[0] = acs_heuristics.enc_state->cparams.butteraugli_distance; + config_fl[1] = acs_heuristics.config.cost1; + config_fl[2] = quantizer.InvGlobalScale(); + + // The staging planes are tile_xsize x tile_ysize (rounded up to 64), which can + // exceed the dimensions of opsin. Zero-fill every staged row and copy only the + // pixels opsin actually holds, so no read goes past the image rows. + const size_t valid_xsize = opsin.xsize() < tile_xsize ? opsin.xsize() : tile_xsize; + for (int c = 0; c < 3; c++) { + for (int y = 0; y < tile_ysize; y++) { + float* dst = &hls_opsin_1[c * tile_xsize * tile_ysize + y * tile_xsize]; + memset(dst, 0, tile_xsize * sizeof(float)); + if ((size_t)y < opsin.ysize()) { + memcpy(dst, opsin.ConstPlaneRow(c, y), valid_xsize * sizeof(float)); + } + } + } + + for (int c = 0; c < 3; c++) { + for (int y = 0; y < tile_ysize; y++) { + float* dst = &hls_opsin_2[c * tile_xsize * tile_ysize + y * tile_xsize]; + memset(dst, 0, tile_xsize * sizeof(float)); + if ((size_t)y < opsin.ysize()) { + memcpy(dst, opsin.ConstPlaneRow(c, y), valid_xsize * sizeof(float)); + } + } + } + + for (int c = 0; c < 3; c++) { + for (int y = 0; y < tile_ysize; y++) { + float* dst = &hls_opsin_3[c * tile_xsize * tile_ysize + y * tile_xsize]; + memset(dst, 0, tile_xsize * sizeof(float)); + if ((size_t)y < opsin.ysize()) { + memcpy(dst, opsin.ConstPlaneRow(c, y), valid_xsize * sizeof(float)); + } + } + } + + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + for (uint32_t y8 = 0; y8 < 8; y8++) { + for (uint32_t x8 = 0; x8 < 8; x8++) { + for (int c = 0; c < 3; c++) { + for (int m = 0; m < 8; m++) { + for (int n = 0; n < 8; n++) { + uint32_t c_tmp = 0; + if (c ==
0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + uint32_t addr = c_tmp * tile_xsize * tile_ysize + y64 * tile_xsize * 64 + x64 * 64 + + y8 * tile_xsize * 8 + x8 * 8 + m * tile_xsize + n; + + float reg = hls_opsin_1[addr]; + Image_reorder_dct8[n + 8 * m + 64 * c + 64 * 3 * x8 + 512 * 3 * y8 + + 4096 * 3 * x64 + 4096 * 3 * xsize64 * y64] = reg; + } + } + } + } + } + } + } + + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + for (uint32_t y16 = 0; y16 < 4; y16++) { + for (uint32_t x16 = 0; x16 < 4; x16++) { + for (uint32_t c = 0; c < 3; c++) { + for (uint32_t m = 0; m < 16; m++) { + for (uint32_t n = 0; n < 16; n++) { + uint32_t c_tmp = 0; + if (c == 0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + + uint32_t addr = c_tmp * tile_xsize * tile_ysize + y64 * tile_xsize * 64 + x64 * 64 + + y16 * tile_xsize * 16 + x16 * 16 + m * tile_xsize + n; + float reg = hls_opsin_2[addr]; + Image_reorder_dct16[4096 * 3 * xsize64 * y64 + 4096 * 3 * x64 + 1024 * 3 * y16 + + 256 * 3 * x16 + 256 * c + 16 * m + n] = reg; + } + } + } + } + } + } + } + + for (uint32_t y64 = 0; y64 < ysize64; y64++) { + for (uint32_t x64 = 0; x64 < xsize64; x64++) { + for (uint32_t y32 = 0; y32 < 2; y32++) { + for (uint32_t x32 = 0; x32 < 2; x32++) { + for (uint32_t c = 0; c < 3; c++) { + for (uint32_t m = 0; m < 32; m++) { + for (uint32_t n = 0; n < 32; n++) { + uint32_t c_tmp = 0; + if (c == 0) { + c_tmp = 1; + } else if (c == 1) { + c_tmp = 0; + } else { + c_tmp = 2; + } + + uint32_t addr = c_tmp * tile_xsize * tile_ysize + y64 * tile_xsize * 64 + x64 * 64 + + y32 * tile_xsize * 32 + x32 * 32 + m * tile_xsize + n; + float reg = hls_opsin_3[addr]; + Image_reorder_dct32[4096 * 3 * xsize64 * y64 + 4096 * 3 * x64 + 2048 * 3 * y32 + + 1024 * 3 * x32 + 1024 * c + 32 * m + n] = reg; + } + } + } + } + } + } + } + + // input: rqf + for (int y = 0; y < ysize_blocks; y++) { + float* aq_row = 
enc_state_->initial_quant_field.Row(y); + for (int x = 0; x < xsize_blocks; x++) { + aq_map_f[y * xsize_blocks + x] = aq_row[x]; + } + } + + // input: masking field + for (int i = 0; i < BLOCK8_H * BLOCK8_W; i++) { + hls_masking_field[i] = acs_heuristics.config.masking_field_row[i]; + } + + // input: quant_field + for (int i = 0; i < BLOCK8_H * BLOCK8_W; i++) { + hls_quant_field[i] = acs_heuristics.config.quant_field_row[i]; + } + + //================================================================// + // kernel-2 FPGA kernel part, pass HLS test + // hls_kernel2_top.cpp + //===============================================================// + hls_lossy_enc_compute_wrapper(xclbinPath, + // input + config, config_fl, Image_reorder_dct8, Image_reorder_dct16, Image_reorder_dct32, + hls_quant_field, hls_masking_field, aq_map_f, + // output + cmap_axi, ac_coef_axiout, strategy_all, raw_quant_field_i, hls_order, hls_dc8x8, + hls_dc16x16, hls_dc32x32); + + //==============================================================// + // kernel-2 CPU part, post-processing + //==============================================================// + // ac_coef host post-process + int* ac_coef = (int*)malloc(ALL_PIXEL * sizeof(int)); + { + bool visit[8][8]; + int i = 0, addr = 0; + for (int ty = 0; ty < ynum_tile; ty++) { + for (int tx = 0; tx < xnum_tile; tx++) { + for (int by = 0; by < 8; by++) { + for (int bx = 0; bx < 8; bx++) { + visit[by][bx] = false; + } + } + for (int by = 0; by < 8; by++) { + for (int bx = 0; bx < 8; bx++) { + if (!visit[by][bx] && (ty * 8 + by) < ysize_8alg / 8 && (tx * 8 + bx) < xsize_8alg / 8) { + int idx_acs = (ty * 8 + by) * xsize_8alg / 8 + tx * 8 + bx; + char strategy = strategy_all[idx_acs]; + int b = 0; + if (strategy == 4) { + b = 2; + } else if (strategy == 5) { + b = 4; + } else { + b = 1; + } + for (int iy = 0; iy < b; iy++) { + for (int ix = 0; ix < b; ix++) { + visit[by + iy][bx + ix] = true; + for (int j = 0; j < 64; j++) { + for (unsigned c = 0; c < 3; 
++c) { + if (c == 0 && j == 0) { + addr = ((ty * 8 + by + iy) * 64 * 3 * xsize_8alg / 8 + + (tx * 8 + bx + ix) * 64 * 3); + } + ac_coef[addr + j * 3 + c] = ac_coef_axiout[i]; + i++; + } + } + } + } + } + } + } + } + } + } + + // acs host post-processing + AcStrategyImage* acs_strategy = &acs_heuristics.enc_state->shared.ac_strategy; + for (size_t y = 0; y < ysize_blocks; ++y) { + for (size_t x = 0; x < xsize_blocks; ++x) { + int index = y * xsize_blocks + x; + int value = strategy_all[index]; + if (value == 4 && y % 2 == 0 && x % 2 == 0) { + acs_strategy->Set(x, y, static_cast(value)); + } else if (value == 5 && y % 4 == 0 && x % 4 == 0) { + acs_strategy->Set(x, y, static_cast(value)); + } else if (value < 4) { + acs_strategy->Set(x, y, static_cast(value)); + } + } + } + + // rqf host post-processing + ImageI* raw_quant_field = &enc_state_->shared.raw_quant_field; + for (int y = 0; y < ysize_blocks; y++) { + float* aq_row = enc_state_->initial_quant_field.Row(y); // quant_field.Row(y); + int* row_qi = raw_quant_field->Row(y); + for (int x = 0; x < xsize_blocks; x++) { + row_qi[x] = raw_quant_field_i[y * xsize_blocks + x]; + aq_row[x] = aq_map_f[y * xsize_blocks + x]; + } + } + + // epf init + ImageB* epf_sharpness = &enc_state_->shared.epf_sharpness; + for (int y = 0; y < enc_state_->shared.frame_dim.ysize_blocks; y++) { + uint8_t* row = epf_sharpness->Row(y); + for (int x = 0; x < enc_state_->shared.frame_dim.xsize_blocks; x++) { + row[x] = 4; + } + } + + // dc coeff post-processing + collect_dc(enc_state_, &dc, opsin.xsize(), opsin.ysize(), hls_dc8x8, hls_dc16x16, hls_dc32x32); + + // cmap host post-processing + const FrameDimensions frame_dim = enc_state_->shared.frame_dim; + ImageSB* map_x = &(enc_state_->shared.cmap).ytox_map; + ImageSB* map_b = &(enc_state_->shared.cmap).ytob_map; + + for (int tid = 0; tid < DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(frame_dim.ysize_blocks, kEncTileDimInBlocks); + tid++) { + size_t n_enc_tiles = 
DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks, "Invalid color tile dim"); + + size_t num_ac = 0; + + int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty); + int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty); + + row_out_x[tx] = cmap_axi[tid]; + row_out_b[tx] = cmap_axi[num_tile + tid]; + } + + // ac_coeff host post-processing + for (size_t group_index = 0; group_index < frame_dim.num_groups; group_index++) { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks, kGroupDimInBlocks, kGroupDimInBlocks, + frame_dim.xsize_blocks, frame_dim.ysize_blocks); + ACPtr rows[3]; + // ACType type = (*enc_state_->coeffs[0]).Type(); + for (size_t c = 0; c < 3; c++) { + rows[c] = (*enc_state_->coeffs[0]).PlaneRow(c, group_index, 0); + } + size_t ac_offset = 0; + for (size_t by = 0; by < rect.ysize(); ++by) { + AcStrategyRow acs_row = enc_state_->shared.ac_strategy.ConstRow(rect, by); + for (size_t bx = 0; bx < rect.xsize(); ++bx) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + size_t size = kDCTBlockSize << acs.log2_covered_blocks(); + size_t cxsize = acs.covered_blocks_x(); + size_t cysize = acs.covered_blocks_y(); + + int addr = 0; + for (int cy = 0; cy < cysize; cy++) { + for (int cx = 0; cx < cxsize; cx++) { + for (int i = 0; i < 64; i++) { + for (size_t c = 0; c < 3; ++c) { + int reorder[3] = {1, 0, 2}; + rows[c].ptr32[ac_offset + addr] = + ac_coef[(gy * 32 + by + cy) * 64 * 3 * xsize_8alg / 8 + + (gx * 32 + bx + cx) * 64 
* 3 + i * 3 + reorder[c]]; + } + addr++; + } + } + } + ac_offset += size; + } + } + } + + // hls_order host-post processing + enc_state_->used_orders.resize(enc_state_->progressive_splitter.GetNumPasses()); + coeff_order_t* JXL_RESTRICT order = &enc_state_->shared.coeff_orders[0 * enc_state_->shared.coeff_order_size]; + + const int32_t offset8x8 = 0; + const int32_t offset16x16 = 64; + + uint32_t hls_order_reg = hls_order[320 * 3]; + uint32_t mask_0 = 0x00000001; + uint32_t mask_2 = 0x00000004; + uint32_t all_used_orders_set[32]; + for (int i = 0; i < 32; i++) { + if (i == 0) { + all_used_orders_set[i] = hls_order_reg & mask_0; + } else if (i == 2) { + all_used_orders_set[i] = hls_order_reg & mask_2; + } else if (i == 1 || i == 3) { + all_used_orders_set[i] = 0; + } else { + all_used_orders_set[i] = 0; + } + } + + uint32_t computed = 0; + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y(); + if (all_used_orders_set[ord] == 0) { + for (size_t c = 0; c < 3; c++) { + size_t offset = CoeffOrderOffset(ord, c); + JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz); + SetDefaultOrder(AcStrategy::FromRawStrategy(o), &order[offset]); + } + } else { + for (size_t c = 0; c < 3; c++) { + int reorder[3] = {1, 0, 2}; + for (int i = 0; i < sz; i++) { + size_t offset = CoeffOrderOffset(ord, c); + coeff_order_t* JXL_RESTRICT cur_order = &order[offset]; + if (o == 0) { + cur_order[i] = hls_order[reorder[c] * 320 + offset8x8 + i]; + } else if (o == 4) { + cur_order[i] = hls_order[reorder[c] * 320 + offset16x16 + i]; + } + } + } + } + } + enc_state_->used_orders[0] = hls_order_reg; + + // Choose a context model that depends on the amount of quantization for AC. 
+ if (cparams.speed_tier < SpeedTier::kFalcon) { + FindBestBlockEntropyModel(*enc_state_); + } + + // resize ac_tokens vector + enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses()); + for (PassesEncoderState::PassData& pass : enc_state_->passes) { + pass.ac_tokens.resize(shared.frame_dim.num_groups); + } + + shared.num_histograms = 1; + *frame_header = shared.frame_header; + + // Modular VarDCTDC + for (int group_index = 0; group_index < shared.frame_dim.num_dc_groups; group_index++) { + modular_frame_encoder->AddVarDCTDC(dc, group_index, enc_state_->cparams.butteraugli_distance >= 2.0f && + enc_state_->cparams.speed_tier < SpeedTier::kFalcon, + enc_state_); + }; + + // Modular ACMetadata + for (int group_index = 0; group_index < shared.frame_dim.num_dc_groups; group_index++) { + modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/false, enc_state_); + }; + + // Modular encode + JXL_RETURN_IF_ERROR(modular_frame_encoder->ComputeEncodingData( + *frame_header, *ib.metadata(), &opsin, *extra_channels, lossy_frame_encoder.State(), pool, aux_out, + /* do_color=*/frame_header->encoding == FrameEncoding::kModular)); + + // free host mem + free(hls_opsin_1); + free(hls_opsin_2); + free(hls_opsin_3); + free(Image_reorder_dct8); + free(Image_reorder_dct16); + free(Image_reorder_dct32); + free(aq_map_f); + free(hls_masking_field); + free(hls_quant_field); + free(cmap_axi); + free(ac_coef_axiout); + free(strategy_all); + free(raw_quant_field_i); + free(hls_dc8x8); + free(hls_dc16x16); + free(hls_dc32x32); + free(ac_coef); + } + return true; +} +} // namespace jxl + +#endif diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase3.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase3.cpp new file mode 100644 index 0000000000..a938eccdd9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/host_acc_lossy_enc_compute/acc_phase3.cpp @@ -0,0 +1,243 @@ +// Copyright (c) the JPEG XL Project 
Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef HLS_KERNEL3_CPP
+#define HLS_KERNEL3_CPP
+
+#include "acc_phase3.hpp"
+
+namespace jxl {
+
+// Third host phase: tokenizes the AC coefficients of every group and pass,
+// then serializes the frame into `writer`.
+//
+// Steps, in order:
+//   1. Tokenize coefficients per group (TokenizeCoefficients).
+//   2. Append special frames and write the frame header.
+//   3. Encode each section (global DC info, one entry per DC group, global
+//      AC info, one entry per AC group and pass) into its own BitWriter in
+//      `group_codes`.
+//   4. If cparams.centerfirst is set, permute the AC group sections so that
+//      groups closer to the chosen center come first.
+//   5. Write the group offsets and append all sections to `writer`.
+//
+// `xclbinPath` and `opsin` are not referenced anywhere in this body.
+// `aux_outs` is indexed by thread id whenever `aux_out` is non-null;
+// `resize_aux_outs` is handed to RunOnPool as the init callback and is finally
+// called with 0.
+//
+// Returns true on success; failures of the JXL_RETURN_IF_ERROR-wrapped calls
+// are propagated, and any error counted inside the per-group workers makes
+// the function fail after all groups were processed.
+Status acc_phase3(std::string xclbinPath,
+                  Image3F& opsin,
+                  LossyFrameEncoder& lossy_frame_encoder,
+                  std::unique_ptr& modular_frame_encoder,
+                  CompressParams cparams,
+                  std::unique_ptr& frame_header,
+                  PassesEncoderState* passes_enc_state,
+                  FrameDimensions frame_dim,
+                  BitWriter* writer,
+                  const size_t num_groups,
+                  AuxOut* aux_out,
+                  ThreadPool* pool,
+                  std::vector& aux_outs,
+                  const ImageBundle& ib,
+                  const std::function& resize_aux_outs) {
+    std::vector& group_caches_ = lossy_frame_encoder.get_group_cashes();
+    PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State();
+    PassesSharedState& shared = enc_state_->shared;
+
+    // One cache entry per worker thread; indexed by `thread` below.
+    const auto tokenize_group_init = [&](const size_t num_threads) {
+        group_caches_.resize(num_threads);
+        return true;
+    };
+    const auto tokenize_group = [&](const int group_index, const int thread) {
+        // Tokenize coefficients.
+        const Rect rect = shared.BlockGroupRect(group_index);
+        for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); idx_pass++) {
+            // Only 32-bit coefficient storage is supported here (ptr32 is read below).
+            JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32);
+            const int32_t* JXL_RESTRICT ac_rows[3] = {
+                enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32,
+                enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32,
+                enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32,
+            };
+            // Ensure group cache is initialized.
+            group_caches_[thread].InitOnce();
+            TokenizeCoefficients(&shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, ac_rows,
+                                 shared.ac_strategy, frame_header->chroma_subsampling,
+                                 &group_caches_[thread].num_nzeroes,
+                                 &enc_state_->passes[idx_pass].ac_tokens[group_index], enc_state_->shared.quant_dc,
+                                 enc_state_->shared.raw_quant_field, enc_state_->shared.block_ctx_map);
+        }
+    };
+    // NOTE(review): the value returned by RunOnPool is discarded here and in the
+    // two calls further down -- confirm whether it should be checked.
+    RunOnPool(pool, 0, shared.frame_dim.num_groups, tokenize_group_init, tokenize_group, "TokenizeGroup");
+
+    writer->AppendByteAligned(lossy_frame_encoder.State()->special_frames);
+    // The patches/splines flags reflect what the heuristics actually found.
+    frame_header->UpdateFlag(lossy_frame_encoder.State()->shared.image_features.patches.HasAny(),
+                             FrameHeader::kPatches);
+    frame_header->UpdateFlag(lossy_frame_encoder.State()->shared.image_features.splines.HasAny(),
+                             FrameHeader::kSplines);
+    JXL_RETURN_IF_ERROR(WriteFrameHeader(*frame_header, writer, aux_out));
+
+    const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses();
+
+    // DC global info + DC groups + AC global info + AC groups *
+    // num_passes.
+    const bool has_ac_global = true;
+    std::vector group_codes(
+        NumTocEntries(frame_dim.num_groups, frame_dim.num_dc_groups, num_passes, has_ac_global));
+    const size_t global_ac_index = frame_dim.num_dc_groups + 1;
+    // With a single group and a single pass, every section is written into
+    // group_codes[0] regardless of the requested index.
+    const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1;
+    const auto get_output = [&](const size_t index) { return &group_codes[is_small_image ? 0 : index]; };
+    auto ac_group_code = [&](size_t pass, size_t group) {
+        return get_output(AcGroupIndex(pass, group, frame_dim.num_groups, frame_dim.num_dc_groups, has_ac_global));
+    };
+
+    // Section 0: optional image features, dequant tables and global DC/modular info.
+    if (frame_header->flags & FrameHeader::kPatches) {
+        PatchDictionaryEncoder::Encode(lossy_frame_encoder.State()->shared.image_features.patches, get_output(0),
+                                       kLayerDictionary, aux_out);
+    }
+
+    if (frame_header->flags & FrameHeader::kSplines) {
+        EncodeSplines(lossy_frame_encoder.State()->shared.image_features.splines, get_output(0), kLayerSplines,
+                      HistogramParams(), aux_out);
+    }
+
+    if (frame_header->flags & FrameHeader::kNoise) {
+        EncodeNoise(lossy_frame_encoder.State()->shared.image_features.noise_params, get_output(0), kLayerNoise,
+                    aux_out);
+    }
+
+    JXL_RETURN_IF_ERROR(DequantMatricesEncodeDC(&lossy_frame_encoder.State()->shared.matrices, get_output(0),
+                                                kLayerDequantTables, aux_out));
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+        JXL_RETURN_IF_ERROR(lossy_frame_encoder.EncodeGlobalDCInfo(*frame_header, get_output(0)));
+    }
+    JXL_RETURN_IF_ERROR(modular_frame_encoder->EncodeGlobalInfo(get_output(0), aux_out));
+    JXL_RETURN_IF_ERROR(
+        modular_frame_encoder->EncodeStream(get_output(0), aux_out, kLayerModularGlobal, ModularStreamId::Global()));
+
+    // DC group `group_index` is written to section `group_index + 1`.
+    const auto process_dc_group = [&](const int group_index, const int thread) {
+        AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+        BitWriter* output = get_output(group_index + 1);
+        if (frame_header->encoding == FrameEncoding::kVarDCT && !(frame_header->flags & FrameHeader::kUseDcFrame)) {
+            // 2-bit extra DC precision, followed by the VarDCT DC stream.
+            BitWriter::Allotment allotment(output, 2);
+            output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]);
+            ReclaimAndCharge(output, &allotment, kLayerDC, my_aux_out);
+            JXL_CHECK(modular_frame_encoder->EncodeStream(output, my_aux_out, kLayerDC,
+                                                          ModularStreamId::VarDCTDC(group_index)));
+        }
+        JXL_CHECK(modular_frame_encoder->EncodeStream(output, my_aux_out, kLayerModularDcGroup,
+                                                      ModularStreamId::ModularDC(group_index)));
+        if (frame_header->encoding == FrameEncoding::kVarDCT) {
+            const Rect& rect = lossy_frame_encoder.State()->shared.DCGroupRect(group_index);
+            // ac_metadata_size - 1 is written with just enough bits for the
+            // number of blocks in the DC group rectangle; nothing is written
+            // when that bit count is zero.
+            size_t nb_bits = CeilLog2Nonzero(rect.xsize() * rect.ysize());
+            if (nb_bits != 0) {
+                BitWriter::Allotment allotment(output, nb_bits);
+                output->Write(nb_bits, modular_frame_encoder->ac_metadata_size[group_index] - 1);
+                ReclaimAndCharge(output, &allotment, kLayerControlFields, my_aux_out);
+            }
+            JXL_CHECK(modular_frame_encoder->EncodeStream(output, my_aux_out, kLayerControlFields,
+                                                          ModularStreamId::ACMetadata(group_index)));
+        }
+    };
+    RunOnPool(pool, 0, frame_dim.num_dc_groups, resize_aux_outs, process_dc_group, "EncodeDCGroup");
+
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+        JXL_RETURN_IF_ERROR(
+            lossy_frame_encoder.EncodeGlobalACInfo(get_output(global_ac_index), modular_frame_encoder.get()));
+    }
+
+    // The per-group worker returns void, so failures are counted here and
+    // turned into an error once all groups have run.
+    std::atomic num_errors{0};
+    const auto process_group = [&](const int group_index, const int thread) {
+        AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+
+        for (size_t i = 0; i < num_passes; i++) {
+            if (frame_header->encoding == FrameEncoding::kVarDCT) {
+                if (!lossy_frame_encoder.EncodeACGroup(i, group_index, ac_group_code(i, group_index), my_aux_out)) {
+                    num_errors.fetch_add(1, std::memory_order_relaxed);
+                    return;
+                }
+            }
+            // Write all modular encoded data (color?, alpha, depth, extra channels)
+            if (!modular_frame_encoder->EncodeStream(ac_group_code(i, group_index), my_aux_out, kLayerModularAcGroup,
+                                                     ModularStreamId::ModularAC(group_index, i))) {
+                num_errors.fetch_add(1, std::memory_order_relaxed);
+                return;
+            }
+        }
+    };
+    RunOnPool(pool, 0, num_groups, resize_aux_outs, process_group, "EncodeGroupCoefficients");
+
+    // Resizing aux_outs to 0 also Assimilates the array.
+    static_cast(resize_aux_outs(0));
+    JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0);
+
+    for (BitWriter& bw : group_codes) {
+        bw.ZeroPadToByte(); // end of group.
+    }
+
+    // permutation_ptr stays null (no permutation is signalled) unless
+    // center-first ordering is requested and there is more than one section.
+    std::vector* permutation_ptr = nullptr;
+    std::vector permutation;
+    if (cparams.centerfirst && !(num_passes == 1 && num_groups == 1)) {
+        permutation_ptr = &permutation;
+        // Don't permute global DC/AC or DC.
+        permutation.resize(global_ac_index + 1);
+        std::iota(permutation.begin(), permutation.end(), 0);
+        std::vector ac_group_order(num_groups);
+        std::iota(ac_group_order.begin(), ac_group_order.end(), 0);
+        size_t group_dim = frame_dim.group_dim;
+
+        // The center of the image is either given by parameters or chosen
+        // to be the middle of the image by default if center_x, center_y resp.
+        // are not provided.
+
+        int64_t imag_cx;
+        if (cparams.center_x != static_cast(-1)) {
+            // A user-supplied center must lie inside the image.
+            JXL_RETURN_IF_ERROR(cparams.center_x < ib.xsize());
+            imag_cx = cparams.center_x;
+        } else {
+            imag_cx = ib.xsize() / 2;
+        }
+
+        int64_t imag_cy;
+        if (cparams.center_y != static_cast(-1)) {
+            JXL_RETURN_IF_ERROR(cparams.center_y < ib.ysize());
+            imag_cy = cparams.center_y;
+        } else {
+            imag_cy = ib.ysize() / 2;
+        }
+
+        // The center of the group containing the center of the image.
+        int64_t cx = (imag_cx / group_dim) * group_dim + group_dim / 2;
+        int64_t cy = (imag_cy / group_dim) * group_dim + group_dim / 2;
+        // This identifies in what area of the central group the center of the
+        // image lies in.
+        double direction = -std::atan2(imag_cy - cy, imag_cx - cx);
+        // This identifies the side of the central group the center of the image
+        // lies closest to. This can take values 0, 1, 2, 3 corresponding to left,
+        // bottom, right, top.
+        int64_t side = std::fmod((direction + 5 * kPi / 4), 2 * kPi) * 2 / kPi;
+        // Sort key for a group: (Chebyshev distance of its center from (cx, cy),
+        // angle), compared lexicographically as a std::pair.
+        auto get_distance_from_center = [&](size_t gid) {
+            Rect r = passes_enc_state->shared.GroupRect(gid);
+            int64_t gcx = r.x0() + group_dim / 2;
+            int64_t gcy = r.y0() + group_dim / 2;
+            int64_t dx = gcx - cx;
+            int64_t dy = gcy - cy;
+            // The angle is determined by taking atan2 and adding an appropriate
+            // starting point depending on the side we want to start on.
+            double angle = std::remainder(std::atan2(dy, dx) + kPi / 4 + side * (kPi / 2), 2 * kPi);
+            // Concentric squares in clockwise order.
+            return std::make_pair(std::max(std::abs(dx), std::abs(dy)), angle);
+        };
+        std::sort(ac_group_order.begin(), ac_group_order.end(), [&](coeff_order_t a, coeff_order_t b) {
+            return get_distance_from_center(a) < get_distance_from_center(b);
+        });
+        // inv_ac_group_order[g] is the position of group g in the sorted order.
+        std::vector inv_ac_group_order(ac_group_order.size(), 0);
+        for (size_t i = 0; i < ac_group_order.size(); i++) {
+            inv_ac_group_order[ac_group_order[i]] = i;
+        }
+        // The same group ordering is repeated for every pass, offset by the
+        // number of entries already present in the permutation.
+        for (size_t i = 0; i < num_passes; i++) {
+            size_t pass_start = permutation.size();
+            for (coeff_order_t v : inv_ac_group_order) {
+                permutation.push_back(pass_start + v);
+            }
+        }
+        // Physically reorder the encoded sections to match the permutation.
+        std::vector new_group_codes(group_codes.size());
+        for (size_t i = 0; i < permutation.size(); i++) {
+            new_group_codes[permutation[i]] = std::move(group_codes[i]);
+        }
+        group_codes = std::move(new_group_codes);
+    }
+
+    JXL_RETURN_IF_ERROR(WriteGroupOffsets(group_codes, permutation_ptr, writer, aux_out));
+    writer->AppendByteAligned(group_codes);
+    writer->ZeroPadToByte(); // end of frame.
+    return true;
+}
+} // namespace jxl
+
+#endif
diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_host.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_host.cpp
new file mode 100644
index 0000000000..c4c5a60e2c
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_host.cpp
@@ -0,0 +1,308 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "acc_host.hpp"
+
+namespace jxl {
+void FindBestDequantMatrices(const CompressParams& cparams,
+                             const Image3F& opsin,
+                             ModularFrameEncoder* modular_frame_encoder,
+                             DequantMatrices* dequant_matrices) {
+    // TODO(veluca): quant matrices for no-gaborish.
+    // TODO(veluca): heuristics for in-bitstream quant tables.
+ *dequant_matrices = DequantMatrices(); + if (cparams.max_error_mode) { + // Set numerators of all quantization matrices to constant values. + float weights[3][1] = { + {1.0f / cparams.max_error[0]}, {1.0f / cparams.max_error[1]}, {1.0f / cparams.max_error[2]}}; + DctQuantWeightParams dct_params(weights); + std::vector encodings(DequantMatrices::kNum, QuantEncoding::DCT(dct_params)); + DequantMatricesSetCustom(dequant_matrices, encodings, modular_frame_encoder); + float dc_weights[3] = {1.0f / cparams.max_error[0], 1.0f / cparams.max_error[1], 1.0f / cparams.max_error[2]}; + DequantMatricesSetCustomDC(dequant_matrices, dc_weights); + } +} + +bool DefaultEncoderHeuristics::HandlesColorConversion(const CompressParams& cparams, const ImageBundle& ib) { + return cparams.noise != Override::kOn && cparams.patches != Override::kOn && + cparams.speed_tier >= SpeedTier::kWombat && cparams.resampling == 1 && + cparams.color_transform == ColorTransform::kXYB && !cparams.modular_mode && !ib.HasAlpha(); +} + +Status acc_host(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + const ImageBundle* JXL_RESTRICT ib_or_linear, + ThreadPool* pool, + std::unique_ptr& modular_frame_encoder, + BitWriter* writer, + AuxOut* aux_out, + std::unique_ptr& frame_header, + const FrameInfo& frame_info, + CompressParams cparams, + const std::vector* extra_channels, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + const size_t num_groups, + const ImageBundle& ib, + std::vector& aux_outs, + const std::function& resize_aux_outs) { + acc_phase1(opsin, lossy_frame_encoder, cparams, frame_header, frame_info, ib_or_linear, ib, aux_out, pool); + + acc_phase2(xclbinPath, opsin, lossy_frame_encoder, modular_frame_encoder, cparams, frame_header, extra_channels, + ib_or_linear, ib, pool, aux_out); + + acc_phase3(xclbinPath, opsin, lossy_frame_encoder, modular_frame_encoder, cparams, frame_header, passes_enc_state, + frame_dim, writer, num_groups, 
aux_out, pool, aux_outs, ib, resize_aux_outs); + + return true; +} + +Status DefaultEncoderHeuristics::LossyFrameHeuristics(PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, + Image3F* opsin, + ThreadPool* pool, + AuxOut* aux_out) { + PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented"); + + CompressParams& cparams = enc_state->cparams; + PassesSharedState& shared = enc_state->shared; + + // Compute parameters for noise synthesis. + if (shared.frame_header.flags & FrameHeader::kNoise) { + PROFILER_ZONE("enc GetNoiseParam"); + if (cparams.photon_noise_iso > 0) { + shared.image_features.noise_params = + SimulatePhotonNoise(opsin->xsize(), opsin->ysize(), cparams.photon_noise_iso); + } else { + // Don't start at zero amplitude since adding noise is expensive -- it + // significantly slows down decoding, and this is unlikely to + // completely go away even with advanced optimizations. After the + // kNoiseModelingRampUpDistanceRange we have reached the full level, + // i.e. noise is no longer represented by the compressed image, so we + // can add full noise by the noise modeling itself. 
+ static const float kNoiseModelingRampUpDistanceRange = 0.6; + static const float kNoiseLevelAtStartOfRampUp = 0.25; + static const float kNoiseRampupStart = 1.0; + // TODO(user) test and properly select quality_coef with smooth + // filter + float quality_coef = 1.0f; + const float rampup = (cparams.butteraugli_distance - kNoiseRampupStart) / kNoiseModelingRampUpDistanceRange; + if (rampup < 1.0f) { + quality_coef = kNoiseLevelAtStartOfRampUp + (1.0f - kNoiseLevelAtStartOfRampUp) * rampup; + } + if (rampup < 0.0f) { + quality_coef = kNoiseRampupStart; + } + if (!GetNoiseParameter(*opsin, &shared.image_features.noise_params, quality_coef)) { + shared.frame_header.flags &= ~FrameHeader::kNoise; + } + } + } + if (enc_state->shared.frame_header.upsampling != 1 && !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. + DownsampleImage(opsin, cparams.resampling); + PadImageToBlockMultipleInPlace(opsin); + } + + const FrameDimensions& frame_dim = enc_state->shared.frame_dim; + size_t target_size = TargetSize(cparams, frame_dim); + size_t opsin_target_size = target_size; + if (cparams.target_size > 0 || cparams.target_bitrate > 0.0) { + cparams.target_size = opsin_target_size; + } else if (cparams.butteraugli_distance < 0) { + return JXL_FAILURE("Expected non-negative distance"); + } + +#ifndef XLNX_DISABLE_BLK_DICT + // Find and subtract splines. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + shared.image_features.splines = FindSplines(*opsin); + JXL_RETURN_IF_ERROR(shared.image_features.splines.SubtractFrom(opsin, shared.cmap)); + } + + // Find and subtract patches/dots. 
+ if (ApplyOverride(cparams.patches, cparams.speed_tier <= SpeedTier::kSquirrel)) { + FindBestPatchDictionary(*opsin, enc_state, pool, aux_out); + PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin); + } +#endif + + static const float kAcQuant = 0.79f; + const float quant_dc = InitialQuantDC(cparams.butteraugli_distance); + Quantizer& quantizer = enc_state->shared.quantizer; + // We don't know the quant field yet, but for computing the global scale + // assuming that it will be the same as for Falcon mode is good enough. + quantizer.ComputeGlobalScaleAndQuant(quant_dc, kAcQuant / cparams.butteraugli_distance, 0); + + // TODO(veluca): we can now run all the code from here to FindBestQuantizer + // (excluded) one rect at a time. Do that. + + // Dependency graph: + // + // input: either XYB or input image + // + // input image -> XYB [optional] + // XYB -> initial quant field + // XYB -> Gaborished XYB + // Gaborished XYB -> CfL1 + // initial quant field, Gaborished XYB, CfL1 -> ACS + // initial quant field, ACS, Gaborished XYB -> EPF control field + // initial quant field -> adjusted initial quant field + // adjusted initial quant field, ACS -> raw quant field + // raw quant field, ACS, Gaborished XYB -> CfL2 + // + // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field. + + ArControlFieldHeuristics ar_heuristics; + AcStrategyHeuristics acs_heuristics; + CfLHeuristics cfl_heuristics; + + if (!opsin->xsize()) { + JXL_ASSERT(HandlesColorConversion(cparams, *original_pixels)); + *opsin = Image3F(RoundUpToBlockDim(original_pixels->xsize()), RoundUpToBlockDim(original_pixels->ysize())); + opsin->ShrinkTo(original_pixels->xsize(), original_pixels->ysize()); + ToXYB(*original_pixels, pool, opsin, /*linear=*/nullptr); + PadImageToBlockMultipleInPlace(opsin); + } + + // Compute an initial estimate of the quantization field. + // Call InitialQuantField only in Hare mode or slower. 
Otherwise, rely + // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon + // mode. + if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) { + enc_state->initial_quant_field = ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + float q = cparams.uniform_quant > 0 ? cparams.uniform_quant : kAcQuant / cparams.butteraugli_distance; + FillImage(q, &enc_state->initial_quant_field); + } else { + // Call this here, as it relies on pre-gaborish values. + float butteraugli_distance_for_iqf = cparams.butteraugli_distance; + if (!shared.frame_header.loop_filter.gab) { + butteraugli_distance_for_iqf *= 0.73f; + } + enc_state->initial_quant_field = InitialQuantField(butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, + 1.0f, &enc_state->initial_quant_masking); + } + + // TODO(veluca): do something about animations. + + // Apply inverse-gaborish. + if (shared.frame_header.loop_filter.gab) { + GaborishInverse(opsin, 0.9908511000000001f, pool); + } + + cfl_heuristics.Init(*opsin); + acs_heuristics.Init(*opsin, enc_state); + ar_heuristics.PrepareForThreads(/*num_threads*/ 1); + cfl_heuristics.PrepareForThreads(/*num_threads*/ 1); + + // auto process_tile = [&](size_t tid, size_t thread) { + for (int tid = 0; tid < DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(enc_state->shared.frame_dim.ysize_blocks, kEncTileDimInBlocks); + tid++) { + size_t thread = 0; + size_t n_enc_tiles = DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, enc_state->shared.frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, enc_state->shared.frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + + // For speeds up to Wombat, we only compute 
the color correlation map + // once we know the transform type and the quantization map. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + // cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices, + // /*ac_strategy=*/nullptr, + // /*quantizer=*/nullptr, /*fast=*/false, thread, + // &enc_state->shared.cmap); + } + +// Choose block sizes. +// acs_heuristics.ProcessRect(r); + +// Choose amount of post-processing smoothing. +// TODO(veluca): should this go *after* AdjustQuantField? +#ifndef XLNX_DISABLE_ARC + ar_heuristics.RunRect(r, *opsin, enc_state, thread); +#else + ImageB* JXL_RESTRICT epf_sharpness = &enc_state->shared.epf_sharpness; + FillPlane(static_cast(4), epf_sharpness, r); +#endif + // Always set the initial quant field, so we can compute the CfL map with + // more accuracy. The initial quant field might change in slower modes, but + // adjusting the quant field with butteraugli when all the other encoding + // parameters are fixed is likely a more reliable choice anyway. + AdjustQuantField(enc_state->shared.ac_strategy, r, &enc_state->initial_quant_field); + quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r, &enc_state->shared.raw_quant_field); + +// Compute a non-default CfL map if we are at Hare speed, or slower. 
+#ifndef XLNX_DISABLE_2NDCMP + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeTile( + r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy, &enc_state->shared.quantizer, + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread, &enc_state->shared.cmap); + } +#endif + }; + /* RunOnPool(pool, 0, DivCeil(enc_state->shared.frame_dim.xsize_blocks, + kEncTileDimInBlocks) * + DivCeil(enc_state->shared.frame_dim.ysize_blocks, + kEncTileDimInBlocks), + [&](const size_t num_threads) { + ar_heuristics.PrepareForThreads(num_threads); + cfl_heuristics.PrepareForThreads(num_threads); + return true; + }, + process_tile, "Enc Heuristics");*/ + + acs_heuristics.Finalize(aux_out); + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat, &enc_state->shared.cmap); + } + + FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder, &enc_state->shared.matrices); + + // Refine quantization levels. + FindBestQuantizer(original_pixels, *opsin, enc_state, pool, aux_out); + + // Choose a context model that depends on the amount of quantization for AC. 
+ if (cparams.speed_tier < SpeedTier::kFalcon) { + FindBestBlockEntropyModel(*enc_state); + } + +#ifdef XLNX_DEBUG_CMAP + std::cout << "=========================================" << std::endl; + std::cout << "ColorMap info: " << std::endl; + ImageSB* JXL_RESTRICT tmp_map = &enc_state->shared.cmap.ytox_map; + int32_t dc = enc_state->shared.cmap.GetYToXDC(); + std::cout << "Y to X dc: " << dc << std::endl; + for (int i = 0; i < tmp_map->ysize(); i++) { + int8_t* JXL_RESTRICT row_out = tmp_map->Row(i); + for (int j = 0; j < tmp_map->xsize(); j++) { + std::cout << (int)row_out[j] << " "; + } + std::cout << std::endl; + } + + tmp_map = &enc_state->shared.cmap.ytox_map; + dc = enc_state->shared.cmap.GetYToBDC(); + std::cout << "Y to B dc: " << dc << std::endl; + for (int i = 0; i < tmp_map->ysize(); i++) { + int8_t* JXL_RESTRICT row_out = tmp_map->Row(i); + for (int j = 0; j < tmp_map->xsize(); j++) { + std::cout << (int)row_out[j] << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif + + return true; +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase1.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase1.cpp new file mode 100644 index 0000000000..a37f251c20 --- /dev/null +++ b/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase1.cpp @@ -0,0 +1,276 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef HLS_KERNEL1_CPP +#define HLS_KERNEL1_CPP + +#include "acc_phase1.hpp" + +namespace jxl { +namespace { +// Invisible (alpha = 0) pixels tend to be a mess in optimized PNGs. +// Since they have no visual impact whatsoever, we can replace them with +// something that compresses better and reduces artifacts near the edges. This +// does some kind of smooth stuff that seems to work. 
+// Replace invisible pixels with a weighted average of the pixel to the left, +// the pixel to the topright, and non-invisible neighbours. +// Produces downward-blurry smears, with in the upwards direction only a 1px +// edge duplication but not more. It would probably be better to smear in all +// directions. That requires an alpha-weighed convolution with a large enough +// kernel though, which might be overkill... +void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + float* JXL_RESTRICT row = image->PlaneRow(c, y); + const float* JXL_RESTRICT prow = (y > 0 ? image->PlaneRow(c, y - 1) : nullptr); + const float* JXL_RESTRICT nrow = (y + 1 < image->ysize() ? image->PlaneRow(c, y + 1) : nullptr); + const float* JXL_RESTRICT a = alpha.Row(y); + const float* JXL_RESTRICT pa = (y > 0 ? alpha.Row(y - 1) : nullptr); + const float* JXL_RESTRICT na = (y + 1 < image->ysize() ? alpha.Row(y + 1) : nullptr); + for (size_t x = 0; x < image->xsize(); ++x) { + if (a[x] == 0) { + if (lossless) { + row[x] = 0; + continue; + } + float d = 0.f; + row[x] = 0; + if (x > 0) { + row[x] += row[x - 1]; + d++; + if (a[x - 1] > 0.f) { + row[x] += row[x - 1]; + d++; + } + } + if (x + 1 < image->xsize()) { + if (y > 0) { + row[x] += prow[x + 1]; + d++; + } + if (a[x + 1] > 0.f) { + row[x] += 2.f * row[x + 1]; + d += 2.f; + } + if (y > 0 && pa[x + 1] > 0.f) { + row[x] += 2.f * prow[x + 1]; + d += 2.f; + } + if (y + 1 < image->ysize() && na[x + 1] > 0.f) { + row[x] += 2.f * nrow[x + 1]; + d += 2.f; + } + } + if (y > 0 && pa[x] > 0.f) { + row[x] += 2.f * prow[x]; + d += 2.f; + } + if (y + 1 < image->ysize() && na[x] > 0.f) { + row[x] += 2.f * nrow[x]; + d += 2.f; + } + if (d > 1.f) row[x] /= d; + } + } + } + } +} +} // namespace + +Status acc_phase1(Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + const FrameInfo& 
frame_info,
+                  const ImageBundle* JXL_RESTRICT ib_or_linear,
+                  const ImageBundle& ib,
+                  AuxOut* aux_out,
+                  ThreadPool* pool) {
+    // First host phase: fills `opsin` from `ib` (converting to XYB when the
+    // frame header asks for it), optionally simplifies invisible pixels and,
+    // for VarDCT frames, runs the preparatory heuristics (noise parameters,
+    // downsampling, splines/patches, global quantizer scale, initial quant
+    // field, inverse gaborish) on the encoder state owned by
+    // `lossy_frame_encoder`.
+    //
+    // `cparams` and `ib_or_linear` are received by value, so the assignments
+    // to them below only affect this function's own copies.
+    // Returns true on success, or the error of the first failing step.
+    const ColorEncoding& c_linear = ColorEncoding::LinearSRGB(ib.IsGray());
+    std::unique_ptr metadata_linear = jxl::make_unique();
+    metadata_linear->xyb_encoded = (cparams.color_transform == ColorTransform::kXYB);
+    metadata_linear->color_encoding = c_linear;
+    ImageBundle linear_storage(metadata_linear.get());
+
+    // Allocating a large enough image avoids a copy when padding.
+    opsin = Image3F(RoundUpToBlockDim(ib.xsize()), RoundUpToBlockDim(ib.ysize()));
+    opsin.ShrinkTo(ib.xsize(), ib.ysize());
+
+    const bool want_linear =
+        frame_header->encoding == FrameEncoding::kVarDCT && cparams.speed_tier <= SpeedTier::kKitten;
+    // The incoming pointer value is not used; it is overwritten here.
+    ib_or_linear = &ib;
+
+    if (frame_header->color_transform == ColorTransform::kXYB && frame_info.ib_needs_color_transform) {
+        // linear_storage would only be used by the Butteraugli loop (passing
+        // linear sRGB avoids a color conversion there). Otherwise, don't
+        // fill it to reduce memory usage.
+        // NOTE(review): the pointer returned here may refer to the local
+        // linear_storage -- confirm it is never needed after this function.
+        ib_or_linear = ToXYB(ib, pool, &opsin, want_linear ? &linear_storage : nullptr);
+    } else { // RGB or YCbCr: don't do anything (forward YCbCr is not
+             // implemented, this is only used when the input is already in
+             // YCbCr)
+        // If encoding a special DC or reference frame, don't do anything:
+        // input is already in XYB.
+        CopyImageTo(ib.color(), &opsin);
+    }
+    bool lossless = (frame_header->encoding == FrameEncoding::kModular && cparams.quality_pair.first == 100);
+    if (ib.HasAlpha() && !ib.AlphaIsPremultiplied() && !ApplyOverride(cparams.keep_invisible, lossless) &&
+        cparams.ec_resampling == cparams.resampling) {
+        // simplify invisible pixels
+        SimplifyInvisible(&opsin, ib.alpha(), lossless);
+        if (want_linear) {
+            // const_cast: the color image behind ib_or_linear is modified in place.
+            SimplifyInvisible(const_cast(&ib_or_linear->color()), ib.alpha(), lossless);
+        }
+    }
+    if (aux_out != nullptr) {
+        JXL_RETURN_IF_ERROR(aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin));
+    }
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+        PadImageToBlockMultipleInPlace(&opsin);
+        PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State();
+        // std::vector& group_caches_ =
+        // lossy_frame_encoder.get_group_cashes();
+
+        JXL_ASSERT((opsin.xsize() % kBlockDim) == 0 && (opsin.ysize() % kBlockDim) == 0);
+        PassesSharedState& shared = enc_state_->shared;
+
+        if (!enc_state_->cparams.max_error_mode) {
+            // x_qm_scale starts at 1 and grows by one for every threshold the
+            // butteraugli distance exceeds.
+            float x_qm_scale_steps[3] = {0.65f, 1.25f, 9.0f};
+            shared.frame_header.x_qm_scale = 1;
+            for (float x_qm_scale_step : x_qm_scale_steps) {
+                if (enc_state_->cparams.butteraugli_distance > x_qm_scale_step) {
+                    shared.frame_header.x_qm_scale++;
+                }
+            }
+        }
+
+        // From here on the code mirrors the first part of
+        // DefaultEncoderHeuristics::LossyFrameHeuristics, using a pointer alias
+        // for opsin.
+        Image3F* opsin_ = &opsin;
+        // CompressParams& cparams = enc_state->cparams;
+        // PassesSharedState& shared = enc_state->shared;
+
+        // Compute parameters for noise synthesis.
+        if (shared.frame_header.flags & FrameHeader::kNoise) {
+            PROFILER_ZONE("enc GetNoiseParam");
+            if (cparams.photon_noise_iso > 0) {
+                shared.image_features.noise_params =
+                    SimulatePhotonNoise(opsin_->xsize(), opsin_->ysize(), cparams.photon_noise_iso);
+            } else {
+                // Don't start at zero amplitude since adding noise is expensive -- it
+                // significantly slows down decoding, and this is unlikely to
+                // completely go away even with advanced optimizations. After the
+                // kNoiseModelingRampUpDistanceRange we have reached the full level,
+                // i.e. noise is no longer represented by the compressed image, so we
+                // can add full noise by the noise modeling itself.
+                static const float kNoiseModelingRampUpDistanceRange = 0.6;
+                static const float kNoiseLevelAtStartOfRampUp = 0.25;
+                static const float kNoiseRampupStart = 1.0;
+                // TODO(user) test and properly select quality_coef with smooth
+                // filter
+                float quality_coef = 1.0f;
+                const float rampup =
+                    (cparams.butteraugli_distance - kNoiseRampupStart) / kNoiseModelingRampUpDistanceRange;
+                if (rampup < 1.0f) {
+                    quality_coef = kNoiseLevelAtStartOfRampUp + (1.0f - kNoiseLevelAtStartOfRampUp) * rampup;
+                }
+                if (rampup < 0.0f) {
+                    quality_coef = kNoiseRampupStart;
+                }
+                // Drop the noise flag when no usable noise parameters were found.
+                if (!GetNoiseParameter(*opsin_, &shared.image_features.noise_params, quality_coef)) {
+                    shared.frame_header.flags &= ~FrameHeader::kNoise;
+                }
+            }
+        }
+        if (enc_state_->shared.frame_header.upsampling != 1 && !cparams.already_downsampled) {
+            // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling
+            // after noise, if necessary.
+            DownsampleImage(opsin_, cparams.resampling);
+            PadImageToBlockMultipleInPlace(opsin_);
+        }
+
+        const FrameDimensions& frame_dim_ = enc_state_->shared.frame_dim;
+        size_t target_size = TargetSize(cparams, frame_dim_);
+        size_t opsin_target_size = target_size;
+        if (cparams.target_size > 0 || cparams.target_bitrate > 0.0) {
+            // Writes to the by-value copy of cparams only.
+            cparams.target_size = opsin_target_size;
+        } else if (cparams.butteraugli_distance < 0) {
+            return JXL_FAILURE("Expected non-negative distance");
+        }
+
+#ifndef XLNX_DISABLE_BLK_DICT
+        // Find and subtract splines.
+        if (cparams.speed_tier <= SpeedTier::kSquirrel) {
+            shared.image_features.splines = FindSplines(*opsin_);
+            JXL_RETURN_IF_ERROR(shared.image_features.splines.SubtractFrom(opsin_, shared.cmap));
+        }
+
+        // Find and subtract patches/dots.
+        if (ApplyOverride(cparams.patches, cparams.speed_tier <= SpeedTier::kSquirrel)) {
+            FindBestPatchDictionary(*opsin_, enc_state_, pool, aux_out);
+            PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin_);
+        }
+#endif
+
+        static const float kAcQuant = 0.79f;
+        const float quant_dc = InitialQuantDC(cparams.butteraugli_distance);
+        Quantizer& quantizer = enc_state_->shared.quantizer;
+        // We don't know the quant field yet, but for computing the global scale
+        // assuming that it will be the same as for Falcon mode is good enough.
+        quantizer.ComputeGlobalScaleAndQuant(quant_dc, kAcQuant / cparams.butteraugli_distance, 0);
+
+        // TODO(veluca): we can now run all the code from here to FindBestQuantizer
+        // (excluded) one rect at a time. Do that.
+
+        // Dependency graph:
+        //
+        // input: either XYB or input image
+        //
+        // input image -> XYB [optional]
+        // XYB -> initial quant field
+        // XYB -> Gaborished XYB
+        // Gaborished XYB -> CfL1
+        // initial quant field, Gaborished XYB, CfL1 -> ACS
+        // initial quant field, ACS, Gaborished XYB -> EPF control field
+        // initial quant field -> adjusted initial quant field
+        // adjusted initial quant field, ACS -> raw quant field
+        // raw quant field, ACS, Gaborished XYB -> CfL2
+        //
+        // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field.
+
+        // NOTE(review): opsin was allocated from ib's dimensions at the top of
+        // this function, so this branch looks reachable only for an empty
+        // input image -- confirm.
+        if (!opsin_->xsize()) {
+            JXL_ASSERT(enc_state_->heuristics->HandlesColorConversion(cparams, *ib_or_linear));
+            *opsin_ = Image3F(RoundUpToBlockDim(ib_or_linear->xsize()), RoundUpToBlockDim(ib_or_linear->ysize()));
+            opsin_->ShrinkTo(ib_or_linear->xsize(), ib_or_linear->ysize());
+            ToXYB(*ib_or_linear, pool, opsin_, /*linear=*/nullptr);
+            PadImageToBlockMultipleInPlace(opsin_);
+        }
+
+        // Compute an initial estimate of the quantization field.
+        // Call InitialQuantField only in Hare mode or slower. Otherwise, rely
+        // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon
+        // mode.
+        if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) {
+            enc_state_->initial_quant_field = ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+            float q = cparams.uniform_quant > 0 ? cparams.uniform_quant : kAcQuant / cparams.butteraugli_distance;
+            FillImage(q, &enc_state_->initial_quant_field);
+        } else {
+            // Call this here, as it relies on pre-gaborish values.
+            float butteraugli_distance_for_iqf = cparams.butteraugli_distance;
+            if (!shared.frame_header.loop_filter.gab) {
+                butteraugli_distance_for_iqf *= 0.73f;
+            }
+            enc_state_->initial_quant_field = InitialQuantField(butteraugli_distance_for_iqf, *opsin_, shared.frame_dim,
+                                                                pool, 1.0f, &enc_state_->initial_quant_masking);
+        }
+
+        // TODO(veluca): do something about animations.
+
+        // Apply inverse-gaborish.
+        if (shared.frame_header.loop_filter.gab) {
+            GaborishInverse(opsin_, 0.9908511000000001f, pool);
+        }
+    }
+    return true;
+}
+} // namespace jxl
+#endif
\ No newline at end of file
diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase2.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase2.cpp
new file mode 100644
index 0000000000..545403e15e
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase2.cpp
@@ -0,0 +1,415 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef ACC_PHASE2_CPP
+#define ACC_PHASE2_CPP
+
+#include "acc_phase2.hpp"
+
+namespace jxl {
+
+Status acc_phase2(std::string xclbinPath, // never referenced in the body of this function
+                  Image3F& opsin, // image whose planes feed every block transform computed below
+                  LossyFrameEncoder& lossy_frame_encoder,
+                  std::unique_ptr& modular_frame_encoder,
+                  CompressParams cparams,
+                  std::unique_ptr& frame_header, // *frame_header is overwritten with shared.frame_header near the end
+                  const std::vector* extra_channels,
+                  const ImageBundle* JXL_RESTRICT ib_or_linear,
+                  const ImageBundle& ib,
+                  ThreadPool* pool,
+                  AuxOut* aux_out) {
+    if (frame_header->encoding == FrameEncoding::kVarDCT) { // all work happens in this branch; any other encoding just returns true
+        std::vector& group_caches_ = lossy_frame_encoder.get_group_cashes(); // not used again in this function
+        PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State();
+        PassesSharedState& shared = enc_state_->shared;
+        Image3F* opsin_ = &opsin; // alias: opsin and *opsin_ name the same image below
+        Quantizer& quantizer = enc_state_->shared.quantizer;
+
+        size_t tile_xsize = (opsin.xsize() + 63) / 64 * 64; // width rounded up to a multiple of 64
+        size_t tile_ysize = (opsin.ysize() + 63) / 64 * 64; // NOTE(review): the transform loops read opsin out to these rounded sizes -- confirm opsin is padded that far
+#ifdef XLNX_QC_DEBUG_DCT
+/*std::cout << std::endl
+          << "======================================== full origin pixel "
+             "=============================================="
+          << std::endl;
+for (int c = 0; c < 3; c++) {
+    if (c == 0) {
+        std::cout << std::setw(15) << 0 << " ";
+        for (int m = 0; m < tile_xsize; m++) {
+            std::cout << std::setw(15) << m << " ";
+        }
+        std::cout << std::endl << std::endl;
+
+        for (int y = 0; y < tile_ysize; y++) {
+            std::cout << std::setw(15) << y << " ";
+            const float* JXL_RESTRICT row_y = opsin.ConstPlaneRow(c, y);
+            for (int x = 0; x < tile_xsize; x++) {
+                std::cout << std::setw(15) << row_y[x] << " ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+    }
+}*/
+#endif
+
+        std::vector > dctIDT(3, std::vector(tile_xsize * tile_ysize)); // dct*: one coefficient buffer per channel for each transform type, filled by the loops below
+        std::vector > dct2x2(3, std::vector(tile_xsize * tile_ysize));
+        std::vector > dct4x4(3, std::vector(tile_xsize * tile_ysize));
+        std::vector > dct8x8(3, std::vector(tile_xsize * tile_ysize));
+        std::vector > dct16x16(3, std::vector(tile_xsize * tile_ysize));
+        std::vector > dct32x32(3, std::vector(tile_xsize
* tile_ysize)); + + std::vector > dcIDT(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc2x2(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc4x4(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc8x8(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc16x16(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + std::vector > dc32x32(3, std::vector((tile_xsize * tile_ysize + 63) / 64 * 64)); + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 8) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 8) { + float* mem = (float*)calloc(8UL * 8UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::IDENTITY); + size_t xs = acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 64; m++) { + dctIDT[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + m] = mem[m]; + } + dcIDT[c][y / 8 * tile_xsize / 8 + x / 8] = dc_mem[0]; + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 8) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 8) { + float* mem = (float*)calloc(8UL * 8UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = 
AcStrategy::FromRawStrategy(AcStrategy::Type::DCT2X2); + size_t xs = acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 64; m++) { + dct2x2[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + m] = mem[m]; + } + dc2x2[c][y / 8 * tile_xsize / 8 + x / 8] = dc_mem[0]; + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 8) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 8) { + float* mem = (float*)calloc(8UL * 8UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::DCT4X4); + size_t xs = acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 64; m++) { + dct4x4[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + m] = mem[m]; + } + dc4x4[c][y / 8 * tile_xsize / 8 + x / 8] = dc_mem[0]; + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 8) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 8) { + float* mem = (float*)calloc(8UL * 8UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::DCT); + size_t xs = 
acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 64; m++) { + dct8x8[c][64 * (y / 8 * (tile_xsize / 8) + x / 8) + m] = mem[m]; + } + dc8x8[c][y / 8 * (tile_xsize / 8) + x / 8] = dc_mem[0]; + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 16) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 16) { + float* mem = (float*)calloc(16UL * 16UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::DCT16X16); + size_t xs = acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 16 * 16; m++) { + dct16x16[c][16 * 16 * (y / 16 * (tile_xsize / 16) + x / 16) + m] = mem[m]; + } + for (int m = 0; m < 4; m++) { + dc16x16[c][4 * (y / 16 * (tile_xsize / 16) + x / 16) + m] = dc_mem[m]; + } + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < tile_ysize; y = y + 32) { + const float* JXL_RESTRICT row = opsin.ConstPlaneRow(c, y); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = 0; x < tile_xsize; x = x + 32) { + float* mem = (float*)calloc(32UL * 32UL, sizeof(float)); + float* dc_mem = + (float*)calloc(AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks, sizeof(float)); + float* scratch_space = (float*)calloc(2048UL, sizeof(float)); + AcStrategy acs = AcStrategy::FromRawStrategy(AcStrategy::Type::DCT32X32); + size_t xs 
= acs.covered_blocks_x(); + N_SCALAR::TransformFromPixels(acs.Strategy(), row + x, stride, mem, scratch_space); + N_SCALAR::DCFromLowestFrequencies(acs.Strategy(), mem, dc_mem, xs); + for (int m = 0; m < 32 * 32; m++) { + dct32x32[c][32 * 32 * (y / 32 * (tile_xsize / 32) + x / 32) + m] = mem[m]; + } + for (int m = 0; m < 16; m++) { + dc32x32[c][16 * (y / 32 * (tile_xsize / 32) + x / 32) + m] = dc_mem[m]; + } + free(mem); + free(dc_mem); + free(scratch_space); + } + } + } + +#ifdef XLNX_QC_DEBUG_DCT + std::cout << std::endl + << "======================================== full coef " + "==============================================" + << std::endl; + for (int c = 0; c < 3; c++) { + if (c == 1) { + std::cout << std::setw(15) << 0 << " "; + for (int m = 0; m < tile_xsize; m++) { + std::cout << std::setw(15) << m << " "; + } + std::cout << std::endl << std::endl; + for (int y = 0; y < tile_ysize; y++) { + std::cout << std::setw(15) << y << " "; + for (int x = 0; x < tile_xsize; x++) { + std::cout << std::setw(15) << dct8x8[c][y * tile_xsize + x] << " "; + } + std::cout << std::endl; + } + } + } +#endif + +#ifdef XLNX_QC_DEBUG_DC + std::cout << std::endl + << "======================================== full DC " + "==============================================" + << std::endl; + for (int c = 0; c < 3; c++) { + if (c == 1) { + std::cout << std::setw(15) << 0 << " "; + for (int m = 0; m < tile_xsize / 8; m++) { + std::cout << std::setw(15) << m << " "; + } + std::cout << std::endl << std::endl; + for (int y = 0; y < tile_ysize / 8; y++) { + std::cout << std::setw(15) << y << " "; + for (int x = 0; x < tile_xsize / 8; x++) { + std::cout << std::setw(15) << dc32x32[c][y * tile_xsize / 8 + x] << " "; + } + std::cout << std::endl; + } + } + } +#endif + + ArControlFieldHeuristics ar_heuristics; + AcStrategyHeuristics acs_heuristics; + CfLHeuristics cfl_heuristics; + + cfl_heuristics.Init(*opsin_); + acs_heuristics.Init(*opsin_, enc_state_); + 
ar_heuristics.PrepareForThreads(/*num_threads*/ 1); + cfl_heuristics.PrepareForThreads(/*num_threads*/ 1); + + // auto process_tile = [&](size_t tid, size_t thread) { + for (int tid = 0; tid < DivCeil(enc_state_->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(enc_state_->shared.frame_dim.ysize_blocks, kEncTileDimInBlocks); + tid++) { + size_t thread = 0; + size_t n_enc_tiles = DivCeil(enc_state_->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, enc_state_->shared.frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, enc_state_->shared.frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + + // For speeds up to Wombat, we only compute the color correlation map + // once we know the transform type and the quantization map. + if (cparams.speed_tier <= SpeedTier::kSquirrel) { + cfl_heuristics.ComputeTile(r, *opsin_, enc_state_->shared.matrices, + /*ac_strategy=*/nullptr, + /*quantizer=*/nullptr, /*fast=*/false, thread, &enc_state_->shared.cmap, + opsin.xsize(), opsin.ysize(), dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, + dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } + + // Choose block sizes. + acs_heuristics.ProcessRect(r, opsin.xsize(), opsin.ysize(), dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, + dct32x32, dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + +// Choose amount of post-processing smoothing. +// TODO(veluca): should this go *after* AdjustQuantField? +#ifndef XLNX_DISABLE_ARC + ar_heuristics.RunRect(r, *opsin_, enc_state_, thread); +#else + ImageB* JXL_RESTRICT epf_sharpness = &enc_state_->shared.epf_sharpness; + FillPlane(static_cast(4), epf_sharpness, r); +#endif + // Always set the initial quant field, so we can compute the CfL map + // with more accuracy. 
The initial quant field might change in slower + // modes, but adjusting the quant field with butteraugli when all the + // other encoding parameters are fixed is likely a more reliable choice + // anyway. + AdjustQuantField(enc_state_->shared.ac_strategy, r, &enc_state_->initial_quant_field); + quantizer.SetQuantFieldRect(enc_state_->initial_quant_field, r, &enc_state_->shared.raw_quant_field); + +// Compute a non-default CfL map if we are at Hare speed, or slower. +#ifndef XLNX_DISABLE_2NDCMP + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeTile(r, *opsin_, enc_state_->shared.matrices, &enc_state_->shared.ac_strategy, + &enc_state_->shared.quantizer, + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread, + &enc_state_->shared.cmap, dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, dct32x32, + dcIDT, dc2x2, dc4x4, dc8x8, dc16x16, dc32x32); + } +#endif + }; + /* RunOnPool(pool, 0, DivCeil(enc_state_->shared.frame_dim.xsize_blocks, + kEncTileDimInBlocks) * + DivCeil(enc_state_->shared.frame_dim.ysize_blocks, + kEncTileDimInBlocks), + [&](const size_t num_threads) { + ar_heuristics.PrepareForThreads(num_threads); + cfl_heuristics.PrepareForThreads(num_threads); + return true; + }, + process_tile, "Enc Heuristics");*/ + + acs_heuristics.Finalize(aux_out); + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeDC( + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, &enc_state_->shared.cmap); + } + + FindBestDequantMatrices(cparams, *opsin_, modular_frame_encoder.get(), &enc_state_->shared.matrices); + + // Refine quantization levels. + FindBestQuantizer(ib_or_linear, *opsin_, enc_state_, pool, aux_out); + + // Choose a context model that depends on the amount of quantization for + // AC. 
+        if (cparams.speed_tier < SpeedTier::kFalcon) {
+            FindBestBlockEntropyModel(*enc_state_);
+        }
+
+#ifdef XLNX_DEBUG_CMAP
+        std::cout << "=========================================" << std::endl;
+        std::cout << "ColorMap info: " << std::endl;
+        ImageSB* JXL_RESTRICT tmp_map = &enc_state_->shared.cmap.ytox_map; // first dump: the Y-to-X map and its DC
+        int32_t dc = enc_state_->shared.cmap.GetYToXDC();
+        std::cout << "Y to X dc: " << dc << std::endl;
+        for (int i = 0; i < tmp_map->ysize(); i++) {
+            int8_t* JXL_RESTRICT row_out = tmp_map->Row(i);
+            for (int j = 0; j < tmp_map->xsize(); j++) {
+                std::cout << (int)row_out[j] << " "; // cast so the int8_t prints as a number, not a character
+            }
+            std::cout << std::endl;
+        }
+
+        tmp_map = &enc_state_->shared.cmap.ytob_map; // second dump must read the Y-to-B map (it used to re-read ytox_map and print the X map twice)
+        dc = enc_state_->shared.cmap.GetYToBDC();
+        std::cout << "Y to B dc: " << dc << std::endl;
+        for (int i = 0; i < tmp_map->ysize(); i++) {
+            int8_t* JXL_RESTRICT row_out = tmp_map->Row(i);
+            for (int j = 0; j < tmp_map->xsize(); j++) {
+                std::cout << (int)row_out[j] << " ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+#endif
+
+        InitializePassesEncoder(opsin, pool, enc_state_, modular_frame_encoder.get(), aux_out, opsin.xsize(),
+                                opsin.ysize(), dctIDT, dct2x2, dct4x4, dct8x8, dct16x16, dct32x32, dcIDT, dc2x2, dc4x4,
+                                dc8x8, dc16x16, dc32x32);
+
+        enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+        for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+            pass.ac_tokens.resize(shared.frame_dim.num_groups); // one token list per group in every pass
+        }
+
+        lossy_frame_encoder.ComputeAllCoeffOrders(shared.frame_dim);
+        shared.num_histograms = 1;
+
+        *frame_header = shared.frame_header; // publish the header as updated in the shared state
+
+        // needs to happen *AFTER* VarDCT-ComputeEncodingData.
+        JXL_RETURN_IF_ERROR(modular_frame_encoder->ComputeEncodingData(
+            *frame_header, *ib.metadata(), &opsin, *extra_channels, lossy_frame_encoder.State(), pool, aux_out,
+            /* do_color=*/frame_header->encoding == FrameEncoding::kModular));
+    }
+    return true;
+}
+} // namespace jxl
+
+#endif
diff --git a/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase3.cpp b/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase3.cpp
new file mode 100644
index 0000000000..ee15e2154a
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/others/src/host_acc_tokInit_histogram/acc_phase3.cpp
@@ -0,0 +1,1572 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef ACC_PHASE3_CPP
+#define ACC_PHASE3_CPP
+
+#include "acc_phase3.hpp"
+
+#include
+
+#include "acc_cluster_histogram.hpp"
+#include "acc_store_encode_data.hpp"
+#include "lib/jxl/lehmer_code.h"
+#include "host_tokinit_histogram.hpp"
+
+void test(int* in, int* out);
+
+inline int tvdiff(struct timeval* tv0, struct timeval* tv1) { // elapsed time from *tv0 to *tv1 in microseconds
+    return (tv1->tv_sec - tv0->tv_sec) * 1000000 + (tv1->tv_usec - tv0->tv_usec); // NOTE(review): result is narrowed to int -- confirm callers only time short intervals
+}
+
+namespace jxl {
+namespace {
+size_t IndexOf(const std::vector& v, uint8_t value) { // index of the first element equal to value, or v.size() if none matches
+    size_t i = 0;
+    for (; i < v.size(); ++i) {
+        if (v[i] == value) return i;
+    }
+    return i; // reached only when the loop ran to the end, so i == v.size()
+}
+
+void MoveToFront(std::vector* v, size_t index) { // moves (*v)[index] to position 0, shifting the elements before it one slot right
+    uint8_t value = (*v)[index];
+    for (size_t i = index; i != 0; --i) {
+        (*v)[i] = (*v)[i - 1];
+    }
+    (*v)[0] = value;
+}
+
+std::vector MoveToFrontTransform(const std::vector& v) { // move-to-front transform: each symbol becomes its current position in the MTF list
+    if (v.empty()) return v; // empty input is returned unchanged
+    uint8_t max_value = *std::max_element(v.begin(), v.end());
+    std::vector mtf(max_value + 1);
+    for (size_t i = 0; i <= max_value; ++i) mtf[i] = i; // list starts as 0, 1, ..., max_value
+    std::vector result(v.size());
+    for (size_t i = 0; i < v.size(); ++i) {
+        size_t index = IndexOf(mtf, v[i]);
+        JXL_ASSERT(index < mtf.size()); // every symbol is <= max_value, so it must be present in mtf
+        result[i] = static_cast(index);
+        MoveToFront(&mtf, index); // update the list before handling the next symbol
+    }
+    return result;
+}
+} // namespace
+
+namespace {
+
+void acc_TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, // appends tokens describing order[0, size) to *tokens
+                             size_t skip, // codes before this index are never emitted
+                             size_t size,
+                             std::vector* tokens) {
+    std::vector lehmer(size);
+    std::vector temp(size + 1); // extra (size + 1)-entry buffer handed to ComputeLehmerCode
+    ComputeLehmerCode(order, temp.data(), size, lehmer.data());
+    size_t end = size;
+    while (end > skip && lehmer[end - 1] == 0) { // drop trailing zero codes, but never go below skip
+        --end;
+    }
+    tokens->emplace_back(CoeffOrderContext(size), end - skip); // first token: how many codes follow
+    uint32_t last = 0;
+    for (size_t i = skip; i < end; ++i) {
+        tokens->emplace_back(CoeffOrderContext(last), lehmer[i]); // context is derived from the previous code (0 for the first)
+        last = lehmer[i];
+    }
+}
+
+} // namespace
+
+namespace {
+void acc_EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, // tokenizes one coefficient order for strategy acs into *tokens
+                          AcStrategy acs,
+                          std::vector* tokens,
+                          coeff_order_t* order_zigzag) { // caller-provided output buffer; the first `size` entries are written
+    const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); // number of blocks covered by this strategy
+    const size_t size = kDCTBlockSize * llf;
+    const coeff_order_t* natural_coeff_order_lut = acs.NaturalCoeffOrderLut();
+    for (size_t i = 0; i < size; ++i) {
+        order_zigzag[i] = natural_coeff_order_lut[order[i]]; // remap each entry through the strategy's natural-order LUT
+    }
+    acc_TokenizePermutation(order_zigzag, llf, size, tokens); // the first llf entries are skipped when tokenizing
+}
+} // namespace
+
+Status acc_predictAndtoken(LossyFrameEncoder& lossy_frame_encoder,
+                           std::unique_ptr& frame_header,
+                           std::vector >& coefOrders_tokens, // only element [0] is appended to
+                           ThreadPool* pool) { // pool is not used in this function
+    std::vector& group_caches_ = lossy_frame_encoder.get_group_cashes();
+    PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State();
+    PassesSharedState& shared = enc_state_->shared;
+
+    //------------------------------------------------------------------------//
+    // Tokenize coefficients.
+    //------------------------------------------------------------------------//
+    group_caches_.resize(1); // multi-thread settings, would be further removed
+    int thread = 0; // single cache slot: all groups are processed sequentially here
+    for (int group_index = 0; group_index < shared.frame_dim.num_groups; group_index++) {
+        const Rect rect = shared.BlockGroupRect(group_index);
+        for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); idx_pass++) {
+            const int32_t* JXL_RESTRICT ac_rows[3] = { // one coefficient row pointer per channel for this group
+                enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32,
+                enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32,
+                enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32,
+            };
+
+            // Ensure group cache is initialized.
+            group_caches_[thread].InitOnce();
+            const coeff_order_t* acc_coeff_orders = &shared.coeff_orders[idx_pass * shared.coeff_order_size]; // coefficient orders of this pass
+            TokenizeCoefficients(acc_coeff_orders, rect, ac_rows, // pass the local instead of recomputing the same address
+                                 shared.ac_strategy, frame_header->chroma_subsampling,
+                                 &group_caches_[thread].num_nzeroes,
+                                 &enc_state_->passes[idx_pass].ac_tokens[group_index], enc_state_->shared.quant_dc,
+                                 enc_state_->shared.raw_quant_field, enc_state_->shared.block_ctx_map);
+        }
+    };
+
+    const coeff_order_t* JXL_RESTRICT order = &enc_state_->shared.coeff_orders[0 * enc_state_->shared.coeff_order_size]; // orders of pass 0 only
+    auto mem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); // buffer handed to acc_EncodeCoeffOrder as order_zigzag
+    uint16_t computed = 0; // bitmask of order indices already tokenized
+    uint16_t used_orders = enc_state_->used_orders[0];
+
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+        for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+            uint8_t ord = kStrategyOrder[o];
+            if (computed & (1 << ord)) continue; // each order index is handled at most once
+            computed |= 1 << ord;
+            if ((used_orders & (1 << ord)) == 0) continue; // skip orders not flagged in used_orders[0]
+            AcStrategy acs = AcStrategy::FromRawStrategy(o);
+            for (size_t c = 0; c < 3; c++) {
+                acc_EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, &coefOrders_tokens[0], mem.get());
+            }
+        }
+    }
+    return true;
+}
+
+BitWriter* get_output(const size_t index, std::vector& 
group_codes, bool is_small_image) { + return &group_codes[is_small_image ? 0 : index]; +} + +Status acc_histogram(LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + std::unique_ptr& frame_header, + CompressParams cparams, + std::vector >& coefOrders_tokens, + BitWriter* group_codes_writer, + BitWriter* acInfo_writer, + size_t& ans_cost, + size_t& mtf_cost, + std::vector >& bcm_tokens, + std::vector >& bcm_mtf_tokens, + EntropyEncodingData& bcm_codes, + std::vector& bcm_dummy_context_map, + + EntropyEncodingData& modularFramTree_code, + std::vector& modularFramTree_ctxmap, + + EntropyEncodingData& coefOrders_codes, + std::vector& coefOrders_context_map, + + std::vector& aux_outs, + AuxOut* aux_out, + std::string xclbinPath) { + std::vector& group_caches_ = lossy_frame_encoder.get_group_cashes(); + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + PassesSharedState& shared = enc_state_->shared; + + const coeff_order_t* JXL_RESTRICT order = &enc_state_->shared.coeff_orders[0 * enc_state_->shared.coeff_order_size]; + auto mem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + uint16_t computed = 0; + uint16_t used_orders = enc_state_->used_orders[0]; + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + if ((used_orders & (1 << ord)) == 0) continue; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + for (size_t c = 0; c < 3; c++) { + acc_EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, &coefOrders_tokens[0], mem.get()); + } + } + } + + HistogramParams params0; + params0.clustering = HistogramParams::ClusteringType::kFast; + params0.uint_method = HistogramParams::HybridUintMethod::kNone; + params0.lz77_method = HistogramParams::LZ77Method::kNone; + HistogramParams 
params1; + params1.clustering = HistogramParams::ClusteringType::kFast; + params1.uint_method = HistogramParams::HybridUintMethod::kNone; + params1.lz77_method = HistogramParams::LZ77Method::kNone; + HistogramParams params2; + params2.clustering = HistogramParams::ClusteringType::kFast; + params2.uint_method = HistogramParams::HybridUintMethod::kNone; + params2.lz77_method = HistogramParams::LZ77Method::kNone; + HistogramParams params3; + params3.clustering = HistogramParams::ClusteringType::kFast; + params3.uint_method = HistogramParams::HybridUintMethod::kNone; + params3.lz77_method = HistogramParams::LZ77Method::kNone; + HistogramParams params4(enc_state_->cparams.speed_tier, enc_state_->shared.block_ctx_map.NumACContexts()); + if (enc_state_->cparams.decoding_speed_tier >= 1) { + params4.max_histograms = 6; + } + + params4.clustering = HistogramParams::ClusteringType::kFast; + params4.uint_method = HistogramParams::HybridUintMethod::kNone; + params4.lz77_method = HistogramParams::LZ77Method::kNone; + std::vector context_map0; + std::vector context_map1; + std::vector context_map2; + std::vector context_map3; + std::vector context_map4; + std::vector context_map_c0; + std::vector context_map_c1; + std::vector context_map_c2; + std::vector context_map_c3; + std::vector context_map_c4; + std::vector > tokens0(1); + std::vector > tokens1(1); + std::vector > tokens2(1); + std::vector > tokens3(1); + std::vector > tokens4(1); + std::vector > tokens_c0(1); + std::vector > tokens_c1(1); + std::vector > tokens_c2(1); + std::vector > tokens_c3(1); + std::vector > tokens_c4(1); + EntropyEncodingData codes0; + EntropyEncodingData codes1; + EntropyEncodingData codes2; + EntropyEncodingData codes3; + EntropyEncodingData codes4; + EntropyEncodingData codes_c0; + EntropyEncodingData codes_c1; + EntropyEncodingData codes_c2; + EntropyEncodingData codes_c3; + EntropyEncodingData codes_c4; + std::vector clustered_histograms0; + std::vector clustered_histograms1; + std::vector 
clustered_histograms2; + std::vector clustered_histograms3; + std::vector clustered_histograms4; + std::vector clustered_histograms_c0; + std::vector clustered_histograms_c1; + std::vector clustered_histograms_c2; + std::vector clustered_histograms_c3; + std::vector clustered_histograms_c4; + BitWriter* writer0 = nullptr; + BitWriter* writer1 = nullptr; + BitWriter* writer2 = nullptr; + BitWriter* writer3 = nullptr; + BitWriter* writer4 = nullptr; + size_t layer0 = 0; + size_t layer1 = 0; + size_t layer2 = 0; + size_t layer3 = 0; + size_t layer4 = 0; + size_t num_contexts0 = 1; + size_t num_contexts1 = 1; + size_t num_contexts2 = 1; + size_t num_contexts3 = 1; + size_t num_contexts4 = 1; + bool do_once[5] = {0, 0, 0, 0, 0}; + char* do_inner = (char*)malloc(sizeof(char) * 8); + for (int i = 0; i < 5; i++) do_inner[i] = 0; + char* do_prefix_in = (char*)malloc(sizeof(char) * 8); + for (int i = 0; i < 5; i++) do_prefix_in[i] = 0; + char* do_prefix_out = (char*)malloc(sizeof(char) * 8); + for (int i = 0; i < 5; i++) do_prefix_out[i] = 0; + + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + + if (!is_small_image) { + group_codes_writer->init(200); + group_codes_writer->update_part(0); + } else { + group_codes_writer->init(200); + group_codes_writer->update_part(0); + } + + bool all_default = true; + const float* dc_quant = (lossy_frame_encoder.State()->shared.matrices).DCQuants(); + for (size_t c = 0; c < 3; c++) { + if (dc_quant[c] != kDCQuant[c]) { + all_default = false; + } + } + BitWriter::Allotment allotment(group_codes_writer, 1 + sizeof(float) * kBitsPerByte * 3); + group_codes_writer->Write(1, all_default); + if (!all_default) { + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR(F16Coder::Write(dc_quant[c] * 128.0f, group_codes_writer)); + } + } + ReclaimAndCharge(group_codes_writer, &allotment, kLayerDequantTables, aux_out); + + auto& dct = 
enc_state_->shared.block_ctx_map.dc_thresholds; + auto& qft = enc_state_->shared.block_ctx_map.qf_thresholds; + auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map; + if (frame_header->encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR(enc_state_->shared.quantizer.Encode(group_codes_writer, kLayerQuant, aux_out)); + //============Encode GlobalDCInfo: Block Context Map========= + if (dct[0].empty() && dct[1].empty() && dct[2].empty() && qft.empty() && ctx_map.size() == 21 && + std::equal(ctx_map.begin(), ctx_map.end(), jxl::kDefaultCtxMap)) { + group_codes_writer->Write(1, 1); // default + } else { + group_codes_writer->Write(1, 0); + for (int j : {0, 1, 2}) { + group_codes_writer->Write(4, dct[j].size()); + for (int i : dct[j]) { + JXL_CHECK(U32Coder::Write(kDCThresholdDist, PackSigned(i), group_codes_writer)); + } + } + group_codes_writer->Write(4, qft.size()); + for (uint32_t i : qft) { + JXL_CHECK(U32Coder::Write(kQFThresholdDist, i - 1, group_codes_writer)); + } + for (size_t i = 0; i < ctx_map.size(); i++) { + bcm_tokens[0].emplace_back(0, ctx_map[i]); + } + + { + std::vector context_map = ctx_map; + BitWriter* writer = group_codes_writer; + writer0 = group_codes_writer; + size_t num_histograms = enc_state_->shared.block_ctx_map.num_ctxs; + if (num_histograms == 1) { + // Simple code + writer->Write(1, 1); + // 0 bits per entry. 
+ writer->Write(2, 0); + } else { + std::vector > tokens(1); + for (size_t i = 0; i < context_map.size(); i++) { + tokens[0].emplace_back(0, context_map[i]); + } + + size_t entry_bits = CeilLog2Nonzero(num_histograms); + size_t simple_cost = entry_bits * context_map.size(); + if (entry_bits < 4) { + writer->Write(1, 1); + writer->Write(2, entry_bits); + for (size_t i = 0; i < context_map.size(); i++) { + writer->Write(entry_bits, context_map[i]); + } + } else { + writer->Write(1, 0); + writer->Write(1, 0); + EntropyEncodingData context_codes0; + std::vector > context_tokens0(1); + do_once[0] = true; + num_contexts0 = 1; + tokens0 = tokens; + codes0 = bcm_codes; + context_map0 = bcm_dummy_context_map; + // codes_c0 = context_codes0; + // writer0 = writer; + layer0 = 0; + + // BuildAndEncodeHistogramsNew0 + // ========================================================= + } + } + } + } + //============================= + //============Encode GlobalDCInfo: Color Correlation Map========= + if (!is_small_image) { + group_codes_writer->update_part(20); + } else { + group_codes_writer->update_part(20); + } + ColorCorrelationMapEncodeDC(&enc_state_->shared.cmap, group_codes_writer, kLayerDC, aux_out); + //============================= + } + + if (!is_small_image) { + group_codes_writer->update_part(30); + } else { + group_codes_writer->update_part(30); + } + + writer1 = group_codes_writer; + writer2 = group_codes_writer; + BitWriter::Allotment allotmentGlobalInfo(group_codes_writer, 1); + // If we are using brotli, or not using modular mode. 
+ if (modular_frame_encoder->tree_tokens.empty() || modular_frame_encoder->tree_tokens[0].empty()) { + group_codes_writer->Write(1, 0); + ReclaimAndCharge(group_codes_writer, &allotmentGlobalInfo, kLayerModularTree, aux_out); + } else { + group_codes_writer->Write(1, 1); + ReclaimAndCharge(group_codes_writer, &allotmentGlobalInfo, kLayerModularTree, aux_out); + // Write tree + if (cparams.speed_tier > SpeedTier::kKitten) { + params1.ans_histogram_strategy = HistogramParams::ANSHistogramStrategy::kApproximate; + params2.ans_histogram_strategy = HistogramParams::ANSHistogramStrategy::kApproximate; + } + + if (cparams.decoding_speed_tier >= 1) { + params1.max_histograms = 12; + params2.max_histograms = 12; + } + + EntropyEncodingData context_codes1; + std::vector > context_tokens1(1); + std::vector dummy_context_map1; + + do_once[1] = true; + num_contexts1 = kNumTreeContexts; + tokens1 = modular_frame_encoder->tree_tokens; + codes1 = modularFramTree_code; + context_map1 = modularFramTree_ctxmap; + ////codes_c0 = context_codes0; + ////writer0 = writer; + layer1 = kLayerModularTree; + + // BuildAndEncodeHistogramsNew1 + + if (!is_small_image) { + group_codes_writer->update_part(50); + } else { + group_codes_writer->update_part(50); + } + params2.image_widths = modular_frame_encoder->image_widths; + // Write histograms. 
+ EntropyEncodingData context_codes2; + std::vector > context_tokens2(1); + std::vector dummy_context_map2; + + do_once[2] = true; + num_contexts2 = (modular_frame_encoder->tree.size() + 1) / 2; + tokens2 = modular_frame_encoder->tokens; + codes2 = modular_frame_encoder->code; + context_map2 = modular_frame_encoder->context_map; + ////codes_c0 = context_codes0; + ////writer0 = writer; + layer2 = kLayerModularGlobal; + + // BuildAndEncodeHistogramsNew2 + } + + //============================= Encode Global ACInfo ============= + if (!is_small_image) { + acInfo_writer->init(200); + acInfo_writer->update_part(0); + } else { + acInfo_writer->update_part(80); + } + writer3 = acInfo_writer; + writer4 = acInfo_writer; + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + bool all_default = true; + const std::vector& encodings = (enc_state_->shared.matrices).encodings(); + + for (size_t i = 0; i < encodings.size(); i++) { + if (encodings[i].mode != QuantEncoding::kQuantModeLibrary || encodings[i].predefined != 0) { + all_default = false; + } + } + // TODO(janwas): better bound + BitWriter::Allotment allotment(acInfo_writer, 512 * 1024); + acInfo_writer->Write(1, all_default); + ReclaimAndCharge(acInfo_writer, &allotment, kLayerDequantTables, aux_out); + + size_t num_histo_bits = CeilLog2Nonzero(enc_state_->shared.frame_dim.num_groups); + if (num_histo_bits != 0) { + BitWriter::Allotment allotment(acInfo_writer, num_histo_bits); + acInfo_writer->Write(num_histo_bits, enc_state_->shared.num_histograms - 1); + ReclaimAndCharge(acInfo_writer, &allotment, kLayerAC, aux_out); + } + + //============= encode coef orders======== + // Encode coefficient orders. 
+ uint16_t used_orders = enc_state_->used_orders[0]; + size_t order_bits = 0; + JXL_RETURN_IF_ERROR(U32Coder::CanEncode(kOrderEnc, enc_state_->used_orders[0], &order_bits)); + BitWriter::Allotment allotmentCoef(acInfo_writer, order_bits); + JXL_CHECK(U32Coder::Write(kOrderEnc, enc_state_->used_orders[0], acInfo_writer)); + ReclaimAndCharge(acInfo_writer, &allotmentCoef, kLayerOrder, aux_out); + + // Do not write anything if no order is used. + EntropyEncodingData context_codes3; + std::vector > context_tokens3(1); + std::vector dummy_context_map3; + do_once[3] = true; + num_contexts3 = kPermutationContexts; + tokens3 = coefOrders_tokens; + codes3 = coefOrders_codes; + context_map3 = coefOrders_context_map; + ////codes_c0 = context_codes0; + ////writer0 = writer; + layer3 = kLayerOrder; + // BuildAndEncodeHistogramsNew3 + + if (!is_small_image) { + acInfo_writer->update_part(20); + } else { + acInfo_writer->update_part(100); + } + } + + std::vector > histograms_(5); + histograms_[0].resize(num_contexts0); + histograms_[1].resize(num_contexts1); + histograms_[2].resize(num_contexts2); + histograms_[3].resize(num_contexts3); + histograms_[4].resize(enc_state_->shared.num_histograms * enc_state_->shared.block_ctx_map.NumACContexts()); + + std::vector params(5); + std::vector num_contexts(5); + std::vector layer(5); + std::vector codes(5); + std::vector*> context_map(5); + std::vector codes_c(5); + std::vector writer(5); + writer[0] = writer0; + writer[1] = writer1; + writer[2] = writer2; + writer[3] = writer3; + writer[4] = writer4; + + std::vector > nonempty_histograms(5); + std::vector largest_idx(5); + + std::vector > clustered_histograms(5); + std::vector > histogram_symbols(5); + + std::vector > clustered_histogramsin(5); + std::vector > > tokensin(5, std::vector >(1)); + std::vector codesin(5); + std::vector > context_map_in(5); + + constexpr float kMinDistanceForDistinctFast = 64.0f; + constexpr float kMinDistanceForDistinctBest = 16.0f; + + if 
(frame_header->encoding == FrameEncoding::kVarDCT) { + do_once[4] = true; + } + + // Build histograms. + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + if (i == 0) { + params[0] = params0; + num_contexts[0] = num_contexts0; + layer[0] = layer0; + codes[0] = &codes0; + context_map[0] = &context_map0; + codes_c[0] = &codes_c0; + } else if (i == 1) { + params[1] = params1; + num_contexts[1] = num_contexts1; + layer[1] = layer1; + codes[1] = &codes1; + context_map[1] = &context_map1; + codes_c[1] = &codes_c1; + } else if (i == 2) { + params[2] = params2; + num_contexts[2] = num_contexts2; + layer[2] = layer2; + codes[2] = &codes2; + context_map[2] = &context_map2; + codes_c[2] = &codes_c2; + } else if (i == 3) { + params[3] = params3; + num_contexts[3] = num_contexts3; + layer[3] = layer3; + codes[3] = &codes3; + context_map[3] = &context_map3; + codes_c[3] = &codes_c3; + } else if (i == 4) { + params[4] = params4; + num_contexts[4] = num_contexts4; + layer[4] = kLayerAC; + codes[4] = &enc_state_->passes[0].codes; + context_map[4] = &enc_state_->passes[0].context_map; + codes_c[4] = &codes_c4; + } + } + + // lossy_frame ---- > mem + + int config[32]; + + // config init + int pixel_xsize = shared.frame_dim.xsize; + int pixel_ysize = shared.frame_dim.ysize; + int group_dim = shared.frame_dim.group_dim; + int xsize_blocks = shared.frame_dim.xsize_blocks; + int ysize_blocks = shared.frame_dim.ysize_blocks; + + config[0] = enc_state_->shared.quant_dc.bytes_per_row(); + config[1] = enc_state_->shared.raw_quant_field.bytes_per_row(); + config[4] = group_dim; + config[5] = pixel_xsize; + config[6] = pixel_ysize; + config[7] = enc_state_->shared.block_ctx_map.num_ctxs; + config[8] = enc_state_->shared.block_ctx_map.num_dc_ctxs; + config[9] = enc_state_->shared.block_ctx_map.qf_thresholds.size(); + config[10] = 32; // nzero_stride + config[11] = enc_state_->shared.block_ctx_map.ctx_map.size(); + config[12] = do_once[0]; + config[13] = do_once[1]; + config[14] = 
do_once[2]; + config[15] = do_once[3]; + config[16] = do_once[4]; + /* + uint32_t nempty_cnt_ptr[5]; + uint32_t largest_idx_ptr[5]; + */ + + // orders + uint32_t* coeff_orders_ddr = new uint32_t[MAX_ORDERS_SIZE]; + const coeff_order_t* JXL_RESTRICT orders = &shared.coeff_orders[0]; + for (int i = 0; i < MAX_ORDERS_SIZE; i++) { + coeff_orders_ddr[i] = orders[i]; + } + + // ac_coef + int32_t* ac_coeff_ordered_ddr = new int32_t[ALL_PIXEL]; + uint64_t group_offset_1 = 0; + for (int group_index = 0; group_index < shared.frame_dim.num_groups; group_index++) { + const Rect rect = shared.BlockGroupRect(group_index); + const int32_t* ac_rows[3] = { + enc_state_->coeffs[0]->PlaneRow(0, group_index, 0).ptr32, + enc_state_->coeffs[0]->PlaneRow(1, group_index, 0).ptr32, + enc_state_->coeffs[0]->PlaneRow(2, group_index, 0).ptr32, + }; + int rect_xsize_block = rect.xsize(); + int rect_ysize_block = rect.ysize(); + + uint32_t offset[3] = {0, 0, 0}; + int acc_block_offset = 0; + for (int by = 0; by < rect_ysize_block; by++) { + int by_offset = by * rect_xsize_block * kDCTBlockSize * 3; + for (int bx = 0; bx < rect_xsize_block; bx++) { + int bx_offset = bx * kDCTBlockSize * 3; + for (int c : {1, 0, 2}) { + // strategy + AcStrategyRow acs_row = shared.ac_strategy.ConstRow(rect, by); + AcStrategy acs = acs_row[bx]; + + // cx & cy + uint32_t cx = acs.covered_blocks_x(); + uint32_t cy = acs.covered_blocks_y(); + + // covered blocks + const uint32_t covered_blocks = cx * cy; // = #LLF coefficients + + // size + const uint32_t size = covered_blocks * kDCTBlockSize; + + // ordered_coef + int ord = kStrategyOrder[acs.RawStrategy()]; + const coeff_order_t* JXL_RESTRICT order = &orders[CoeffOrderOffset(ord, c)]; + + // block address + if (acs.IsFirstBlock()) { + const int32_t* block = ac_rows[c] + offset[c]; + for (int k = 0; k < covered_blocks * kDCTBlockSize; k++) { + ac_coeff_ordered_ddr[k + acc_block_offset + group_offset_1] = block[order[k]]; + } + acc_block_offset += size; + offset[c] 
+= size; + } + } + } + } + group_offset_1 += rect_ysize_block * rect_xsize_block * kDCTBlockSize * 3; + } + + // ac_strategy + int32_t* strategy_ddr = new int32_t[MAX_NUM_BLK88]; + int group_offset_0 = 0; + for (int group_index = 0; group_index < shared.frame_dim.num_groups; group_index++) { + const Rect rect = shared.BlockGroupRect(group_index); + int rect_xsize_block = rect.xsize(); + int rect_ysize_block = rect.ysize(); + for (int by = 0; by < rect_ysize_block; by++) { + for (int bx = 0; bx < rect_xsize_block; bx++) { + AcStrategyRow acs_row = shared.ac_strategy.ConstRow(rect, by); + AcStrategy acs = acs_row[bx]; + strategy_ddr[group_offset_0 + by * rect_xsize_block + bx] = acs.Strategy(); + } + } + + group_offset_0 += rect_ysize_block * rect_xsize_block; + } + + // quant field & quant dc + const int32_t* qf_rows = enc_state_->shared.raw_quant_field.ConstRow(0); + const uint8_t* qdc_rows = enc_state_->shared.quant_dc.ConstRow(0); + + int32_t* qf_ddr = new int32_t[MAX_NUM_BLK88]; + uint8_t* qdc_ddr = new uint8_t[MAX_NUM_BLK88]; + + int qdc_bytes_per_row = enc_state_->shared.quant_dc.bytes_per_row(); + int qf_bytes_per_row = enc_state_->shared.raw_quant_field.bytes_per_row(); + + int group_offset_qdc_qf = 0; + for (int group_index = 0; group_index < shared.frame_dim.num_groups; group_index++) { + const Rect rect = shared.BlockGroupRect(group_index); + int rect_xsize_block = rect.xsize(); + int rect_ysize_block = rect.ysize(); + for (int by = 0; by < rect_ysize_block; by++) { + for (int bx = 0; bx < rect_xsize_block; bx++) { + // qdc & qf + int qdc_y_offset = (rect.y0() + by) * qdc_bytes_per_row + rect.x0(); + uint8_t dc_idx = qdc_rows[bx + qdc_y_offset]; + qdc_ddr[bx + by * rect_xsize_block + group_offset_qdc_qf] = dc_idx; + + int qf_y_offset = (rect.y0() + by) * (qf_bytes_per_row >> 2) + rect.x0(); + int32_t acc_qf = qf_rows[bx + qf_y_offset]; + qf_ddr[bx + by * rect_xsize_block + group_offset_qdc_qf] = acc_qf; + } + } + + group_offset_qdc_qf += rect_ysize_block 
* rect_xsize_block; + } + + // block_ctx_map & qf_thresholds + uint8_t* acc_ctx_map = new uint8_t[MAX_CTX_MAP_SIZE]; + uint32_t* qf_thresholds = new uint32_t[MAX_QF_THRESH_SIZE]; + + int ctx_map_size = enc_state_->shared.block_ctx_map.ctx_map.size(); + int qf_thresholds_size = enc_state_->shared.block_ctx_map.qf_thresholds.size(); + + for (int i = 0; i < ctx_map_size; i++) { + acc_ctx_map[i] = enc_state_->shared.block_ctx_map.ctx_map[i]; + } + + for (int i = 0; i < qf_thresholds_size; i++) { + qf_thresholds[i] = enc_state_->shared.block_ctx_map.qf_thresholds[i]; + } + + // ac_token + uint64_t* ac_tokens_ddr = new uint64_t[MAX_AC_TOKEN_SIZE]; + + // tokenize + uint32_t token0_size = 0; + for (int i = 0; i < tokens0.size(); i++) { + token0_size += tokens0[i].size(); + } + + // ap_uint<64>* tokens0_ptr = (ap_uint<64>*)malloc(MAX_AC_TOKEN_SIZE * sizeof(ap_uint<64>)); + uint64_t* tokens0_ptr = (uint64_t*)malloc(MAX_AC_TOKEN_SIZE * sizeof(uint64_t)); + tokens0_ptr[0] = token0_size; + uint32_t cnt = 1; + for (int i = 0; i < tokens0.size(); i++) { + for (int j = 0; j < tokens0[i].size(); j++) { + ap_uint<64> reg; + reg.range(31, 0) = tokens0[i][j].value; + reg.range(62, 32) = tokens0[i][j].context; + reg[63] = tokens0[i][j].is_lz77_length; + tokens0_ptr[cnt] = (uint64_t)reg; + cnt++; + } + } + + uint32_t token1_size = 0; + for (int i = 0; i < tokens1.size(); i++) { + token1_size += tokens1[i].size(); + } + + // for acc cosim + // token1_size = MAX_AC_TOKEN_SIZE; + + // ap_uint<64>* tokens1_ptr = (ap_uint<64>*)malloc(MAX_AC_TOKEN_SIZE * sizeof(ap_uint<64>)); + uint64_t* tokens1_ptr = (uint64_t*)malloc(MAX_AC_TOKEN_SIZE * sizeof(uint64_t)); + + tokens1_ptr[0] = token1_size; + cnt = 1; + for (int i = 0; i < tokens1.size(); i++) { + for (int j = 0; j < tokens1[i].size(); j++) { + ap_uint<64> reg; + reg.range(31, 0) = tokens1[i][j].value; + reg.range(62, 32) = tokens1[i][j].context; + reg[63] = tokens1[i][j].is_lz77_length; + tokens1_ptr[cnt] = (uint64_t)reg; + cnt++; + } + } + 
+ uint32_t token2_size = 0; + for (int i = 0; i < tokens2.size(); i++) { + token2_size += tokens2[i].size(); + } + // ap_uint<64>* tokens2_ptr = (ap_uint<64>*)malloc(MAX_AC_TOKEN_SIZE * sizeof(ap_uint<64>)); + uint64_t* tokens2_ptr = (uint64_t*)malloc(MAX_AC_TOKEN_SIZE * sizeof(uint64_t)); + tokens2_ptr[0] = token2_size; + cnt = 1; + for (int i = 0; i < tokens2.size(); i++) { + for (int j = 0; j < tokens2[i].size(); j++) { + ap_uint<64> reg; + reg.range(31, 0) = tokens2[i][j].value; + reg.range(62, 32) = tokens2[i][j].context; + reg[63] = tokens2[i][j].is_lz77_length; + tokens2_ptr[cnt] = (uint64_t)reg; + cnt++; + } + } + + uint32_t token3_size = 0; + for (int i = 0; i < tokens3.size(); i++) { + token3_size += tokens3[i].size(); + } + // ap_uint<64>* tokens3_ptr = (ap_uint<64>*)malloc(MAX_AC_TOKEN_SIZE * sizeof(ap_uint<64>)); + uint64_t* tokens3_ptr = (uint64_t*)malloc(MAX_AC_TOKEN_SIZE * sizeof(uint64_t)); + tokens3_ptr[0] = token3_size; + cnt = 1; + for (int i = 0; i < tokens3.size(); i++) { + for (int j = 0; j < tokens3[i].size(); j++) { + ap_uint<64> reg; + reg.range(31, 0) = tokens3[i][j].value; + reg.range(62, 32) = tokens3[i][j].context; + reg[63] = tokens3[i][j].is_lz77_length; + tokens3_ptr[cnt] = (uint64_t)reg; + cnt++; + } + } + + printf("==== token_size:%d, %d, %d, %d =====\n ", token0_size, token1_size, token2_size, token3_size); + + std::vector histograms_ptr(5); + std::vector histograms_size_ptr(5); + std::vector total_count_ptr(5); + std::vector nonempty_ptr(5); + uint32_t nempty_cnt_ptr[5]; + uint32_t largest_idx_ptr[5]; + + for (int i = 0; i < 5; i++) { + histograms_ptr[i] = (int32_t*)malloc(4096 * 40 * sizeof(int32_t)); + histograms_size_ptr[i] = (uint32_t*)malloc(4096 * sizeof(uint32_t)); + total_count_ptr[i] = (uint32_t*)malloc(4096 * sizeof(uint32_t)); + nonempty_ptr[i] = (uint32_t*)malloc(4096 * sizeof(uint32_t)); + } + + //=====================================s + // kernel_code + //===================================== + // std::string 
xclbinPath = + // "/wrk/xsjhdnobkup3/tianminr/jxl_debug/xf_codec/L2/demos/jxlEnc/tokInit_histogram/" + // "build_dir.sw_emu.xilinx_u50_gen3x16_xdma_201920_3/jxlEnc.xclbin"; + // std::string xclbinPath = + // "/wrk/xsjhdnobkup3/tianminr/jxl_debug/xf_codec/L2/demos/jxlEnc/tokInit_histogram/" + // "build_dir.hw_emu.xilinx_u50_gen3x16_xdma_201920_3/jxlEnc.xclbin"; + // std::string xclbinPath = + // "/wrk/xsjhdnobkup3/tianminr/jxl_hw/xf_codec/L2/demos/jxlEnc/tokInit_histogram/build_dir.hw.xilinx_u50_gen3x16_xdma_201920_3/jxlEnc.xclbin"; + + hls_ANSinitHistogram_wrapper(xclbinPath, config, + //==================== + ac_coeff_ordered_ddr, strategy_ddr, qf_ddr, qdc_ddr, acc_ctx_map, qf_thresholds, + ac_tokens_ddr, + //==================== + tokens0_ptr, tokens1_ptr, tokens2_ptr, tokens3_ptr, + //==================== + histograms_ptr[0], histograms_size_ptr[0], total_count_ptr[0], nonempty_ptr[0], + //==================== + histograms_ptr[1], histograms_size_ptr[1], total_count_ptr[1], nonempty_ptr[1], + //==================== + histograms_ptr[2], histograms_size_ptr[2], total_count_ptr[2], nonempty_ptr[2], + //==================== + histograms_ptr[3], histograms_size_ptr[3], total_count_ptr[3], nonempty_ptr[3], + //===================== + histograms_ptr[4], histograms_size_ptr[4], total_count_ptr[4], nonempty_ptr[4]); + + // post-largeset & nempty + largest_idx_ptr[0] = config[17]; + largest_idx_ptr[1] = config[18]; + largest_idx_ptr[2] = config[19]; + largest_idx_ptr[3] = config[20]; + largest_idx_ptr[4] = config[21]; + + nempty_cnt_ptr[0] = config[22]; + nempty_cnt_ptr[1] = config[23]; + nempty_cnt_ptr[2] = config[24]; + nempty_cnt_ptr[3] = config[25]; + nempty_cnt_ptr[4] = config[26]; + + int all_end = 0; + int group_end = 0; + int idx = 0, vec_idx = 0; + int group_index = 0; + std::vector >& acc_ac_tokens = enc_state_->passes[0].ac_tokens; + while (!all_end) { + uint64_t token_tmp = ac_tokens_ddr[idx]; + all_end = token_tmp >> 63; + group_end = token_tmp >> 62; + if 
(group_end == 1) { + group_index++; + vec_idx = 0; + } else if (all_end) { + continue; + } else { + uint32_t value = token_tmp % 0xffffffff00000000; + uint32_t context = (token_tmp >> 32) % 0xfffffffff000000; + acc_ac_tokens[group_index].emplace_back(context, value); + } + idx++; + } + + largest_idx[0] = largest_idx_ptr[0]; + largest_idx[1] = largest_idx_ptr[1]; + largest_idx[2] = largest_idx_ptr[2]; + largest_idx[3] = largest_idx_ptr[3]; + largest_idx[4] = largest_idx_ptr[4]; + + delete[] strategy_ddr; + delete[] ac_coeff_ordered_ddr; + delete[] coeff_orders_ddr; + delete[] ac_tokens_ddr; + delete[] acc_ctx_map; + delete[] qf_thresholds; + + for (int i = 0; i < 5; i++) { + if (do_once[i]) { + for (int j = 0; j < histograms_[i].size(); j++) { + histograms_[i][j].data_.resize(histograms_size_ptr[i][j]); + histograms_[i][j].total_count_ = total_count_ptr[i][j]; + for (int k = 0; k < histograms_size_ptr[i][j]; k++) { + histograms_[i][j].data_[k] = histograms_ptr[i][j * 40 + k]; + } + } + + nonempty_histograms[i].reserve(histograms_[i].size()); + for (int j = 0; j < nempty_cnt_ptr[i]; j++) { + nonempty_histograms[i].push_back(nonempty_ptr[i][j]); + } + + if (histograms_[i].size() > 1) { + largest_idx[i] = + std::find(nonempty_histograms[i].begin(), nonempty_histograms[i].end(), largest_idx[i]) - + nonempty_histograms[i].begin(); + } + } + } + do_prefix_out[0] = 0; + do_prefix_out[1] = 0; + do_prefix_out[2] = 0; + do_prefix_out[3] = 0; + do_prefix_out[4] = 0; + + // kernel: acs_clusterHistogram + acc_ANSclusterHistogram(is_small_image, do_once, do_inner, do_prefix_in, params, histograms_, num_contexts, + context_map, nonempty_histograms, largest_idx, codes, clustered_histograms, + histogram_symbols, writer, layer, clustered_histogramsin, tokensin, codesin, + context_map_in); + + for (int i = 0; i < 5; i++) { + if (!do_once[i]) continue; + if (i == 0) { + tokens_c0 = tokensin[i]; + codes_c0 = codesin[i]; + context_map_c0 = context_map_in[i]; + clustered_histograms0 = 
clustered_histograms[i]; + clustered_histograms_c0 = clustered_histogramsin[i]; + } else if (i == 1) { + tokens_c1 = tokensin[i]; + codes_c1 = codesin[i]; + context_map_c1 = context_map_in[i]; + clustered_histograms1 = clustered_histograms[i]; + clustered_histograms_c1 = clustered_histogramsin[i]; + } else if (i == 2) { + tokens_c2 = tokensin[i]; + codes_c2 = codesin[i]; + context_map_c2 = context_map_in[i]; + clustered_histograms2 = clustered_histograms[i]; + clustered_histograms_c2 = clustered_histogramsin[i]; + } else if (i == 3) { + tokens_c3 = tokensin[i]; + codes_c3 = codesin[i]; + context_map_c3 = context_map_in[i]; + clustered_histograms3 = clustered_histograms[i]; + clustered_histograms_c3 = clustered_histogramsin[i]; + } else if (i == 4) { + tokens_c4 = tokensin[i]; + codes_c4 = codesin[i]; + context_map_c4 = context_map_in[i]; + clustered_histograms4 = clustered_histograms[i]; + clustered_histograms_c4 = clustered_histogramsin[i]; + } + } + + // ============================================== + // Do StoreEntropyCodes for outer histogram + // ============================================== + // printf("do_prefix_out = %d, %d, %d, %d, %d\n", do_prefix_out[0], + // do_prefix_out[1], do_prefix_out[2], do_prefix_out[3], do_prefix_out[4]); + + if (do_once[0]) { + if (!is_small_image) { + writer0->update_part(4); + } else { + writer0->update_part(4); + } + StoreEntropyCodesNew(params0, tokens0, &codes0, do_prefix_out[0], writer0, layer0, nullptr, + clustered_histograms0); + bcm_codes = codes0; + bcm_dummy_context_map = context_map0; + } + if (do_once[1]) { + if (!is_small_image) { + writer1->update_part(34); + } else { + writer1->update_part(34); + } + StoreEntropyCodesNew(params1, tokens1, &codes1, do_prefix_out[1], writer1, layer1, nullptr, + clustered_histograms1); + modularFramTree_code = codes1; + modularFramTree_ctxmap = context_map1; + } + if (do_once[2]) { + if (!is_small_image) { + writer2->update_part(54); + } else { + writer2->update_part(54); + } + 
StoreEntropyCodesNew(params2, tokens2, &codes2, do_prefix_out[2], writer2, layer2, nullptr, + clustered_histograms2); + modular_frame_encoder->code = codes2; + modular_frame_encoder->context_map = context_map2; + } + if (do_once[3]) { + if (!is_small_image) { + writer3->update_part(4); + } else { + writer3->update_part(84); + } + StoreEntropyCodesNew(params3, tokens3, &codes3, do_prefix_out[3], writer3, layer3, nullptr, + clustered_histograms3); + coefOrders_codes = codes3; + coefOrders_context_map = context_map3; + } + if (do_once[4]) { + if (!is_small_image) { + writer4->update_part(24); + } else { + writer4->update_part(104); + } + StoreEntropyCodesNew(params4, tokens4, &codes4, do_prefix_out[4], writer4, layer4, nullptr, + clustered_histograms4); + enc_state_->passes[0].codes = codes4; + enc_state_->passes[0].context_map = context_map4; + } + + // ============================================== + // Do StoreEntropyCodes for inner histogram + // ============================================== + // printf("do_prefix_in = %d, %d, %d, %d, %d\n", do_prefix_in[0], + // do_prefix_in[1], do_prefix_in[2], do_prefix_in[3], do_prefix_in[4]); + + if (do_inner[0]) { + if (!is_small_image) { + writer0->update_part(2); + } else { + writer0->update_part(2); + } + StoreEntropyCodesNew(params0, tokens_c0, &codes_c0, do_prefix_in[0], writer0, 0, nullptr, + clustered_histograms_c0); + } + if (do_inner[1]) { + if (!is_small_image) { + writer1->update_part(32); + } else { + writer1->update_part(32); + } + StoreEntropyCodesNew(params1, tokens_c1, &codes_c1, do_prefix_in[1], writer1, 0, nullptr, + clustered_histograms_c1); + } + if (do_inner[2]) { + if (!is_small_image) { + writer2->update_part(52); + } else { + writer2->update_part(52); + } + StoreEntropyCodesNew(params2, tokens_c2, &codes_c2, do_prefix_in[2], writer2, 0, nullptr, + clustered_histograms_c2); + } + if (do_inner[3]) { + if (!is_small_image) { + writer3->update_part(2); + } else { + writer3->update_part(82); + } + 
StoreEntropyCodesNew(params3, tokens_c3, &codes_c3, do_prefix_in[3], writer3, 0, nullptr, + clustered_histograms_c3); + } + if (do_inner[4]) { + if (!is_small_image) { + writer4->update_part(22); + } else { + writer4->update_part(102); + } + StoreEntropyCodesNew(params4, tokens_c4, &codes_c4, do_prefix_in[4], writer4, 0, nullptr, + clustered_histograms_c4); + } + + // ============================================== + // Do WriteTokens for inner histogram + // ============================================== + // printf("do_inner = %d, %d, %d, %d, %d\n", do_inner[0], do_inner[1], + // do_inner[2], do_inner[3], do_inner[4]); + if (do_inner[0]) { + if (!is_small_image) { + writer0->update_part(3); + } else { + writer0->update_part(3); + } + // printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c0[0], codes_c0, context_map_c0, writer0); + } + if (do_inner[1]) { + if (!is_small_image) { + writer1->update_part(33); + } else { + writer1->update_part(33); + } + // printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu, context_map.size=%d\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c1[0], codes_c1, context_map_c1, writer1); + } + if (do_inner[2]) { + if (!is_small_image) { + writer2->update_part(53); + } else { + writer2->update_part(53); + } + // printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu, context_map.size=%d\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c2[0], codes_c2, context_map_c2, writer2); + } + if (do_inner[3]) { + if (!is_small_image) { + writer3->update_part(3); + } else { + writer3->update_part(83); + } + // 
printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu, context_map.size=%d\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c3[0], codes_c3, context_map_c3, writer3); + } + if (do_inner[4]) { + if (!is_small_image) { + writer4->update_part(23); + } else { + writer4->update_part(103); + } + // printf("%s: %s: %d, WriteTokens token size out=%zu, + // codes.encoding_info.size=%zu, context_map.size=%d\n", + // __FILE__, __FUNCTION__, __LINE__, tokens_c0[0].size(), + // codes_c0.encoding_info.size(), context_map_c0.size()); + WriteTokens(tokens_c4[0], codes_c4, context_map_c4, writer4); + } + return true; +} + +Status acc_ANS_tokens(LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + const size_t num_groups, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + std::unique_ptr& frame_header, + std::vector >& coefOrders_tokens, + std::vector& group_codes, + BitWriter* group_codes_writer, + BitWriter* acInfo_writer, + std::vector& dc_group_writers, + std::vector& acGroupWriters, + size_t& ans_cost, + size_t& mtf_cost, + std::vector >& bcm_tokens, + std::vector >& bcm_mtf_tokens, + EntropyEncodingData& bcm_codes, + std::vector& bcm_dummy_context_map, + + EntropyEncodingData& modularFramTree_code, + std::vector& modularFramTree_ctxmap, + + EntropyEncodingData& coefOrders_codes, + std::vector& coefOrders_context_map, + std::vector& aux_outs, + AuxOut* aux_out) { + PassesEncoderState* JXL_RESTRICT enc_state_ = lossy_frame_encoder.State(); + PassesSharedState& shared = enc_state_->shared; + const size_t global_ac_index = frame_dim.num_dc_groups + 1; + + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + + const bool has_ac_global = true; + + auto& dct = 
enc_state_->shared.block_ctx_map.dc_thresholds; + auto& qft = enc_state_->shared.block_ctx_map.qf_thresholds; + auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map; + + //============ANSWriteTokens Encode GlobalDCInfo: Block Context Map========= + if (frame_header->encoding == FrameEncoding::kVarDCT) { + if (dct[0].empty() && dct[1].empty() && dct[2].empty() && qft.empty() && ctx_map.size() == 21 && + std::equal(ctx_map.begin(), ctx_map.end(), jxl::kDefaultCtxMap)) { + } else { + if (enc_state_->shared.block_ctx_map.num_ctxs == 1) { + } else { + size_t entry_bits = CeilLog2Nonzero(enc_state_->shared.block_ctx_map.num_ctxs); + size_t simple_cost = entry_bits * ctx_map.size(); + if (entry_bits < 4 /* && simple_cost < ans_cost && + simple_cost < mtf_cost*/) { + } else { + if (!is_small_image) { + group_codes_writer->update_part(10); + } else { + group_codes_writer->update_part(10); + } + WriteTokens(bcm_tokens[0], bcm_codes, bcm_dummy_context_map, group_codes_writer); + } + } + BitWriter::Allotment allotmentGlobalDCInfoBCM( + group_codes_writer, (dct[0].size() + dct[1].size() + dct[2].size() + qft.size()) * 34 + 1 + 4 + 4 + + ctx_map.size() * 10 + 1024); + ReclaimAndCharge(group_codes_writer, &allotmentGlobalDCInfoBCM, kLayerAC, aux_out); + } + } + + //============ANSWriteTokens Encode GlobalDCInfo: modular frame + // tree========= + if (modular_frame_encoder->tree_tokens.empty() || modular_frame_encoder->tree_tokens[0].empty()) { + } else { + if (!is_small_image) { + group_codes_writer->update_part(40); + } else { + group_codes_writer->update_part(40); + } + WriteTokens(modular_frame_encoder->tree_tokens[0], modularFramTree_code, modularFramTree_ctxmap, + group_codes_writer, kLayerModularTree, aux_out); + } + + //============ANSWriteTokens Encode GlobalDCInfo: modular frame + // token========= + if (!is_small_image) { + group_codes_writer->update_part(60); + } else { + group_codes_writer->update_part(60); + } + size_t stream_id = 
ModularStreamId::Global().ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. + } else { + JXL_RETURN_IF_ERROR(Bundle::Write(modular_frame_encoder->stream_headers[stream_id], group_codes_writer, + kLayerModularGlobal, aux_out)); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, group_codes_writer, kLayerModularGlobal, aux_out); + } + + //============================= + + //============================= ANSWriteTokens DC group============= + for (int group_index = 0; group_index < frame_dim.num_dc_groups; group_index++) { + BitWriter* tmp = get_output(group_index + 1, group_codes, is_small_image); + dc_group_writers.emplace_back(tmp); + if (!is_small_image) { + tmp->init(200); + tmp->update_part(0); + } else { + tmp->update_part(70); + } + } + + for (int group_index = 0; group_index < frame_dim.num_dc_groups; group_index++) { + AuxOut* my_aux_out = aux_out ? &aux_outs[0] : nullptr; + BitWriter* output = dc_group_writers[group_index]; + if (frame_header->encoding == FrameEncoding::kVarDCT && !(frame_header->flags & FrameHeader::kUseDcFrame)) { + BitWriter::Allotment allotment(output, 2); + output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]); + ReclaimAndCharge(output, &allotment, kLayerDC, my_aux_out); + size_t stream_id = ModularStreamId::VarDCTDC(group_index).ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. 
+ } else { + Bundle::Write(modular_frame_encoder->stream_headers[stream_id], output, kLayerDC, aux_out); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, output, kLayerDC, my_aux_out); + } + } + + size_t stream_id = ModularStreamId::ModularDC(group_index).ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. + } else { + Bundle::Write(modular_frame_encoder->stream_headers[stream_id], output, kLayerModularDcGroup, aux_out); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, output, kLayerModularDcGroup, my_aux_out); + } + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + const Rect& rect = lossy_frame_encoder.State()->shared.DCGroupRect(group_index); + size_t nb_bits = CeilLog2Nonzero(rect.xsize() * rect.ysize()); + if (nb_bits != 0) { + BitWriter::Allotment allotment(output, nb_bits); + output->Write(nb_bits, modular_frame_encoder->ac_metadata_size[group_index] - 1); + ReclaimAndCharge(output, &allotment, kLayerControlFields, my_aux_out); + } + size_t stream_id = ModularStreamId::ACMetadata(group_index).ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. 
+ } else { + Bundle::Write(modular_frame_encoder->stream_headers[stream_id], output, kLayerControlFields, aux_out); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, output, kLayerControlFields, my_aux_out); + } + } + }; + + //============================= ANSWriteTokens AC Info============= + for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses(); i++) { + uint16_t used_orders = enc_state_->used_orders[i]; + if (used_orders != 0) { + if (!is_small_image) { + acInfo_writer->update_part(19); + } else { + acInfo_writer->update_part(90); + } + WriteTokens(coefOrders_tokens[0], coefOrders_codes, coefOrders_context_map, acInfo_writer, kLayerOrder, + aux_out); + } + } + + //========================================== + if (!is_small_image) { + acInfo_writer->update_part(29); + } else { + acInfo_writer->update_part(109); + } + //=============== + + //========================Encode AC Group============= + for (int group_index = 0; group_index < num_groups; group_index++) { + for (size_t i = 0; i < num_passes; i++) { + BitWriter* tmp = + get_output(AcGroupIndex(i, group_index, frame_dim.num_groups, frame_dim.num_dc_groups, has_ac_global), + group_codes, is_small_image); + acGroupWriters.emplace_back(tmp); + } + } + + int sum = 0; + for (int group_index = 0; group_index < num_groups; group_index++) { + AuxOut* my_aux_out = aux_out ? &aux_outs[0] : nullptr; + for (size_t i = 0; i < num_passes; i++) { + BitWriter* acGroupWriter = acGroupWriters[group_index * num_passes + i]; + if (frame_header->encoding == FrameEncoding::kVarDCT) { + // Select which histogram to use among those of the current pass. + const size_t num_histograms = enc_state_->shared.num_histograms; + // num_histograms is 0 only for lossless. 
+ JXL_ASSERT(num_histograms == 0 || enc_state_->histogram_idx[group_index] < num_histograms); + size_t histo_selector_bits = CeilLog2Nonzero(num_histograms); + + if (histo_selector_bits != 0) { + BitWriter::Allotment allotment(acGroupWriter, histo_selector_bits); + acGroupWriter->Write(histo_selector_bits, enc_state_->histogram_idx[group_index]); + ReclaimAndCharge(acGroupWriter, &allotment, kLayerAC, aux_out); + } + sum = sum + enc_state_->passes[i].ac_tokens[group_index].size(); + WriteTokens(enc_state_->passes[i].ac_tokens[group_index], enc_state_->passes[i].codes, + enc_state_->passes[i].context_map, acGroupWriter, kLayerACTokens, aux_out); + } + + size_t stream_id = ModularStreamId::ModularAC(group_index, i).ID(frame_dim); + if (modular_frame_encoder->stream_images[stream_id].channel.empty()) { + // Image with no channels, header never gets decoded. + } else { + Bundle::Write(modular_frame_encoder->stream_headers[stream_id], acGroupWriter, kLayerModularAcGroup, + aux_out); + WriteTokens(modular_frame_encoder->tokens[stream_id], modular_frame_encoder->code, + modular_frame_encoder->context_map, acGroupWriter, kLayerModularAcGroup, aux_out); + } + } + } + //===================== + + return true; +} + +Status acc_writeout(LossyFrameEncoder& lossy_frame_encoder, + const size_t num_groups, + PassesEncoderState* passes_enc_state, + std::unique_ptr& frame_header, + FrameDimensions frame_dim, + std::vector& group_codes, + BitWriter* writer, + BitWriter* group_codes_writer, + BitWriter* acInfo_writer, + std::vector& dc_group_writers, + std::vector& acGroupWriters, + AuxOut* aux_out, + const std::function& resize_aux_outs) { + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + + writer->AppendByteAligned(lossy_frame_encoder.State()->special_frames); + frame_header->UpdateFlag(lossy_frame_encoder.State()->shared.image_features.patches.HasAny(), + 
FrameHeader::kPatches); + frame_header->UpdateFlag(lossy_frame_encoder.State()->shared.image_features.splines.HasAny(), + FrameHeader::kSplines); + JXL_RETURN_IF_ERROR(WriteFrameHeader(*frame_header, writer, aux_out)); + + // Resizing aux_outs to 0 also Assimilates the array. + std::atomic num_errors{0}; + static_cast(resize_aux_outs(0)); + JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0); + + for (BitWriter& bw : group_codes) { + bw.ZeroPadToByte(); // end of group. + } + + if (is_small_image) { + std::vector group_codes_seq{0, 1, 2, 3, 4, 10, 19, 20, 29, 30, 31, 32, 33, 34, 40, 50, 51, + 52, 53, 54, 60, 70, 80, 81, 82, 83, 84, 90, 100, 101, 102, 103, 104, 109}; + group_codes_writer->Finalize(group_codes_seq); + // group_codes_writer->Finalize(); + } else { + std::cout << "===============Group Codes writer Final==================" << std::endl; + std::vector group_codes_seq{0, 1, 2, 3, 4, 10, 19, 20, 29, 30, 31, 32, 33, 34, 40, 50, 51, 52, 53, 54, 60}; + group_codes_writer->Finalize(group_codes_seq); + // group_codes_writer->Finalize(); + std::cout << "===============DC Group writer Final==================" << std::endl; + std::vector dc_group_seq{0}; + for (int group_index = 0; group_index < frame_dim.num_dc_groups; group_index++) { + dc_group_writers[group_index]->Finalize(dc_group_seq); + // dc_group_writers[group_index]->Finalize(); + } + std::cout << "===============AC Info writer Final==================" << std::endl; + std::vector acInfo_seq{0, 1, 2, 3, 4, 10, 19, 20, 21, 22, 23, 24, 29}; + acInfo_writer->Finalize(acInfo_seq); + // acInfo_writer->Finalize(); + std::cout << "===============AC Group writer Final==================" << std::endl; + std::vector acGroup_seq{0}; + for (int group_index = 0; group_index < num_groups; group_index++) { + for (size_t i = 0; i < num_passes; i++) { + acGroupWriters[group_index * num_passes + i]->Finalize(acGroup_seq); + // acGroupWriters[group_index * num_passes + i]->Finalize(); + } + } + } + 
std::cout << "===============Others writer Final==================" << std::endl; + BitWriter::Allotment allotmentGrpOffset(writer, MaxBits(group_codes.size())); + writer->Write(1, 0); // no permutation + std::vector write_seq{0}; + // writer->Finalize(write_seq); + writer->Finalize(); + // } + writer->ZeroPadToByte(); // before TOC entries + + for (size_t i = 0; i < group_codes.size(); i++) { + JXL_ASSERT(group_codes[i].BitsWritten() % kBitsPerByte == 0); + const size_t group_size = group_codes[i].BitsWritten() / kBitsPerByte; + JXL_RETURN_IF_ERROR(U32Coder::Write(kTocDist, group_size, writer)); + } + // writer->Finalize(write_seq); + writer->Finalize(); + writer->ZeroPadToByte(); // before first group + ReclaimAndCharge(writer, &allotmentGrpOffset, kLayerTOC, aux_out); + + writer->AppendByteAligned(group_codes); + writer->ZeroPadToByte(); // end of frame. + + return true; +} + +Status acc_phase3(std::string xclbinPath, + Image3F& opsin, + LossyFrameEncoder& lossy_frame_encoder, + std::unique_ptr& modular_frame_encoder, + CompressParams cparams, + std::unique_ptr& frame_header, + PassesEncoderState* passes_enc_state, + FrameDimensions frame_dim, + BitWriter* writer, + const size_t num_groups, + AuxOut* aux_out, + ThreadPool* pool, + std::vector& aux_outs, + const ImageBundle& ib, + const std::function& resize_aux_outs) { + std::cout << "===========acc_kernel3 start================" << std::endl; + std::vector > coefOrders_tokens(1); + + const size_t num_passes = passes_enc_state->progressive_splitter.GetNumPasses(); + + // DC global info + DC groups + AC global info + AC groups * + // num_passes. 
+ const bool has_ac_global = true; + std::vector group_codes( + NumTocEntries(frame_dim.num_groups, frame_dim.num_dc_groups, num_passes, has_ac_global)); + const size_t global_ac_index = frame_dim.num_dc_groups + 1; + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + + BitWriter* group_codes_writer = get_output(0, group_codes, is_small_image); + BitWriter* acInfo_writer = get_output(global_ac_index, group_codes, is_small_image); + + std::vector > bcm_tokens(1), bcm_mtf_tokens(1); + EntropyEncodingData bcm_codes; + std::vector bcm_dummy_context_map; + size_t ans_cost, mtf_cost; + + EntropyEncodingData modularFramTree_code; + std::vector modularFramTree_ctxmap; + + EntropyEncodingData coefOrders_codes; + std::vector coefOrders_context_map; + + std::vector dc_group_writers; + std::vector acGroupWriters; + struct timeval start_time, token_time, hist_time, ans_time; + gettimeofday(&start_time, 0); + // acc_predictAndtoken(lossy_frame_encoder, frame_header, + // coefOrders_tokens, + // pool); + + gettimeofday(&token_time, 0); + acc_histogram(lossy_frame_encoder, modular_frame_encoder, passes_enc_state, frame_dim, frame_header, cparams, + coefOrders_tokens, group_codes_writer, acInfo_writer, ans_cost, mtf_cost, bcm_tokens, bcm_mtf_tokens, + bcm_codes, bcm_dummy_context_map, + + modularFramTree_code, modularFramTree_ctxmap, + + coefOrders_codes, coefOrders_context_map, + + aux_outs, aux_out, xclbinPath); + gettimeofday(&hist_time, 0); + acc_ANS_tokens(lossy_frame_encoder, modular_frame_encoder, num_groups, passes_enc_state, frame_dim, frame_header, + coefOrders_tokens, group_codes, group_codes_writer, acInfo_writer, dc_group_writers, acGroupWriters, + ans_cost, mtf_cost, bcm_tokens, bcm_mtf_tokens, bcm_codes, bcm_dummy_context_map, + + modularFramTree_code, modularFramTree_ctxmap, + + coefOrders_codes, coefOrders_context_map, aux_outs, aux_out); + + acc_writeout(lossy_frame_encoder, num_groups, passes_enc_state, frame_header, frame_dim, 
group_codes, writer, + group_codes_writer, acInfo_writer, dc_group_writers, acGroupWriters, aux_out, resize_aux_outs); + gettimeofday(&ans_time, 0); + + return true; +} +} // namespace jxl + +#endif diff --git a/codec/L2/demos/jxlEnc/third_partys/.clang-format b/codec/L2/demos/jxlEnc/third_partys/.clang-format new file mode 100644 index 0000000000..ff5c354782 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/.clang-format @@ -0,0 +1,89 @@ +--- +Language: Cpp +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: true +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' 
+ Priority: 3 +IndentCaseLabels: true +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +ReflowComments: true +SortIncludes: false +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Never +... + diff --git a/codec/L2/demos/jxlEnc/third_partys/build/lib/include/jxl/jxl_export.h b/codec/L2/demos/jxlEnc/third_partys/build/lib/include/jxl/jxl_export.h new file mode 100644 index 0000000000..1c73f277d3 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/build/lib/include/jxl/jxl_export.h @@ -0,0 +1,42 @@ + +#ifndef JXL_EXPORT_H +#define JXL_EXPORT_H + +#ifdef JXL_STATIC_DEFINE +# define JXL_EXPORT +# define JXL_NO_EXPORT +#else +# ifndef JXL_EXPORT +# ifdef JXL_INTERNAL_LIBRARY_BUILD + /* We are building this library */ +# define JXL_EXPORT __attribute__((visibility("default"))) +# else + /* We are using this library */ +# define JXL_EXPORT __attribute__((visibility("default"))) +# endif +# endif + +# ifndef JXL_NO_EXPORT +# define JXL_NO_EXPORT __attribute__((visibility("hidden"))) +# endif +#endif + +#ifndef JXL_DEPRECATED +# define JXL_DEPRECATED __attribute__ ((__deprecated__)) +#endif + +#ifndef JXL_DEPRECATED_EXPORT +# define JXL_DEPRECATED_EXPORT JXL_EXPORT JXL_DEPRECATED +#endif + +#ifndef JXL_DEPRECATED_NO_EXPORT +# 
define JXL_DEPRECATED_NO_EXPORT JXL_NO_EXPORT JXL_DEPRECATED +#endif + +#if 0 /* DEFINE_NO_DEPRECATED */ +# ifndef JXL_NO_DEPRECATED +# define JXL_NO_DEPRECATED +# endif +#endif + +#endif /* JXL_EXPORT_H */ diff --git a/codec/L2/demos/jxlEnc/third_partys/build/lib/include/jxl/jxl_threads_export.h b/codec/L2/demos/jxlEnc/third_partys/build/lib/include/jxl/jxl_threads_export.h new file mode 100644 index 0000000000..d385f7b624 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/build/lib/include/jxl/jxl_threads_export.h @@ -0,0 +1,42 @@ + +#ifndef JXL_THREADS_EXPORT_H +#define JXL_THREADS_EXPORT_H + +#ifdef JXL_THREADS_STATIC_DEFINE +# define JXL_THREADS_EXPORT +# define JXL_THREADS_NO_EXPORT +#else +# ifndef JXL_THREADS_EXPORT +# ifdef JXL_THREADS_INTERNAL_LIBRARY_BUILD + /* We are building this library */ +# define JXL_THREADS_EXPORT __attribute__((visibility("default"))) +# else + /* We are using this library */ +# define JXL_THREADS_EXPORT __attribute__((visibility("default"))) +# endif +# endif + +# ifndef JXL_THREADS_NO_EXPORT +# define JXL_THREADS_NO_EXPORT __attribute__((visibility("hidden"))) +# endif +#endif + +#ifndef JXL_THREADS_DEPRECATED +# define JXL_THREADS_DEPRECATED __attribute__ ((__deprecated__)) +#endif + +#ifndef JXL_THREADS_DEPRECATED_EXPORT +# define JXL_THREADS_DEPRECATED_EXPORT JXL_THREADS_EXPORT JXL_THREADS_DEPRECATED +#endif + +#ifndef JXL_THREADS_DEPRECATED_NO_EXPORT +# define JXL_THREADS_DEPRECATED_NO_EXPORT JXL_THREADS_NO_EXPORT JXL_THREADS_DEPRECATED +#endif + +#if 0 /* DEFINE_NO_DEPRECATED */ +# ifndef JXL_THREADS_NO_DEPRECATED +# define JXL_THREADS_NO_DEPRECATED +# endif +#endif + +#endif /* JXL_THREADS_EXPORT_H */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc new file mode 100644 index 0000000000..43749fdab2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec.cc @@ -0,0 +1,226 @@ +// Copyright (c) the JPEG XL Project Authors. 
All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec.h" + +#include "lib/jxl/base/file_io.h" +#if JPEGXL_ENABLE_APNG +#include "lib/extras/codec_apng.h" +#endif +#if JPEGXL_ENABLE_EXR +#include "lib/extras/codec_exr.h" +#endif +#if JPEGXL_ENABLE_GIF +#include "lib/extras/codec_gif.h" +#endif +#include "lib/extras/codec_jpg.h" +#include "lib/extras/codec_pgx.h" +#include "lib/extras/codec_png.h" +#include "lib/extras/codec_pnm.h" +#include "lib/extras/codec_psd.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { +namespace { + +// Any valid encoding is larger (ensures codecs can read the first few bytes) +constexpr size_t kMinBytes = 9; + +} // namespace + +std::string ExtensionFromCodec(Codec codec, const bool is_gray, + const size_t bits_per_sample) { + switch (codec) { + case Codec::kJPG: + return ".jpg"; + case Codec::kPGX: + return ".pgx"; + case Codec::kPNG: + return ".png"; + case Codec::kPNM: + if (is_gray) return ".pgm"; + return (bits_per_sample == 32) ? 
".pfm" : ".ppm"; + case Codec::kGIF: + return ".gif"; + case Codec::kEXR: + return ".exr"; + case Codec::kPSD: + return ".psd"; + case Codec::kUnknown: + return std::string(); + } + JXL_UNREACHABLE; + return std::string(); +} + +Codec CodecFromExtension(const std::string& extension, + size_t* JXL_RESTRICT bits_per_sample) { + if (extension == ".png") return Codec::kPNG; + + if (extension == ".jpg") return Codec::kJPG; + if (extension == ".jpeg") return Codec::kJPG; + + if (extension == ".pgx") return Codec::kPGX; + + if (extension == ".pbm") { + *bits_per_sample = 1; + return Codec::kPNM; + } + if (extension == ".pgm") return Codec::kPNM; + if (extension == ".ppm") return Codec::kPNM; + if (extension == ".pfm") { + *bits_per_sample = 32; + return Codec::kPNM; + } + + if (extension == ".gif") return Codec::kGIF; + + if (extension == ".exr") return Codec::kEXR; + + if (extension == ".psd") return Codec::kPSD; + + return Codec::kUnknown; +} + +Status SetFromBytes(const Span bytes, CodecInOut* io, + ThreadPool* pool, Codec* orig_codec) { + if (bytes.size() < kMinBytes) return JXL_FAILURE("Too few bytes"); + + io->metadata.m.bit_depth.bits_per_sample = 0; // (For is-set check below) + + Codec codec; + if (DecodeImagePNG(bytes, pool, io)) { + codec = Codec::kPNG; + } +#if JPEGXL_ENABLE_APNG + else if (DecodeImageAPNG(bytes, pool, io)) { + codec = Codec::kPNG; + } +#endif + else if (DecodeImagePGX(bytes, pool, io)) { + codec = Codec::kPGX; + } else if (DecodeImagePNM(bytes, pool, io)) { + codec = Codec::kPNM; + } +#if JPEGXL_ENABLE_GIF + else if (DecodeImageGIF(bytes, pool, io)) { + codec = Codec::kGIF; + } +#endif + else if (DecodeImageJPG(bytes, pool, io)) { + codec = Codec::kJPG; + } + else if (DecodeImagePSD(bytes, pool, io)) { + codec = Codec::kPSD; + } +#if JPEGXL_ENABLE_EXR + else if (DecodeImageEXR(bytes, pool, io)) { + codec = Codec::kEXR; + } +#endif + else { + return JXL_FAILURE("Codecs failed to decode"); + } + if (orig_codec) *orig_codec = codec; + + 
io->CheckMetadata(); + return true; +} + +Status SetFromFile(const std::string& pathname, CodecInOut* io, + ThreadPool* pool, Codec* orig_codec) { + PaddedBytes encoded; + JXL_RETURN_IF_ERROR(ReadFile(pathname, &encoded)); + JXL_RETURN_IF_ERROR( + SetFromBytes(Span(encoded), io, pool, orig_codec)); + return true; +} + +Status Encode(const CodecInOut& io, const Codec codec, + const ColorEncoding& c_desired, size_t bits_per_sample, + PaddedBytes* bytes, ThreadPool* pool) { + JXL_CHECK(!io.Main().c_current().ICC().empty()); + JXL_CHECK(!c_desired.ICC().empty()); + io.CheckMetadata(); + if (io.Main().IsJPEG() && codec != Codec::kJPG) { + return JXL_FAILURE( + "Output format has to be JPEG for losslessly recompressed JPEG " + "reconstruction"); + } + + switch (codec) { + case Codec::kPNG: + return EncodeImagePNG(&io, c_desired, bits_per_sample, pool, bytes); + case Codec::kJPG: +#if JPEGXL_ENABLE_JPEG + return EncodeImageJPG( + &io, io.use_sjpeg ? JpegEncoder::kSJpeg : JpegEncoder::kLibJpeg, + io.jpeg_quality, YCbCrChromaSubsampling(), pool, bytes, + io.Main().IsJPEG() ? 
DecodeTarget::kQuantizedCoeffs + : DecodeTarget::kPixels); +#else + return JXL_FAILURE("JPEG XL was built without JPEG support"); +#endif + case Codec::kPNM: + return EncodeImagePNM(&io, c_desired, bits_per_sample, pool, bytes); + case Codec::kPGX: + return EncodeImagePGX(&io, c_desired, bits_per_sample, pool, bytes); + case Codec::kGIF: + return JXL_FAILURE("Encoding to GIF is not implemented"); + case Codec::kPSD: + return EncodeImagePSD(&io, c_desired, bits_per_sample, pool, bytes); + case Codec::kEXR: +#if JPEGXL_ENABLE_EXR + return EncodeImageEXR(&io, c_desired, pool, bytes); +#else + return JXL_FAILURE("JPEG XL was built without OpenEXR support"); +#endif + case Codec::kUnknown: + return JXL_FAILURE("Cannot encode using Codec::kUnknown"); + } + + return JXL_FAILURE("Invalid codec"); +} + +Status EncodeToFile(const CodecInOut& io, const ColorEncoding& c_desired, + size_t bits_per_sample, const std::string& pathname, + ThreadPool* pool) { + const std::string extension = Extension(pathname); + const Codec codec = CodecFromExtension(extension, &bits_per_sample); + + // Warn about incorrect usage of PBM/PGM/PGX/PPM - only the latter supports + // color, but CodecFromExtension lumps them all together. 
+ if (codec == Codec::kPNM && extension != ".pfm") { + if (!io.Main().IsGray() && extension != ".ppm") { + JXL_WARNING("For color images, the filename should end with .ppm.\n"); + } else if (io.Main().IsGray() && extension == ".ppm") { + JXL_WARNING( + "For grayscale images, the filename should not end with .ppm.\n"); + } + if (bits_per_sample > 16) { + JXL_WARNING("PPM only supports up to 16 bits per sample"); + bits_per_sample = 16; + } + } else if (codec == Codec::kPGX && !io.Main().IsGray()) { + JXL_WARNING("Storing color image to PGX - use .ppm extension instead.\n"); + } + if (bits_per_sample > 16 && codec == Codec::kPNG) { + JXL_WARNING("PNG only supports up to 16 bits per sample"); + bits_per_sample = 16; + } + + PaddedBytes encoded; + return Encode(io, codec, c_desired, bits_per_sample, &encoded, pool) && + WriteFile(encoded, pathname); +} + +Status EncodeToFile(const CodecInOut& io, const std::string& pathname, + ThreadPool* pool) { + // TODO(lode): need to take the floating_point_sample field into account + return EncodeToFile(io, io.metadata.m.color_encoding, + io.metadata.m.bit_depth.bits_per_sample, pathname, pool); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec.h new file mode 100644 index 0000000000..17209fdfe6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec.h @@ -0,0 +1,85 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_CODEC_H_ +#define LIB_EXTRAS_CODEC_H_ + +// Facade for image encoders/decoders (PNG, PNM, ...). 
+ +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/field_encodings.h" // MakeBit + +namespace jxl { + +// Codecs supported by CodecInOut::Encode. +enum class Codec : uint32_t { + kUnknown, // for CodecFromExtension + kPNG, + kPNM, + kPGX, + kJPG, + kGIF, + kEXR, + kPSD +}; + +static inline constexpr uint64_t EnumBits(Codec /*unused*/) { + // Return only fully-supported codecs (kGIF is decode-only). + return MakeBit(Codec::kPNM) | MakeBit(Codec::kPNG) +#if JPEGXL_ENABLE_JPEG + | MakeBit(Codec::kJPG) +#endif +#if JPEGXL_ENABLE_EXR + | MakeBit(Codec::kEXR) +#endif + | MakeBit(Codec::kPSD); +} + +// Lower case ASCII including dot, e.g. ".png". +std::string ExtensionFromCodec(Codec codec, bool is_gray, + size_t bits_per_sample); + +// If and only if extension is ".pfm", *bits_per_sample is updated to 32 so +// that Encode() would encode to PFM instead of PPM. +Codec CodecFromExtension(const std::string& extension, + size_t* JXL_RESTRICT bits_per_sample); + +// Decodes "bytes" and sets io->metadata.m. +// dec_hints may specify the "color_space" (otherwise, defaults to sRGB). +Status SetFromBytes(const Span bytes, CodecInOut* io, + ThreadPool* pool = nullptr, Codec* orig_codec = nullptr); + +// Reads from file and calls SetFromBytes. +Status SetFromFile(const std::string& pathname, CodecInOut* io, + ThreadPool* pool = nullptr, Codec* orig_codec = nullptr); + +// Replaces "bytes" with an encoding of pixels transformed from c_current +// color space to c_desired. +Status Encode(const CodecInOut& io, Codec codec, const ColorEncoding& c_desired, + size_t bits_per_sample, PaddedBytes* bytes, + ThreadPool* pool = nullptr); + +// Deduces codec, calls Encode and writes to file. 
+Status EncodeToFile(const CodecInOut& io, const ColorEncoding& c_desired, + size_t bits_per_sample, const std::string& pathname, + ThreadPool* pool = nullptr); +// Same, but defaults to metadata.original color_encoding and bits_per_sample. +Status EncodeToFile(const CodecInOut& io, const std::string& pathname, + ThreadPool* pool = nullptr); + +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_apng.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_apng.cc new file mode 100644 index 0000000000..bef59f6369 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_apng.cc @@ -0,0 +1,410 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec_apng.h" + +// Parts of this code are taken from apngdis, which has the following license: +/* APNG Disassembler 2.8 + * + * Deconstructs APNG files into individual frames. + * + * http://apngdis.sourceforge.net + * + * Copyright (c) 2010-2015 Max Stepin + * maxst at users.sourceforge.net + * + * zlib license + * ------------ + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. 
Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + */ + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/luminance.h" +#include "png.h" /* original (unpatched) libpng is ok */ + +namespace jxl { + +namespace { + +constexpr bool isAbc(char c) { + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +} +#define notabc(c) ((c) < 65 || (c) > 122 || ((c) > 90 && (c) < 97)) + +constexpr uint32_t kId_IHDR = 0x52444849; +constexpr uint32_t kId_acTL = 0x4C546361; +constexpr uint32_t kId_fcTL = 0x4C546366; +constexpr uint32_t kId_IDAT = 0x54414449; +constexpr uint32_t kId_fdAT = 0x54416466; +constexpr uint32_t kId_IEND = 0x444E4549; + +struct CHUNK { + unsigned char* p; + unsigned int size; +}; + +struct APNGFrame { + unsigned char *p, **rows; + unsigned int w, h, delay_num, delay_den; +}; + +struct Reader { + const uint8_t* next; + const uint8_t* last; + bool Read(void* data, size_t len) { + size_t cap = last - next; + size_t to_copy = std::min(cap, len); + memcpy(data, next, to_copy); + next += to_copy; + return (len == to_copy); + } + bool Eof() { return next == last; } +}; + +const unsigned long cMaxPNGSize = 1000000UL; +const size_t kMaxPNGChunkSize = 100000000; // 100 MB + +void info_fn(png_structp png_ptr, png_infop info_ptr) { + png_set_expand(png_ptr); + png_set_strip_16(png_ptr); + png_set_gray_to_rgb(png_ptr); + png_set_palette_to_rgb(png_ptr); + png_set_add_alpha(png_ptr, 0xff, PNG_FILLER_AFTER); + (void)png_set_interlace_handling(png_ptr); + png_read_update_info(png_ptr, info_ptr); +} + +void row_fn(png_structp 
png_ptr, png_bytep new_row, png_uint_32 row_num, + int pass) { + APNGFrame* frame = (APNGFrame*)png_get_progressive_ptr(png_ptr); + png_progressive_combine_row(png_ptr, frame->rows[row_num], new_row); +} + +inline unsigned int read_chunk(Reader* r, CHUNK* pChunk) { + unsigned char len[4]; + pChunk->size = 0; + pChunk->p = 0; + if (r->Read(&len, 4)) { + const auto size = png_get_uint_32(len); + // Check first, to avoid overflow. + if (size > kMaxPNGChunkSize) { + JXL_WARNING("APNG chunk size is too big"); + return 0; + } + pChunk->size = size + 12; + pChunk->p = new unsigned char[pChunk->size]; + memcpy(pChunk->p, len, 4); + if (r->Read(pChunk->p + 4, pChunk->size - 4)) { + return *(unsigned int*)(pChunk->p + 4); + } + } + return 0; +} + +int processing_start(png_structp& png_ptr, png_infop& info_ptr, void* frame_ptr, + bool hasInfo, CHUNK& chunkIHDR, + std::vector& chunksInfo) { + unsigned char header[8] = {137, 80, 78, 71, 13, 10, 26, 10}; + + png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + info_ptr = png_create_info_struct(png_ptr); + if (!png_ptr || !info_ptr) return 1; + + if (setjmp(png_jmpbuf(png_ptr))) { + png_destroy_read_struct(&png_ptr, &info_ptr, 0); + return 1; + } + + png_set_crc_action(png_ptr, PNG_CRC_QUIET_USE, PNG_CRC_QUIET_USE); + png_set_progressive_read_fn(png_ptr, frame_ptr, info_fn, row_fn, NULL); + + png_process_data(png_ptr, info_ptr, header, 8); + png_process_data(png_ptr, info_ptr, chunkIHDR.p, chunkIHDR.size); + + if (hasInfo) { + for (unsigned int i = 0; i < chunksInfo.size(); i++) { + png_process_data(png_ptr, info_ptr, chunksInfo[i].p, chunksInfo[i].size); + } + } + return 0; +} + +int processing_data(png_structp png_ptr, png_infop info_ptr, unsigned char* p, + unsigned int size) { + if (!png_ptr || !info_ptr) return 1; + + if (setjmp(png_jmpbuf(png_ptr))) { + png_destroy_read_struct(&png_ptr, &info_ptr, 0); + return 1; + } + + png_process_data(png_ptr, info_ptr, p, size); + return 0; +} + +int 
processing_finish(png_structp png_ptr, png_infop info_ptr) { + unsigned char footer[12] = {0, 0, 0, 0, 73, 69, 78, 68, 174, 66, 96, 130}; + + if (!png_ptr || !info_ptr) return 1; + + if (setjmp(png_jmpbuf(png_ptr))) { + png_destroy_read_struct(&png_ptr, &info_ptr, 0); + return 1; + } + + png_process_data(png_ptr, info_ptr, footer, 12); + png_destroy_read_struct(&png_ptr, &info_ptr, 0); + + return 0; +} + +} // namespace + +Status DecodeImageAPNG(Span bytes, ThreadPool* pool, + CodecInOut* io) { + Reader r; + unsigned int id, i, j, w, h, w0, h0, x0, y0; + unsigned int delay_num, delay_den, dop, bop, rowbytes, imagesize; + unsigned char sig[8]; + png_structp png_ptr; + png_infop info_ptr; + CHUNK chunk; + CHUNK chunkIHDR; + std::vector chunksInfo; + bool isAnimated = false; + bool skipFirst = false; + bool hasInfo = false; + bool all_dispose_bg = true; + APNGFrame frameRaw = {}; + + r = {bytes.data(), bytes.data() + bytes.size()}; + // Not an aPNG => not an error + unsigned char png_signature[8] = {137, 80, 78, 71, 13, 10, 26, 10}; + if (r.Read(sig, 8) || memcmp(sig, png_signature, 8) != 0) { + return false; + } + id = read_chunk(&r, &chunkIHDR); + + io->frames.clear(); + io->dec_pixels = 0; + io->metadata.m.SetUintSamples(8); + io->metadata.m.SetAlphaBits(8); + io->metadata.m.color_encoding = + ColorEncoding::SRGB(); // todo: get data from png metadata + (void)io->dec_hints.Foreach( + [](const std::string& key, const std::string& /*value*/) { + JXL_WARNING("APNG decoder ignoring %s hint", key.c_str()); + return true; + }); + + bool errorstate = true; + if (id == kId_IHDR && chunkIHDR.size == 25) { + w0 = w = png_get_uint_32(chunkIHDR.p + 8); + h0 = h = png_get_uint_32(chunkIHDR.p + 12); + + if (w > cMaxPNGSize || h > cMaxPNGSize) { + return false; + } + + x0 = 0; + y0 = 0; + delay_num = 1; + delay_den = 10; + dop = 0; + bop = 0; + rowbytes = w * 4; + imagesize = h * rowbytes; + + frameRaw.p = new unsigned char[imagesize]; + frameRaw.rows = new png_bytep[h * 
sizeof(png_bytep)]; + for (j = 0; j < h; j++) frameRaw.rows[j] = frameRaw.p + j * rowbytes; + + if (!processing_start(png_ptr, info_ptr, (void*)&frameRaw, hasInfo, + chunkIHDR, chunksInfo)) { + bool last_base_was_none = true; + while (!r.Eof()) { + id = read_chunk(&r, &chunk); + if (!id) break; + JXL_ASSERT(chunk.p != nullptr); + + if (id == kId_acTL && !hasInfo && !isAnimated) { + isAnimated = true; + skipFirst = true; + io->metadata.m.have_animation = true; + io->metadata.m.animation.tps_numerator = 1000; + } else if (id == kId_IEND || + (id == kId_fcTL && (!hasInfo || isAnimated))) { + if (hasInfo) { + if (!processing_finish(png_ptr, info_ptr)) { + ImageBundle bundle(&io->metadata.m); + bundle.duration = delay_num * 1000 / delay_den; + bundle.origin.x0 = x0; + bundle.origin.y0 = y0; + // TODO(veluca): this could in principle be implemented. + if (last_base_was_none && !all_dispose_bg && + (x0 != 0 || y0 != 0 || w0 != w || h0 != h || bop != 0)) { + return JXL_FAILURE( + "APNG with dispose-to-0 is not supported for non-full or " + "blended frames"); + } + switch (dop) { + case 0: + bundle.use_for_next_frame = true; + last_base_was_none = false; + all_dispose_bg = false; + break; + case 2: + bundle.use_for_next_frame = false; + all_dispose_bg = false; + break; + default: + bundle.use_for_next_frame = false; + last_base_was_none = true; + } + bundle.blend = bop != 0; + io->dec_pixels += w0 * h0; + + Image3F sub_frame(w0, h0); + ImageF sub_frame_alpha(w0, h0); + for (size_t y = 0; y < h0; ++y) { + float* const JXL_RESTRICT row_r = sub_frame.PlaneRow(0, y); + float* const JXL_RESTRICT row_g = sub_frame.PlaneRow(1, y); + float* const JXL_RESTRICT row_b = sub_frame.PlaneRow(2, y); + float* const JXL_RESTRICT row_alpha = sub_frame_alpha.Row(y); + uint8_t* const f = frameRaw.rows[y]; + for (size_t x = 0; x < w0; ++x) { + if (f[4 * x + 3] == 0) { + row_alpha[x] = 0; + row_r[x] = 0; + row_g[x] = 0; + row_b[x] = 0; + continue; + } + row_r[x] = f[4 * x + 0] * (1.f / 255); + 
row_g[x] = f[4 * x + 1] * (1.f / 255); + row_b[x] = f[4 * x + 2] * (1.f / 255); + row_alpha[x] = f[4 * x + 3] * (1.f / 255); + } + } + bundle.SetFromImage(std::move(sub_frame), ColorEncoding::SRGB()); + bundle.SetAlpha(std::move(sub_frame_alpha), + /*alpha_is_premultiplied=*/false); + io->frames.push_back(std::move(bundle)); + } else { + delete[] chunk.p; + break; + } + } + + if (id == kId_IEND) { + errorstate = false; + // Release the IEND chunk buffer: this break skips the delete[] at the end of the loop body. + delete[] chunk.p; + break; + } + // At this point the old frame is done. Let's start a new one. + // The fcTL fields below are read up to chunk.p[33]; reject a truncated chunk before touching them. + if (chunk.size < 34) { + delete[] chunk.p; + break; + } + w0 = png_get_uint_32(chunk.p + 12); + h0 = png_get_uint_32(chunk.p + 16); + x0 = png_get_uint_32(chunk.p + 20); + y0 = png_get_uint_32(chunk.p + 24); + delay_num = png_get_uint_16(chunk.p + 28); + delay_den = png_get_uint_16(chunk.p + 30); + dop = chunk.p[32]; + bop = chunk.p[33]; + + if (!delay_den) delay_den = 100; + + if (w0 > cMaxPNGSize || h0 > cMaxPNGSize || x0 > cMaxPNGSize || + y0 > cMaxPNGSize || x0 + w0 > w || y0 + h0 > h || dop > 2 || + bop > 1) { + delete[] chunk.p; + break; + } + + if (hasInfo) { + memcpy(chunkIHDR.p + 8, chunk.p + 12, 8); + if (processing_start(png_ptr, info_ptr, (void*)&frameRaw, hasInfo, + chunkIHDR, chunksInfo)) { + delete[] chunk.p; + break; + } + } else + skipFirst = false; + + if (io->frames.size() == (skipFirst ? 
1 : 0)) { + bop = 0; + if (dop == 2) dop = 1; + } + } else if (id == kId_IDAT) { + hasInfo = true; + if (processing_data(png_ptr, info_ptr, chunk.p, chunk.size)) { + delete[] chunk.p; + break; + } + } else if (id == kId_fdAT && isAnimated) { + png_save_uint_32(chunk.p + 4, chunk.size - 16); + memcpy(chunk.p + 8, "IDAT", 4); + if (processing_data(png_ptr, info_ptr, chunk.p + 4, chunk.size - 4)) { + delete[] chunk.p; + break; + } + } else if (!isAbc(chunk.p[4]) || !isAbc(chunk.p[5]) || + !isAbc(chunk.p[6]) || !isAbc(chunk.p[7])) { + delete[] chunk.p; + break; + } else if (!hasInfo) { + if (processing_data(png_ptr, info_ptr, chunk.p, chunk.size)) { + delete[] chunk.p; + break; + } + chunksInfo.push_back(chunk); + continue; + } + delete[] chunk.p; + } + } + delete[] frameRaw.rows; + delete[] frameRaw.p; + } + + for (i = 0; i < chunksInfo.size(); i++) delete[] chunksInfo[i].p; + + chunksInfo.clear(); + delete[] chunkIHDR.p; + + if (errorstate) return false; + SetIntensityTarget(io); + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_apng.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_apng.h new file mode 100644 index 0000000000..53d3bfa2ac --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_apng.h @@ -0,0 +1,27 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_CODEC_APNG_H_ +#define LIB_EXTRAS_CODEC_APNG_H_ + +// Decodes APNG images in memory. + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" + +namespace jxl { + +// Decodes `bytes` into `io`. io->dec_hints are ignored. 
+Status DecodeImageAPNG(const Span bytes, ThreadPool* pool, + CodecInOut* io); + +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_APNG_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_exr.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_exr.cc new file mode 100644 index 0000000000..efd0c1a12c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_exr.cc @@ -0,0 +1,350 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec_exr.h" + +#include +#include +#include +#include + +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" + +namespace jxl { + +namespace { + +namespace OpenEXR = OPENEXR_IMF_NAMESPACE; +namespace Imath = IMATH_NAMESPACE; + +// OpenEXR::Int64 is deprecated in favor of using uint64_t directly, but using +// uint64_t as recommended causes build failures with previous OpenEXR versions +// on macOS, where the definition for OpenEXR::Int64 was actually not equivalent +// to uint64_t. This alternative should work in all cases. 
+using ExrInt64 = decltype(std::declval().tellg()); + +constexpr int kExrBitsPerSample = 16; +constexpr int kExrAlphaBits = 16; + +float GetIntensityTarget(const CodecInOut& io, + const OpenEXR::Header& exr_header) { + if (OpenEXR::hasWhiteLuminance(exr_header)) { + const float exr_luminance = OpenEXR::whiteLuminance(exr_header); + if (io.target_nits != 0) { + JXL_WARNING( + "overriding OpenEXR whiteLuminance of %g with user-specified value " + "of %g", + exr_luminance, io.target_nits); + return io.target_nits; + } + return exr_luminance; + } + if (io.target_nits != 0) { + return io.target_nits; + } + JXL_WARNING( + "no OpenEXR whiteLuminance tag found and no intensity_target specified, " + "defaulting to %g", + kDefaultIntensityTarget); + return kDefaultIntensityTarget; +} + +size_t GetNumThreads(ThreadPool* pool) { + size_t exr_num_threads = 1; + RunOnPool( + pool, 0, 1, + [&](size_t num_threads) { + exr_num_threads = num_threads; + return true; + }, + [&](const int /* task */, const int /*thread*/) {}, + "DecodeImageEXRThreads"); + return exr_num_threads; +} + +class InMemoryIStream : public OpenEXR::IStream { + public: + // The data pointed to by `bytes` must outlive the InMemoryIStream. 
+ explicit InMemoryIStream(const Span bytes) + : IStream(/*fileName=*/""), bytes_(bytes) {} + + bool isMemoryMapped() const override { return true; } + char* readMemoryMapped(const int n) override { + JXL_ASSERT(pos_ + n <= bytes_.size()); + char* const result = + const_cast(reinterpret_cast(bytes_.data() + pos_)); + pos_ += n; + return result; + } + bool read(char c[], const int n) override { + std::copy_n(readMemoryMapped(n), n, c); + return pos_ < bytes_.size(); + } + + ExrInt64 tellg() override { return pos_; } + void seekg(const ExrInt64 pos) override { + JXL_ASSERT(pos + 1 <= bytes_.size()); + pos_ = pos; + } + + private: + const Span bytes_; + size_t pos_ = 0; +}; + +class InMemoryOStream : public OpenEXR::OStream { + public: + // `bytes` must outlive the InMemoryOStream. + explicit InMemoryOStream(PaddedBytes* const bytes) + : OStream(/*fileName=*/""), bytes_(*bytes) {} + + void write(const char c[], const int n) override { + if (bytes_.size() < pos_ + n) { + bytes_.resize(pos_ + n); + } + std::copy_n(c, n, bytes_.begin() + pos_); + pos_ += n; + } + + ExrInt64 tellp() override { return pos_; } + void seekp(const ExrInt64 pos) override { + if (bytes_.size() + 1 < pos) { + bytes_.resize(pos - 1); + } + pos_ = pos; + } + + private: + PaddedBytes& bytes_; + size_t pos_ = 0; +}; + +} // namespace + +Status DecodeImageEXR(Span bytes, ThreadPool* pool, + CodecInOut* io) { + // Get the number of threads we should be using for OpenEXR. + // OpenEXR creates its own set of threads, independent from ours. `pool` is + // only used for converting from a buffer of OpenEXR::Rgba to Image3F. + // TODO(sboukortt): look into changing that with OpenEXR 2.3 which allows + // custom thread pools according to its changelog. + OpenEXR::setGlobalThreadCount(GetNumThreads(pool)); + + InMemoryIStream is(bytes); + +#ifdef __EXCEPTIONS + std::unique_ptr input_ptr; + try { + input_ptr.reset(new OpenEXR::RgbaInputFile(is)); + } catch (...) 
{ + return JXL_FAILURE("OpenEXR failed to parse input"); + } + OpenEXR::RgbaInputFile& input = *input_ptr; +#else + OpenEXR::RgbaInputFile input(is); +#endif + + if ((input.channels() & OpenEXR::RgbaChannels::WRITE_RGB) != + OpenEXR::RgbaChannels::WRITE_RGB) { + return JXL_FAILURE("only RGB OpenEXR files are supported"); + } + const bool has_alpha = (input.channels() & OpenEXR::RgbaChannels::WRITE_A) == + OpenEXR::RgbaChannels::WRITE_A; + + const float intensity_target = GetIntensityTarget(*io, input.header()); + + auto image_size = input.displayWindow().size(); + // Size is computed as max - min, but both bounds are inclusive. + ++image_size.x; + ++image_size.y; + Image3F image(image_size.x, image_size.y); + ZeroFillImage(&image); + ImageF alpha; + if (has_alpha) { + alpha = ImageF(image_size.x, image_size.y); + FillImage(1.f, &alpha); + } + + const int row_size = input.dataWindow().size().x + 1; + // Number of rows to read at a time. + // https://www.openexr.com/documentation/ReadingAndWritingImageFiles.pdf + // recommends reading the whole file at once. + const int y_chunk_size = input.displayWindow().size().y + 1; + std::vector input_rows(row_size * y_chunk_size); + for (int start_y = + std::max(input.dataWindow().min.y, input.displayWindow().min.y); + start_y <= + std::min(input.dataWindow().max.y, input.displayWindow().max.y); + start_y += y_chunk_size) { + // Inclusive. 
+ const int end_y = std::min( + start_y + y_chunk_size - 1, + std::min(input.dataWindow().max.y, input.displayWindow().max.y)); + input.setFrameBuffer( + input_rows.data() - input.dataWindow().min.x - start_y * row_size, + /*xStride=*/1, /*yStride=*/row_size); + input.readPixels(start_y, end_y); + RunOnPool( + pool, start_y, end_y + 1, ThreadPool::SkipInit(), + [&](const int exr_y, const int /*thread*/) { + const int image_y = exr_y - input.displayWindow().min.y; + const OpenEXR::Rgba* const JXL_RESTRICT input_row = + &input_rows[(exr_y - start_y) * row_size]; + float* const JXL_RESTRICT rows[] = { + image.PlaneRow(0, image_y), + image.PlaneRow(1, image_y), + image.PlaneRow(2, image_y), + }; + float* const JXL_RESTRICT alpha_row = + has_alpha ? alpha.Row(image_y) : nullptr; + for (int exr_x = std::max(input.dataWindow().min.x, + input.displayWindow().min.x); + exr_x <= + std::min(input.dataWindow().max.x, input.displayWindow().max.x); + ++exr_x) { + const int image_x = exr_x - input.displayWindow().min.x; + const OpenEXR::Rgba& pixel = + input_row[exr_x - input.dataWindow().min.x]; + rows[0][image_x] = pixel.r; + rows[1][image_x] = pixel.g; + rows[2][image_x] = pixel.b; + if (has_alpha) { + alpha_row[image_x] = pixel.a; + } + } + }, + "DecodeImageEXR"); + } + + ColorEncoding color_encoding; + color_encoding.tf.SetTransferFunction(TransferFunction::kLinear); + color_encoding.SetColorSpace(ColorSpace::kRGB); + PrimariesCIExy primaries = ColorEncoding::SRGB().GetPrimaries(); + CIExy white_point = ColorEncoding::SRGB().GetWhitePoint(); + if (OpenEXR::hasChromaticities(input.header())) { + const auto& chromaticities = OpenEXR::chromaticities(input.header()); + primaries.r.x = chromaticities.red.x; + primaries.r.y = chromaticities.red.y; + primaries.g.x = chromaticities.green.x; + primaries.g.y = chromaticities.green.y; + primaries.b.x = chromaticities.blue.x; + primaries.b.y = chromaticities.blue.y; + white_point.x = chromaticities.white.x; + white_point.y = 
chromaticities.white.y; + } + JXL_RETURN_IF_ERROR(color_encoding.SetPrimaries(primaries)); + JXL_RETURN_IF_ERROR(color_encoding.SetWhitePoint(white_point)); + JXL_RETURN_IF_ERROR(color_encoding.CreateICC()); + + io->metadata.m.bit_depth.bits_per_sample = kExrBitsPerSample; + // EXR uses binary16 or binary32 floating point format. + io->metadata.m.bit_depth.exponent_bits_per_sample = + kExrBitsPerSample == 16 ? 5 : 8; + io->metadata.m.bit_depth.floating_point_sample = true; + io->SetFromImage(std::move(image), color_encoding); + io->metadata.m.color_encoding = color_encoding; + io->metadata.m.SetIntensityTarget(intensity_target); + if (has_alpha) { + io->metadata.m.SetAlphaBits(kExrAlphaBits, /*alpha_is_premultiplied=*/true); + io->Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/true); + } + return true; +} + +Status EncodeImageEXR(const CodecInOut* io, const ColorEncoding& c_desired, + ThreadPool* pool, PaddedBytes* bytes) { + // As in `DecodeImageEXR`, `pool` is only used for pixel conversion, not for + // actual OpenEXR I/O. + OpenEXR::setGlobalThreadCount(GetNumThreads(pool)); + + ColorEncoding c_linear = c_desired; + c_linear.tf.SetTransferFunction(TransferFunction::kLinear); + JXL_RETURN_IF_ERROR(c_linear.CreateICC()); + ImageMetadata metadata = io->metadata.m; + ImageBundle store(&metadata); + const ImageBundle* linear; + JXL_RETURN_IF_ERROR( + TransformIfNeeded(io->Main(), c_linear, pool, &store, &linear)); + + const bool has_alpha = io->Main().HasAlpha(); + const bool alpha_is_premultiplied = io->Main().AlphaIsPremultiplied(); + + OpenEXR::Header header(io->xsize(), io->ysize()); + const PrimariesCIExy& primaries = c_linear.HasPrimaries() + ? 
c_linear.GetPrimaries() + : ColorEncoding::SRGB().GetPrimaries(); + OpenEXR::Chromaticities chromaticities; + chromaticities.red = Imath::V2f(primaries.r.x, primaries.r.y); + chromaticities.green = Imath::V2f(primaries.g.x, primaries.g.y); + chromaticities.blue = Imath::V2f(primaries.b.x, primaries.b.y); + chromaticities.white = + Imath::V2f(c_linear.GetWhitePoint().x, c_linear.GetWhitePoint().y); + OpenEXR::addChromaticities(header, chromaticities); + OpenEXR::addWhiteLuminance(header, io->metadata.m.IntensityTarget()); + + // Ensure that the destructor of RgbaOutputFile has run before we look at the + // size of `bytes`. + { + InMemoryOStream os(bytes); + OpenEXR::RgbaOutputFile output( + os, header, has_alpha ? OpenEXR::WRITE_RGBA : OpenEXR::WRITE_RGB); + // How many rows to write at once. Again, the OpenEXR documentation + // recommends writing the whole image in one call. + const int y_chunk_size = io->ysize(); + std::vector output_rows(io->xsize() * y_chunk_size); + + for (size_t start_y = 0; start_y < io->ysize(); start_y += y_chunk_size) { + // Inclusive. 
+ const size_t end_y = + std::min(start_y + y_chunk_size - 1, io->ysize() - 1); + output.setFrameBuffer(output_rows.data() - start_y * io->xsize(), + /*xStride=*/1, /*yStride=*/io->xsize()); + RunOnPool( + pool, start_y, end_y + 1, ThreadPool::SkipInit(), + [&](const int y, const int /*thread*/) { + const float* const JXL_RESTRICT input_rows[] = { + linear->color().ConstPlaneRow(0, y), + linear->color().ConstPlaneRow(1, y), + linear->color().ConstPlaneRow(2, y), + }; + OpenEXR::Rgba* const JXL_RESTRICT row_data = + &output_rows[(y - start_y) * io->xsize()]; + if (has_alpha) { + const float* const JXL_RESTRICT alpha_row = + io->Main().alpha().ConstRow(y); + if (alpha_is_premultiplied) { + for (size_t x = 0; x < io->xsize(); ++x) { + row_data[x] = + OpenEXR::Rgba(input_rows[0][x], input_rows[1][x], + input_rows[2][x], alpha_row[x]); + } + } else { + for (size_t x = 0; x < io->xsize(); ++x) { + row_data[x] = OpenEXR::Rgba(alpha_row[x] * input_rows[0][x], + alpha_row[x] * input_rows[1][x], + alpha_row[x] * input_rows[2][x], + alpha_row[x]); + } + } + } else { + for (size_t x = 0; x < io->xsize(); ++x) { + row_data[x] = OpenEXR::Rgba(input_rows[0][x], input_rows[1][x], + input_rows[2][x], 1.f); + } + } + }, + "EncodeImageEXR"); + output.writePixels(/*numScanLines=*/end_y - start_y + 1); + } + } + + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_exr.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_exr.h new file mode 100644 index 0000000000..b0da5c5b8e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_exr.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_CODEC_EXR_H_ +#define LIB_EXTRAS_CODEC_EXR_H_ + +// Encodes OpenEXR images in memory. 
+ +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" + +namespace jxl { + +// Decodes `bytes` into `io`. io->dec_hints are ignored. +Status DecodeImageEXR(Span bytes, ThreadPool* pool, + CodecInOut* io); + +// Transforms from io->c_current to `c_desired` (with the transfer function set +// to linear as that is the OpenEXR convention) and encodes into `bytes`. +Status EncodeImageEXR(const CodecInOut* io, const ColorEncoding& c_desired, + ThreadPool* pool, PaddedBytes* bytes); + +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_EXR_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_gif.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_gif.cc new file mode 100644 index 0000000000..1fb2a11ac7 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_gif.cc @@ -0,0 +1,343 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec_gif.h" + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/luminance.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +namespace { + +struct ReadState { + Span bytes; +}; + +struct DGifCloser { + void operator()(GifFileType* const ptr) const { DGifCloseFile(ptr, nullptr); } +}; +using GifUniquePtr = std::unique_ptr; + +// Gif does not support partial transparency, so this considers anything non-0 +// as opaque. 
+bool AllOpaque(const ImageF& alpha) { + for (size_t y = 0; y < alpha.ysize(); ++y) { + const float* const JXL_RESTRICT row = alpha.ConstRow(y); + for (size_t x = 0; x < alpha.xsize(); ++x) { + if (row[x] == 0.f) { + return false; + } + } + } + return true; +} + +} // namespace + +Status DecodeImageGIF(Span bytes, ThreadPool* pool, + CodecInOut* io) { + int error = GIF_OK; + ReadState state = {bytes}; + const auto ReadFromSpan = [](GifFileType* const gif, GifByteType* const bytes, + int n) { + ReadState* const state = reinterpret_cast(gif->UserData); + // giflib API requires the input size `n` to be signed int. + if (static_cast(n) > state->bytes.size()) { + n = state->bytes.size(); + } + memcpy(bytes, state->bytes.data(), n); + state->bytes.remove_prefix(n); + return n; + }; + GifUniquePtr gif(DGifOpen(&state, ReadFromSpan, &error)); + if (gif == nullptr) { + if (error == D_GIF_ERR_NOT_GIF_FILE) { + // Not an error. + return false; + } else { + return JXL_FAILURE("Failed to read GIF: %s", GifErrorString(error)); + } + } + error = DGifSlurp(gif.get()); + if (error != GIF_OK) { + return JXL_FAILURE("Failed to read GIF: %s", GifErrorString(gif->Error)); + } + + msan::UnpoisonMemory(gif.get(), sizeof(*gif)); + if (gif->SColorMap) { + msan::UnpoisonMemory(gif->SColorMap, sizeof(*gif->SColorMap)); + msan::UnpoisonMemory( + gif->SColorMap->Colors, + sizeof(*gif->SColorMap->Colors) * gif->SColorMap->ColorCount); + } + msan::UnpoisonMemory(gif->SavedImages, + sizeof(*gif->SavedImages) * gif->ImageCount); + + const SizeConstraints* constraints = &io->constraints; + + JXL_RETURN_IF_ERROR( + VerifyDimensions(constraints, gif->SWidth, gif->SHeight)); + uint64_t total_pixel_count = + static_cast(gif->SWidth) * gif->SHeight; + for (int i = 0; i < gif->ImageCount; ++i) { + const SavedImage& image = gif->SavedImages[i]; + uint32_t w = image.ImageDesc.Width; + uint32_t h = image.ImageDesc.Height; + JXL_RETURN_IF_ERROR(VerifyDimensions(constraints, w, h)); + uint64_t pixel_count = 
static_cast(w) * h; + if (total_pixel_count + pixel_count < total_pixel_count) { + return JXL_FAILURE("Image too big"); + } + total_pixel_count += pixel_count; + if (total_pixel_count > constraints->dec_max_pixels) { + return JXL_FAILURE("Image too big"); + } + } + + if (!gif->SColorMap) { + for (int i = 0; i < gif->ImageCount; ++i) { + if (!gif->SavedImages[i].ImageDesc.ColorMap) { + return JXL_FAILURE("Missing GIF color map"); + } + } + } + + if (gif->ImageCount > 1) { + io->metadata.m.have_animation = true; + // Delays in GIF are specified in 100ths of a second. + io->metadata.m.animation.tps_numerator = 100; + } + + io->frames.clear(); + io->frames.reserve(gif->ImageCount); + io->dec_pixels = 0; + + io->metadata.m.SetUintSamples(8); + io->metadata.m.color_encoding = ColorEncoding::SRGB(); + io->metadata.m.SetAlphaBits(0); + (void)io->dec_hints.Foreach( + [](const std::string& key, const std::string& /*value*/) { + JXL_WARNING("GIF decoder ignoring %s hint", key.c_str()); + return true; + }); + + Image3F canvas(gif->SWidth, gif->SHeight); + io->SetSize(gif->SWidth, gif->SHeight); + ImageF alpha(gif->SWidth, gif->SHeight); + GifColorType background_color; + if (gif->SColorMap == nullptr) { + background_color = {0, 0, 0}; + } else { + if (gif->SBackGroundColor >= gif->SColorMap->ColorCount) { + return JXL_FAILURE("GIF specifies out-of-bounds background color"); + } + background_color = gif->SColorMap->Colors[gif->SBackGroundColor]; + } + FillPlane(background_color.Red, &canvas.Plane(0)); + FillPlane(background_color.Green, &canvas.Plane(1)); + FillPlane(background_color.Blue, &canvas.Plane(2)); + ZeroFillImage(&alpha); + + Rect previous_rect_if_restore_to_background; + + bool has_alpha = false; + bool replace = true; + bool last_base_was_none = true; + for (int i = 0; i < gif->ImageCount; ++i) { + const SavedImage& image = gif->SavedImages[i]; + msan::UnpoisonMemory(image.RasterBits, sizeof(*image.RasterBits) * + image.ImageDesc.Width * + image.ImageDesc.Height); 
+ const Rect image_rect(image.ImageDesc.Left, image.ImageDesc.Top, + image.ImageDesc.Width, image.ImageDesc.Height); + io->dec_pixels += image_rect.xsize() * image_rect.ysize(); + Rect total_rect; + if (previous_rect_if_restore_to_background.xsize() != 0 || + previous_rect_if_restore_to_background.ysize() != 0) { + const size_t xbegin = std::min( + image_rect.x0(), previous_rect_if_restore_to_background.x0()); + const size_t ybegin = std::min( + image_rect.y0(), previous_rect_if_restore_to_background.y0()); + const size_t xend = + std::max(image_rect.x0() + image_rect.xsize(), + previous_rect_if_restore_to_background.x0() + + previous_rect_if_restore_to_background.xsize()); + const size_t yend = + std::max(image_rect.y0() + image_rect.ysize(), + previous_rect_if_restore_to_background.y0() + + previous_rect_if_restore_to_background.ysize()); + total_rect = Rect(xbegin, ybegin, xend - xbegin, yend - ybegin); + previous_rect_if_restore_to_background = Rect(); + replace = true; + } else { + total_rect = image_rect; + replace = false; + } + if (!image_rect.IsInside(canvas)) { + return JXL_FAILURE("GIF frame extends outside of the canvas"); + } + const ColorMapObject* const color_map = + image.ImageDesc.ColorMap ? image.ImageDesc.ColorMap : gif->SColorMap; + JXL_CHECK(color_map); + msan::UnpoisonMemory(color_map, sizeof(*color_map)); + msan::UnpoisonMemory(color_map->Colors, + sizeof(*color_map->Colors) * color_map->ColorCount); + GraphicsControlBlock gcb; + DGifSavedExtensionToGCB(gif.get(), i, &gcb); + msan::UnpoisonMemory(&gcb, sizeof(gcb)); + + ImageBundle bundle(&io->metadata.m); + if (io->metadata.m.have_animation) { + bundle.duration = gcb.DelayTime; + bundle.origin.x0 = total_rect.x0(); + bundle.origin.y0 = total_rect.y0(); + if (last_base_was_none) { + replace = true; + } + bundle.blend = !replace; + // TODO(veluca): this could in principle be implemented. 
+ if (last_base_was_none && + (total_rect.x0() != 0 || total_rect.y0() != 0 || + total_rect.xsize() != canvas.xsize() || + total_rect.ysize() != canvas.ysize() || !replace)) { + return JXL_FAILURE( + "GIF with dispose-to-0 is not supported for non-full or " + "blended frames"); + } + switch (gcb.DisposalMode) { + case DISPOSE_DO_NOT: + case DISPOSE_BACKGROUND: + bundle.use_for_next_frame = true; + last_base_was_none = false; + break; + case DISPOSE_PREVIOUS: + bundle.use_for_next_frame = false; + break; + default: + bundle.use_for_next_frame = false; + last_base_was_none = true; + } + } + Image3F frame = CopyImage(canvas); + ImageF frame_alpha = CopyImage(alpha); + for (size_t y = 0, byte_index = 0; y < image_rect.ysize(); ++y) { + float* const JXL_RESTRICT row_r = image_rect.Row(&frame.Plane(0), y); + float* const JXL_RESTRICT row_g = image_rect.Row(&frame.Plane(1), y); + float* const JXL_RESTRICT row_b = image_rect.Row(&frame.Plane(2), y); + float* const JXL_RESTRICT row_alpha = image_rect.Row(&frame_alpha, y); + for (size_t x = 0; x < image_rect.xsize(); ++x, ++byte_index) { + const GifByteType byte = image.RasterBits[byte_index]; + if (byte >= color_map->ColorCount) { + return JXL_FAILURE("GIF color is out of bounds"); + } + if (byte == gcb.TransparentColor) continue; + GifColorType color = color_map->Colors[byte]; + row_alpha[x] = 1.f; + row_r[x] = (1.f / 255) * color.Red; + row_g[x] = (1.f / 255) * color.Green; + row_b[x] = (1.f / 255) * color.Blue; + } + } + Image3F sub_frame(total_rect.xsize(), total_rect.ysize()); + ImageF sub_frame_alpha(total_rect.xsize(), total_rect.ysize()); + bool blend_alpha = false; + if (replace) { + CopyImageTo(total_rect, frame, &sub_frame); + CopyImageTo(total_rect, frame_alpha, &sub_frame_alpha); + } else { + for (size_t y = 0, byte_index = 0; y < image_rect.ysize(); ++y) { + float* const JXL_RESTRICT row_r = sub_frame.PlaneRow(0, y); + float* const JXL_RESTRICT row_g = sub_frame.PlaneRow(1, y); + float* const JXL_RESTRICT 
row_b = sub_frame.PlaneRow(2, y); + float* const JXL_RESTRICT row_alpha = sub_frame_alpha.Row(y); + for (size_t x = 0; x < image_rect.xsize(); ++x, ++byte_index) { + const GifByteType byte = image.RasterBits[byte_index]; + // Colors[] holds ColorCount entries, so an index equal to ColorCount is already out of bounds. + if (byte >= color_map->ColorCount) { + return JXL_FAILURE("GIF color is out of bounds"); + } + if (byte == gcb.TransparentColor) { + row_alpha[x] = 0; + row_r[x] = 0; + row_g[x] = 0; + row_b[x] = 0; + blend_alpha = + true; // need to use alpha channel if BlendMode blend is used + continue; + } + GifColorType color = color_map->Colors[byte]; + row_alpha[x] = 1.f; + row_r[x] = (1.f / 255) * color.Red; + row_g[x] = (1.f / 255) * color.Green; + row_b[x] = (1.f / 255) * color.Blue; + } + } + } + bundle.SetFromImage(std::move(sub_frame), ColorEncoding::SRGB()); + if (has_alpha || !AllOpaque(frame_alpha) || blend_alpha) { + if (!has_alpha) { + has_alpha = true; + io->metadata.m.SetAlphaBits(8); + for (ImageBundle& previous_frame : io->frames) { + ImageF previous_alpha(previous_frame.xsize(), previous_frame.ysize()); + FillImage(1.f, &previous_alpha); + previous_frame.SetAlpha(std::move(previous_alpha), + /*alpha_is_premultiplied=*/false); + } + } + bundle.SetAlpha(std::move(sub_frame_alpha), + /*alpha_is_premultiplied=*/false); + } + io->frames.push_back(std::move(bundle)); + switch (gcb.DisposalMode) { + case DISPOSE_DO_NOT: + canvas = std::move(frame); + alpha = std::move(frame_alpha); + break; + + case DISPOSE_BACKGROUND: + FillPlane((1.f / 255) * background_color.Red, &canvas.Plane(0), + image_rect); + FillPlane((1.f / 255) * background_color.Green, &canvas.Plane(1), + image_rect); + FillPlane((1.f / 255) * background_color.Blue, &canvas.Plane(2), + image_rect); + FillPlane(0.f, &alpha, image_rect); + previous_rect_if_restore_to_background = image_rect; + break; + + case DISPOSE_PREVIOUS: + break; + + case DISPOSAL_UNSPECIFIED: + default: + FillPlane((1.f / 255) * background_color.Red, &canvas.Plane(0)); + FillPlane((1.f / 255) * 
background_color.Green, + &canvas.Plane(1)); + FillPlane((1.f / 255) * background_color.Blue, &canvas.Plane(2)); + ZeroFillImage(&alpha); + } + } + + SetIntensityTarget(io); + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_gif.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_gif.h new file mode 100644 index 0000000000..03e0e55253 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_gif.h @@ -0,0 +1,27 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_CODEC_GIF_H_ +#define LIB_EXTRAS_CODEC_GIF_H_ + +// Decodes GIF images in memory. + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" + +namespace jxl { + +// Decodes `bytes` into `io`. io->dec_hints are ignored. +Status DecodeImageGIF(const Span bytes, ThreadPool* pool, + CodecInOut* io); + +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_GIF_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc new file mode 100644 index 0000000000..842d52a8b0 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.cc @@ -0,0 +1,519 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/extras/codec_jpg.h" + +#include +#include + +#if JPEGXL_ENABLE_JPEG +// After stddef/stdio +#include +#include +#include +#endif // JPEGXL_ENABLE_JPEG + +#include +#include +#include +#include +#include + +#include "lib/extras/time.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" +#include "lib/jxl/jpeg/enc_jpeg_data_reader.h" +#include "lib/jxl/luminance.h" +#include "lib/jxl/sanitizers.h" +#if JPEGXL_ENABLE_SJPEG +#include "sjpeg.h" +#endif + +namespace jxl { + +#if JPEGXL_ENABLE_JPEG +namespace { + +constexpr float kJPEGSampleMultiplier = MAXJSAMPLE; +constexpr unsigned char kICCSignature[12] = { + 0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00}; +constexpr int kICCMarker = JPEG_APP0 + 2; +constexpr size_t kMaxBytesInMarker = 65533; + +constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69, + 0x66, 0x00, 0x00}; +constexpr int kExifMarker = JPEG_APP0 + 1; + +constexpr float kJPEGSampleMin = 0; +constexpr float kJPEGSampleMax = MAXJSAMPLE; + +bool MarkerIsICC(const jpeg_saved_marker_ptr marker) { + return marker->marker == kICCMarker && + marker->data_length >= sizeof kICCSignature + 2 && + std::equal(std::begin(kICCSignature), std::end(kICCSignature), + marker->data); +} +bool MarkerIsExif(const jpeg_saved_marker_ptr marker) { + return marker->marker == kExifMarker && + marker->data_length >= sizeof kExifSignature + 2 && + std::equal(std::begin(kExifSignature), std::end(kExifSignature), + marker->data); +} + +Status ReadICCProfile(jpeg_decompress_struct* const cinfo, + PaddedBytes* const icc) { + constexpr size_t kICCSignatureSize = sizeof kICCSignature; + // ICC signature + uint8_t index + 
uint8_t max_index. + constexpr size_t kICCHeadSize = kICCSignatureSize + 2; + // Markers are 1-indexed, and we keep them that way in this vector to get a + // convenient 0 at the front for when we compute the offsets later. + std::vector marker_lengths; + int num_markers = 0; + int seen_markers_count = 0; + bool has_num_markers = false; + for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr; + marker = marker->next) { + // marker is initialized by libjpeg, which we are not instrumenting with + // msan. + msan::UnpoisonMemory(marker, sizeof(*marker)); + msan::UnpoisonMemory(marker->data, marker->data_length); + if (!MarkerIsICC(marker)) continue; + + const int current_marker = marker->data[kICCSignatureSize]; + if (current_marker == 0) { + return JXL_FAILURE("inconsistent JPEG ICC marker numbering"); + } + const int current_num_markers = marker->data[kICCSignatureSize + 1]; + if (current_marker > current_num_markers) { + return JXL_FAILURE("inconsistent JPEG ICC marker numbering"); + } + if (has_num_markers) { + if (current_num_markers != num_markers) { + return JXL_FAILURE("inconsistent numbers of JPEG ICC markers"); + } + } else { + num_markers = current_num_markers; + has_num_markers = true; + marker_lengths.resize(num_markers + 1); + } + + size_t marker_length = marker->data_length - kICCHeadSize; + + if (marker_length == 0) { + // NB: if we allow empty chunks, then the next check is incorrect. + return JXL_FAILURE("Empty ICC chunk"); + } + + if (marker_lengths[current_marker] != 0) { + return JXL_FAILURE("duplicate JPEG ICC marker number"); + } + marker_lengths[current_marker] = marker_length; + seen_markers_count++; + } + + if (marker_lengths.empty()) { + // Not an error. 
+ return false; + } + + if (seen_markers_count != num_markers) { + JXL_DASSERT(has_num_markers); + return JXL_FAILURE("Incomplete set of ICC chunks"); + } + + std::vector offsets = std::move(marker_lengths); + std::partial_sum(offsets.begin(), offsets.end(), offsets.begin()); + icc->resize(offsets.back()); + + for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr; + marker = marker->next) { + if (!MarkerIsICC(marker)) continue; + const uint8_t* first = marker->data + kICCHeadSize; + uint8_t current_marker = marker->data[kICCSignatureSize]; + size_t offset = offsets[current_marker - 1]; + size_t marker_length = offsets[current_marker] - offset; + std::copy_n(first, marker_length, icc->data() + offset); + } + + return true; +} + +void ReadExif(jpeg_decompress_struct* const cinfo, PaddedBytes* const exif) { + constexpr size_t kExifSignatureSize = sizeof kExifSignature; + for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr; + marker = marker->next) { + // marker is initialized by libjpeg, which we are not instrumenting with + // msan. 
+ msan::UnpoisonMemory(marker, sizeof(*marker)); + msan::UnpoisonMemory(marker->data, marker->data_length); + if (!MarkerIsExif(marker)) continue; + size_t marker_length = marker->data_length - kExifSignatureSize; + exif->resize(marker_length); + std::copy_n(marker->data + kExifSignatureSize, marker_length, exif->data()); + return; + } +} + +// TODO (jon): take orientation into account when writing jpeg output +// TODO (jon): write Exif blob also in sjpeg encoding +// TODO (jon): overwrite orientation in Exif blob to avoid double orientation + +void WriteICCProfile(jpeg_compress_struct* const cinfo, + const PaddedBytes& icc) { + constexpr size_t kMaxIccBytesInMarker = + kMaxBytesInMarker - sizeof kICCSignature - 2; + const int num_markers = + static_cast(DivCeil(icc.size(), kMaxIccBytesInMarker)); + size_t begin = 0; + for (int current_marker = 0; current_marker < num_markers; ++current_marker) { + const size_t length = std::min(kMaxIccBytesInMarker, icc.size() - begin); + jpeg_write_m_header( + cinfo, kICCMarker, + static_cast(length + sizeof kICCSignature + 2)); + for (const unsigned char c : kICCSignature) { + jpeg_write_m_byte(cinfo, c); + } + jpeg_write_m_byte(cinfo, current_marker + 1); + jpeg_write_m_byte(cinfo, num_markers); + for (size_t i = 0; i < length; ++i) { + jpeg_write_m_byte(cinfo, icc[begin]); + ++begin; + } + } +} +void WriteExif(jpeg_compress_struct* const cinfo, const PaddedBytes& exif) { + if (exif.size() < 4) return; + jpeg_write_m_header( + cinfo, kExifMarker, + static_cast(exif.size() - 4 + sizeof kExifSignature)); + for (const unsigned char c : kExifSignature) { + jpeg_write_m_byte(cinfo, c); + } + for (size_t i = 4; i < exif.size(); ++i) { + jpeg_write_m_byte(cinfo, exif[i]); + } +} + +Status SetChromaSubsampling(const YCbCrChromaSubsampling& chroma_subsampling, + jpeg_compress_struct* const cinfo) { + for (size_t i = 0; i < 3; i++) { + cinfo->comp_info[i].h_samp_factor = + 1 << (chroma_subsampling.MaxHShift() - + 
chroma_subsampling.HShift(i < 2 ? i ^ 1 : i)); + cinfo->comp_info[i].v_samp_factor = + 1 << (chroma_subsampling.MaxVShift() - + chroma_subsampling.VShift(i < 2 ? i ^ 1 : i)); + } + return true; +} + +void MyErrorExit(j_common_ptr cinfo) { + jmp_buf* env = static_cast(cinfo->client_data); + (*cinfo->err->output_message)(cinfo); + jpeg_destroy_decompress(reinterpret_cast(cinfo)); + longjmp(*env, 1); +} + +void MyOutputMessage(j_common_ptr cinfo) { +#if JXL_DEBUG_WARNING == 1 + char buf[JMSG_LENGTH_MAX]; + (*cinfo->err->format_message)(cinfo, buf); + JXL_WARNING("%s", buf); +#endif +} + +} // namespace +#endif // JPEGXL_ENABLE_JPEG + +Status DecodeImageJPG(const Span bytes, ThreadPool* pool, + CodecInOut* io, double* const elapsed_deinterleave) { + if (elapsed_deinterleave != nullptr) *elapsed_deinterleave = 0; + // Don't do anything for non-JPEG files (no need to report an error) + if (!IsJPG(bytes)) return false; + const DecodeTarget target = io->dec_target; + + // Use brunsli JPEG decoder to read quantized coefficients. + if (target == DecodeTarget::kQuantizedCoeffs) { + return jxl::jpeg::DecodeImageJPG(bytes, io); + } + +#if JPEGXL_ENABLE_JPEG + // TODO(veluca): use JPEGData also for pixels? + + // We need to declare all the non-trivial destructor local variables before + // the call to setjmp(). + ColorEncoding color_encoding; + PaddedBytes icc; + Image3F image; + std::unique_ptr row; + ImageBundle bundle(&io->metadata.m); + + const auto try_catch_block = [&]() -> bool { + jpeg_decompress_struct cinfo; +#ifdef MEMORY_SANITIZER + // cinfo is initialized by libjpeg, which we are not instrumenting with + // msan, therefore we need to initialize cinfo here. + memset(&cinfo, 0, sizeof(cinfo)); +#endif + // Setup error handling in jpeg library so we can deal with broken jpegs in + // the fuzzer. 
+ jpeg_error_mgr jerr; + jmp_buf env; + cinfo.err = jpeg_std_error(&jerr); + jerr.error_exit = &MyErrorExit; + jerr.output_message = &MyOutputMessage; + if (setjmp(env)) { + return false; + } + cinfo.client_data = static_cast(&env); + + jpeg_create_decompress(&cinfo); + unsigned char* tmp_jpg_buff = (unsigned char*)malloc(bytes.size()); + memcpy(tmp_jpg_buff, bytes.data(), bytes.size()); + jpeg_mem_src(&cinfo, tmp_jpg_buff, bytes.size()); + jpeg_save_markers(&cinfo, kICCMarker, 0xFFFF); + jpeg_save_markers(&cinfo, kExifMarker, 0xFFFF); + jpeg_read_header(&cinfo, TRUE); + const auto failure = [&cinfo](const char* str) -> Status { + jpeg_abort_decompress(&cinfo); + jpeg_destroy_decompress(&cinfo); + return JXL_FAILURE("%s", str); + }; + if (!VerifyDimensions(&io->constraints, cinfo.image_width, + cinfo.image_height)) { + return failure("image too big"); + } + // Might cause CPU-zip bomb. + if (cinfo.arith_code) { + return failure("arithmetic code JPEGs are not supported"); + } + if (ReadICCProfile(&cinfo, &icc)) { + if (!color_encoding.SetICC(std::move(icc))) { + return failure("read an invalid ICC profile"); + } + } else { + color_encoding = ColorEncoding::SRGB(cinfo.output_components == 1); + } + ReadExif(&cinfo, &io->blobs.exif); + io->metadata.m.SetUintSamples(BITS_IN_JSAMPLE); + io->metadata.m.color_encoding = color_encoding; + int nbcomp = cinfo.num_components; + if (nbcomp != 1 && nbcomp != 3) { + return failure("unsupported number of components in JPEG"); + } + (void)io->dec_hints.Foreach( + [](const std::string& key, const std::string& /*value*/) { + JXL_WARNING("JPEG decoder ignoring %s hint", key.c_str()); + return true; + }); + + jpeg_start_decompress(&cinfo); + JXL_ASSERT(cinfo.output_components == nbcomp); + image = Image3F(cinfo.image_width, cinfo.image_height); + row.reset(new JSAMPLE[cinfo.output_components * cinfo.image_width]); + for (size_t y = 0; y < image.ysize(); ++y) { + JSAMPROW rows[] = {row.get()}; + jpeg_read_scanlines(&cinfo, rows, 1); + 
msan::UnpoisonMemory( + row.get(), + sizeof(JSAMPLE) * cinfo.output_components * cinfo.image_width); + auto start = Now(); + float* const JXL_RESTRICT output_row[] = { + image.PlaneRow(0, y), image.PlaneRow(1, y), image.PlaneRow(2, y)}; + if (cinfo.output_components == 1) { + for (size_t x = 0; x < image.xsize(); ++x) { + output_row[0][x] = output_row[1][x] = output_row[2][x] = + row[x] * (1.f / kJPEGSampleMultiplier); + } + } else { // 3 components + for (size_t x = 0; x < image.xsize(); ++x) { + for (size_t c = 0; c < 3; ++c) { + output_row[c][x] = row[3 * x + c] * (1.f / kJPEGSampleMultiplier); + } + } + } + auto end = Now(); + if (elapsed_deinterleave != nullptr) { + *elapsed_deinterleave += end - start; + } + } + io->SetFromImage(std::move(image), color_encoding); + + jpeg_finish_decompress(&cinfo); + jpeg_destroy_decompress(&cinfo); + io->dec_pixels = io->xsize() * io->ysize(); + return true; + }; + + return try_catch_block(); +#else // JPEGXL_ENABLE_JPEG + return JXL_FAILURE("JPEG decoding not enabled at build time."); +#endif // JPEGXL_ENABLE_JPEG +} + +#if JPEGXL_ENABLE_JPEG +Status EncodeWithLibJpeg(const ImageBundle* ib, const CodecInOut* io, + size_t quality, + const YCbCrChromaSubsampling& chroma_subsampling, + PaddedBytes* bytes) { + jpeg_compress_struct cinfo; + // cinfo is initialized by libjpeg, which we are not instrumenting with + // msan. 
+ msan::UnpoisonMemory(&cinfo, sizeof(cinfo)); + jpeg_error_mgr jerr; + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_compress(&cinfo); + unsigned char* buffer = nullptr; + unsigned long size = 0; + jpeg_mem_dest(&cinfo, &buffer, &size); + cinfo.image_width = ib->xsize(); + cinfo.image_height = ib->ysize(); + if (ib->IsGray()) { + cinfo.input_components = 1; + cinfo.in_color_space = JCS_GRAYSCALE; + } else { + cinfo.input_components = 3; + cinfo.in_color_space = JCS_RGB; + } + jpeg_set_defaults(&cinfo); + cinfo.optimize_coding = TRUE; + if (cinfo.input_components == 3) { + JXL_RETURN_IF_ERROR(SetChromaSubsampling(chroma_subsampling, &cinfo)); + } + jpeg_set_quality(&cinfo, quality, TRUE); + jpeg_start_compress(&cinfo, TRUE); + if (!ib->IsSRGB()) { + WriteICCProfile(&cinfo, ib->c_current().ICC()); + } + WriteExif(&cinfo, io->blobs.exif); + if (cinfo.input_components > 3 || cinfo.input_components < 0) + return JXL_FAILURE("invalid numbers of components"); + + std::unique_ptr row( + new JSAMPLE[cinfo.input_components * cinfo.image_width]); + for (size_t y = 0; y < ib->ysize(); ++y) { + const float* const JXL_RESTRICT input_row[3] = { + ib->color().ConstPlaneRow(0, y), ib->color().ConstPlaneRow(1, y), + ib->color().ConstPlaneRow(2, y)}; + for (size_t x = 0; x < ib->xsize(); ++x) { + for (size_t c = 0; c < static_cast(cinfo.input_components); ++c) { + JXL_RETURN_IF_ERROR(c < 3); + row[cinfo.input_components * x + c] = static_cast( + std::max(std::min(kJPEGSampleMultiplier * input_row[c][x] + .5f, + kJPEGSampleMax), + kJPEGSampleMin)); + } + } + JSAMPROW rows[] = {row.get()}; + jpeg_write_scanlines(&cinfo, rows, 1); + } + jpeg_finish_compress(&cinfo); + jpeg_destroy_compress(&cinfo); + bytes->resize(size); + // Compressed image data is initialized by libjpeg, which we are not + // instrumenting with msan. 
+ msan::UnpoisonMemory(buffer, size); + std::copy_n(buffer, size, bytes->data()); + std::free(buffer); + return true; +} + +Status EncodeWithSJpeg(const ImageBundle* ib, size_t quality, + const YCbCrChromaSubsampling& chroma_subsampling, + PaddedBytes* bytes) { +#if !JPEGXL_ENABLE_SJPEG + return JXL_FAILURE("JPEG XL was built without sjpeg support"); +#else + sjpeg::EncoderParam param(quality); + if (!ib->IsSRGB()) { + param.iccp.assign(ib->metadata()->color_encoding.ICC().begin(), + ib->metadata()->color_encoding.ICC().end()); + } + if (chroma_subsampling.Is444()) { + param.yuv_mode = SJPEG_YUV_444; + } else if (chroma_subsampling.Is420()) { + param.yuv_mode = SJPEG_YUV_SHARP; + } else { + return JXL_FAILURE("sjpeg does not support this chroma subsampling mode"); + } + std::vector rgb; + rgb.reserve(ib->xsize() * ib->ysize() * 3); + for (size_t y = 0; y < ib->ysize(); ++y) { + const float* const rows[] = { + ib->color().ConstPlaneRow(0, y), ib->color().ConstPlaneRow(1, y), + ib->color().ConstPlaneRow(2, y), + }; + for (size_t x = 0; x < ib->xsize(); ++x) { + for (const float* const row : rows) { + rgb.push_back(static_cast( + std::max(0.f, std::min(255.f, roundf(255.f * row[x]))))); + } + } + } + std::string output; + JXL_RETURN_IF_ERROR(sjpeg::Encode(rgb.data(), ib->xsize(), ib->ysize(), + ib->xsize() * 3, param, &output)); + bytes->assign( + reinterpret_cast(output.data()), + reinterpret_cast(output.data() + output.size())); + return true; +#endif +} +#endif // JPEGXL_ENABLE_JPEG + +Status EncodeImageJPG(const CodecInOut* io, JpegEncoder encoder, size_t quality, + YCbCrChromaSubsampling chroma_subsampling, + ThreadPool* pool, PaddedBytes* bytes, + const DecodeTarget target) { + if (io->Main().HasAlpha()) { + return JXL_FAILURE("alpha is not supported"); + } + if (quality > 100) { + return JXL_FAILURE("please specify a 0-100 JPEG quality"); + } + + if (target == DecodeTarget::kQuantizedCoeffs) { + auto write = [&bytes](const uint8_t* buf, size_t len) { + 
bytes->append(buf, buf + len); + return len; + }; + return jpeg::WriteJpeg(*io->Main().jpeg_data, write); + } + +#if JPEGXL_ENABLE_JPEG + const ImageBundle* ib; + ImageMetadata metadata = io->metadata.m; + ImageBundle ib_store(&metadata); + JXL_RETURN_IF_ERROR(TransformIfNeeded( + io->Main(), io->metadata.m.color_encoding, pool, &ib_store, &ib)); + + switch (encoder) { + case JpegEncoder::kLibJpeg: + JXL_RETURN_IF_ERROR( + EncodeWithLibJpeg(ib, io, quality, chroma_subsampling, bytes)); + break; + case JpegEncoder::kSJpeg: + JXL_RETURN_IF_ERROR( + EncodeWithSJpeg(ib, quality, chroma_subsampling, bytes)); + break; + default: + return JXL_FAILURE("tried to use an unknown JPEG encoder"); + } + + return true; +#else // JPEGXL_ENABLE_JPEG + return JXL_FAILURE("JPEG pixel encoding not enabled at build time"); +#endif // JPEGXL_ENABLE_JPEG +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.h new file mode 100644 index 0000000000..84041ac86f --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_jpg.h @@ -0,0 +1,47 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_CODEC_JPG_H_ +#define LIB_EXTRAS_CODEC_JPG_H_ + +// Encodes JPG pixels and metadata in memory. + +#include + +#include "lib/extras/codec.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" + +namespace jxl { + +enum class JpegEncoder { + kLibJpeg, + kSJpeg, +}; + +static inline bool IsJPG(const Span bytes) { + if (bytes.size() < 2) return false; + if (bytes[0] != 0xFF || bytes[1] != 0xD8) return false; + return true; +} + +// Decodes `bytes` into `io`. io->dec_hints are ignored. 
+// `elapsed_deinterleave`, if non-null, will be set to the time (in seconds) +// that it took to deinterleave the raw JSAMPLEs to planar floats. +Status DecodeImageJPG(Span bytes, ThreadPool* pool, + CodecInOut* io, double* elapsed_deinterleave = nullptr); + +// Encodes into `bytes`. +Status EncodeImageJPG(const CodecInOut* io, JpegEncoder encoder, size_t quality, + YCbCrChromaSubsampling chroma_subsampling, + ThreadPool* pool, PaddedBytes* bytes, + DecodeTarget target = DecodeTarget::kPixels); + +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_JPG_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc new file mode 100644 index 0000000000..7904cfb4d0 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.cc @@ -0,0 +1,358 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec_pgx.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/fields.h" // AllDefault +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/luminance.h" + +namespace jxl { +namespace { + +struct HeaderPGX { + // NOTE: PGX is always grayscale + size_t xsize; + size_t ysize; + size_t bits_per_sample; + bool big_endian; + bool is_signed; +}; + +class Parser { + public: + explicit Parser(const Span bytes) + : pos_(bytes.data()), end_(pos_ + bytes.size()) {} + + // Sets "pos" to the first non-header byte/pixel on success. 
+ Status ParseHeader(HeaderPGX* header, const uint8_t** pos) { + // codec.cc ensures we have at least two bytes => no range check here. + if (pos_[0] != 'P' || pos_[1] != 'G') return false; + pos_ += 2; + return ParseHeaderPGX(header, pos); + } + + // Exposed for testing + Status ParseUnsigned(size_t* number) { + if (pos_ == end_) return JXL_FAILURE("PGX: reached end before number"); + if (!IsDigit(*pos_)) return JXL_FAILURE("PGX: expected unsigned number"); + + *number = 0; + while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') { + *number *= 10; + *number += *pos_ - '0'; + ++pos_; + } + + return true; + } + + private: + static bool IsDigit(const uint8_t c) { return '0' <= c && c <= '9'; } + static bool IsLineBreak(const uint8_t c) { return c == '\r' || c == '\n'; } + static bool IsWhitespace(const uint8_t c) { + return IsLineBreak(c) || c == '\t' || c == ' '; + } + + Status SkipSpace() { + if (pos_ == end_) return JXL_FAILURE("PGX: reached end before space"); + const uint8_t c = *pos_; + if (c != ' ') return JXL_FAILURE("PGX: expected space"); + ++pos_; + return true; + } + + Status SkipLineBreak() { + if (pos_ == end_) return JXL_FAILURE("PGX: reached end before line break"); + // Line break can be either "\n" (0a) or "\r\n" (0d 0a). 
+ if (*pos_ == '\n') { + pos_++; + return true; + } else if (*pos_ == '\r' && pos_ + 1 != end_ && *(pos_ + 1) == '\n') { + pos_ += 2; + return true; + } + return JXL_FAILURE("PGX: expected line break"); + } + + Status SkipSingleWhitespace() { + if (pos_ == end_) return JXL_FAILURE("PGX: reached end before whitespace"); + if (!IsWhitespace(*pos_)) return JXL_FAILURE("PGX: expected whitespace"); + ++pos_; + return true; + } + + Status ParseHeaderPGX(HeaderPGX* header, const uint8_t** pos) { + JXL_RETURN_IF_ERROR(SkipSpace()); + if (pos_ + 2 > end_) return JXL_FAILURE("PGX: header too small"); + if (*pos_ == 'M' && *(pos_ + 1) == 'L') { + header->big_endian = true; + } else if (*pos_ == 'L' && *(pos_ + 1) == 'M') { + header->big_endian = false; + } else { + return JXL_FAILURE("PGX: invalid endianness"); + } + pos_ += 2; + JXL_RETURN_IF_ERROR(SkipSpace()); + if (pos_ == end_) return JXL_FAILURE("PGX: header too small"); + if (*pos_ == '+') { + header->is_signed = false; + } else if (*pos_ == '-') { + header->is_signed = true; + } else { + return JXL_FAILURE("PGX: invalid signedness"); + } + pos_++; + // Skip optional space + if (pos_ < end_ && *pos_ == ' ') pos_++; + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->bits_per_sample)); + JXL_RETURN_IF_ERROR(SkipSingleWhitespace()); + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize)); + JXL_RETURN_IF_ERROR(SkipSingleWhitespace()); + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize)); + // 0xa, or 0xd 0xa. + JXL_RETURN_IF_ERROR(SkipLineBreak()); + + if (header->bits_per_sample > 16) { + return JXL_FAILURE("PGX: >16 bits not yet supported"); + } + // TODO(lode): support signed integers. This may require changing the way + // external_image works. + if (header->is_signed) { + return JXL_FAILURE("PGX: signed not yet supported"); + } + + size_t numpixels = header->xsize * header->ysize; + size_t bytes_per_pixel = header->bits_per_sample <= 8 + ? 1 + : header->bits_per_sample <= 16 ? 
2 : 4; + if (pos_ + numpixels * bytes_per_pixel > end_) { + return JXL_FAILURE("PGX: data too small"); + } + + *pos = pos_; + return true; + } + + const uint8_t* pos_; + const uint8_t* const end_; +}; + +constexpr size_t kMaxHeaderSize = 200; + +Status EncodeHeader(const ImageBundle& ib, const size_t bits_per_sample, + char* header, int* JXL_RESTRICT chars_written) { + if (ib.HasAlpha()) return JXL_FAILURE("PGX: can't store alpha"); + if (!ib.IsGray()) return JXL_FAILURE("PGX: must be grayscale"); + // TODO(lode): verify other bit depths: for other bit depths such as 1 or 4 + // bits, have a test case to verify it works correctly. For bits > 16, we may + // need to change the way external_image works. + if (bits_per_sample != 8 && bits_per_sample != 16) { + return JXL_FAILURE("PGX: bits other than 8 or 16 not yet supported"); + } + + // Use ML (Big Endian), LM may not be well supported by all decoders. + snprintf(header, kMaxHeaderSize, "PG ML + %zu %zu %zu\n%n", bits_per_sample, + ib.xsize(), ib.ysize(), chars_written); + return true; +} + +Status ApplyHints(CodecInOut* io) { + bool got_color_space = false; + + JXL_RETURN_IF_ERROR(io->dec_hints.Foreach( + [io, &got_color_space](const std::string& key, + const std::string& value) -> Status { + ColorEncoding* c_original = &io->metadata.m.color_encoding; + if (key == "color_space") { + if (!ParseDescription(value, c_original) || + !c_original->CreateICC()) { + return JXL_FAILURE("PGX: Failed to apply color_space"); + } + + if (!io->metadata.m.color_encoding.IsGray()) { + return JXL_FAILURE("PGX: color_space hint must be grayscale"); + } + + got_color_space = true; + } else if (key == "icc_pathname") { + PaddedBytes icc; + JXL_RETURN_IF_ERROR(ReadFile(value, &icc)); + JXL_RETURN_IF_ERROR(c_original->SetICC(std::move(icc))); + got_color_space = true; + } else { + JXL_WARNING("PGX decoder ignoring %s hint", key.c_str()); + } + return true; + })); + + if (!got_color_space) { + JXL_WARNING("PGX: no 
color_space/icc_pathname given, assuming sRGB"); + JXL_RETURN_IF_ERROR( + io->metadata.m.color_encoding.SetSRGB(ColorSpace::kGray)); + } + + return true; +} + +template +void ExpectNear(T a, T b, T precision) { + JXL_CHECK(std::abs(a - b) <= precision); +} + +Span MakeSpan(const char* str) { + return Span(reinterpret_cast(str), + strlen(str)); +} + +} // namespace + +Status DecodeImagePGX(const Span bytes, ThreadPool* pool, + CodecInOut* io) { + Parser parser(bytes); + HeaderPGX header = {}; + const uint8_t* pos; + if (!parser.ParseHeader(&header, &pos)) return false; + JXL_RETURN_IF_ERROR( + VerifyDimensions(&io->constraints, header.xsize, header.ysize)); + if (header.bits_per_sample == 0 || header.bits_per_sample > 32) { + return JXL_FAILURE("PGX: bits_per_sample invalid"); + } + + JXL_RETURN_IF_ERROR(ApplyHints(io)); + io->metadata.m.SetUintSamples(header.bits_per_sample); + io->metadata.m.SetAlphaBits(0); + io->dec_pixels = header.xsize * header.ysize; + io->SetSize(header.xsize, header.ysize); + io->frames.clear(); + io->frames.reserve(1); + ImageBundle ib(&io->metadata.m); + + const bool has_alpha = false; + const bool flipped_y = false; + const Span span(pos, bytes.data() + bytes.size() - pos); + JXL_RETURN_IF_ERROR(ConvertFromExternal( + span, header.xsize, header.ysize, io->metadata.m.color_encoding, + has_alpha, + /*alpha_is_premultiplied=*/false, + io->metadata.m.bit_depth.bits_per_sample, + header.big_endian ? 
JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN, flipped_y, pool, + &ib)); + io->frames.push_back(std::move(ib)); + SetIntensityTarget(io); + return true; +} + +Status EncodeImagePGX(const CodecInOut* io, const ColorEncoding& c_desired, + size_t bits_per_sample, ThreadPool* pool, + PaddedBytes* bytes) { + if (!Bundle::AllDefault(io->metadata.m)) { + JXL_WARNING("PGX encoder ignoring metadata - use a different codec"); + } + if (!c_desired.IsSRGB()) { + JXL_WARNING( + "PGX encoder cannot store custom ICC profile; decoder\n" + "will need hint key=color_space to get the same values"); + } + + ImageBundle ib = io->Main().Copy(); + + ImageMetadata metadata = io->metadata.m; + ImageBundle store(&metadata); + const ImageBundle* transformed; + JXL_RETURN_IF_ERROR( + TransformIfNeeded(ib, c_desired, pool, &store, &transformed)); + PaddedBytes pixels(ib.xsize() * ib.ysize() * + (bits_per_sample / kBitsPerByte)); + size_t stride = ib.xsize() * (bits_per_sample / kBitsPerByte); + JXL_RETURN_IF_ERROR( + ConvertToExternal(*transformed, bits_per_sample, + /*float_out=*/false, + /*num_channels=*/1, JXL_BIG_ENDIAN, stride, pool, + pixels.data(), pixels.size(), /*out_callback=*/nullptr, + /*out_opaque=*/nullptr, metadata.GetOrientation())); + + char header[kMaxHeaderSize]; + int header_size = 0; + JXL_RETURN_IF_ERROR(EncodeHeader(ib, bits_per_sample, header, &header_size)); + + bytes->resize(static_cast(header_size) + pixels.size()); + memcpy(bytes->data(), header, static_cast(header_size)); + memcpy(bytes->data() + header_size, pixels.data(), pixels.size()); + + return true; +} + +void TestCodecPGX() { + { + std::string pgx = "PG ML + 8 2 3\npixels"; + + CodecInOut io; + ThreadPool* pool = nullptr; + + Status ok = DecodeImagePGX(MakeSpan(pgx.c_str()), pool, &io); + JXL_CHECK(ok == true); + + ScaleImage(255.f, io.Main().color()); + + JXL_CHECK(!io.metadata.m.bit_depth.floating_point_sample); + JXL_CHECK(io.metadata.m.bit_depth.bits_per_sample == 8); + 
JXL_CHECK(io.metadata.m.color_encoding.IsGray()); + JXL_CHECK(io.xsize() == 2); + JXL_CHECK(io.ysize() == 3); + float eps = 1e-5; + ExpectNear('p', io.Main().color()->Plane(0).Row(0)[0], eps); + ExpectNear('i', io.Main().color()->Plane(0).Row(0)[1], eps); + ExpectNear('x', io.Main().color()->Plane(0).Row(1)[0], eps); + ExpectNear('e', io.Main().color()->Plane(0).Row(1)[1], eps); + ExpectNear('l', io.Main().color()->Plane(0).Row(2)[0], eps); + ExpectNear('s', io.Main().color()->Plane(0).Row(2)[1], eps); + } + + { + std::string pgx = "PG ML + 16 2 3\np_i_x_e_l_s_"; + + CodecInOut io; + ThreadPool* pool = nullptr; + + Status ok = DecodeImagePGX(MakeSpan(pgx.c_str()), pool, &io); + JXL_CHECK(ok == true); + + ScaleImage(255.f, io.Main().color()); + + JXL_CHECK(!io.metadata.m.bit_depth.floating_point_sample); + JXL_CHECK(io.metadata.m.bit_depth.bits_per_sample == 16); + JXL_CHECK(io.metadata.m.color_encoding.IsGray()); + JXL_CHECK(io.xsize() == 2); + JXL_CHECK(io.ysize() == 3); + float eps = 1e-7; + const auto& plane = io.Main().color()->Plane(0); + ExpectNear(256.0f * 'p' + '_', plane.Row(0)[0] * 257, eps); + ExpectNear(256.0f * 'i' + '_', plane.Row(0)[1] * 257, eps); + ExpectNear(256.0f * 'x' + '_', plane.Row(1)[0] * 257, eps); + ExpectNear(256.0f * 'e' + '_', plane.Row(1)[1] * 257, eps); + ExpectNear(256.0f * 'l' + '_', plane.Row(2)[0] * 257, eps); + ExpectNear(256.0f * 's' + '_', plane.Row(2)[1] * 257, eps); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.h new file mode 100644 index 0000000000..deb76da0e4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pgx.h @@ -0,0 +1,36 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_EXTRAS_CODEC_PGX_H_ +#define LIB_EXTRAS_CODEC_PGX_H_ + +// Encodes/decodes PGX pixels in memory. + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" + +namespace jxl { + +// Decodes `bytes` into `io`. io->dec_hints may specify "color_space", which +// defaults to sRGB. +Status DecodeImagePGX(const Span bytes, ThreadPool* pool, + CodecInOut* io); + +// Transforms from io->c_current to `c_desired` and encodes into `bytes`. +Status EncodeImagePGX(const CodecInOut* io, const ColorEncoding& c_desired, + size_t bits_per_sample, ThreadPool* pool, + PaddedBytes* bytes); + +void TestCodecPGX(); +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_PGX_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc new file mode 100644 index 0000000000..f6fabd865b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.cc @@ -0,0 +1,872 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec_png.h" + +#include +#include +#include +#include + +// Lodepng library: +#include + +#include +#include +#include +#include +#include + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/luminance.h" + +namespace jxl { +namespace { + +#define JXL_PNG_VERBOSE 0 + +// Retrieves XMP and EXIF/IPTC from itext and text. 
+class BlobsReaderPNG { + public: + static Status Decode(const LodePNGInfo& info, Blobs* blobs) { + for (unsigned idx_itext = 0; idx_itext < info.itext_num; ++idx_itext) { + // We trust these are properly null-terminated by LodePNG. + const char* key = info.itext_keys[idx_itext]; + const char* value = info.itext_strings[idx_itext]; + if (strstr(key, "XML:com.adobe.xmp")) { + blobs->xmp.resize(strlen(value)); // safe, see above + memcpy(blobs->xmp.data(), value, blobs->xmp.size()); + } + } + + for (unsigned idx_text = 0; idx_text < info.text_num; ++idx_text) { + // We trust these are properly null-terminated by LodePNG. + const char* key = info.text_keys[idx_text]; + const char* value = info.text_strings[idx_text]; + std::string type; + PaddedBytes bytes; + + // Handle text chunks annotated with key "Raw profile type ####", with + // #### a type, which may contain metadata. + const char* kKey = "Raw profile type "; + if (strncmp(key, kKey, strlen(kKey)) != 0) continue; + + if (!MaybeDecodeBase16(key, value, &type, &bytes)) { + JXL_WARNING("Couldn't parse 'Raw format type' text chunk"); + continue; + } + if (type == "exif") { + if (!blobs->exif.empty()) { + JXL_WARNING("overwriting EXIF (%zu bytes) with base16 (%zu bytes)", + blobs->exif.size(), bytes.size()); + } + blobs->exif = std::move(bytes); + } else if (type == "iptc") { + // TODO (jon): Deal with IPTC in some way + } else if (type == "8bim") { + // TODO (jon): Deal with 8bim in some way + } else if (type == "xmp") { + if (!blobs->xmp.empty()) { + JXL_WARNING("overwriting XMP (%zu bytes) with base16 (%zu bytes)", + blobs->xmp.size(), bytes.size()); + } + blobs->xmp = std::move(bytes); + } else { + JXL_WARNING( + "Unknown type in 'Raw format type' text chunk: %s: %zu bytes", + type.c_str(), bytes.size()); + } + } + + return true; + } + + private: + // Returns false if invalid. 
+ static JXL_INLINE Status DecodeNibble(const char c, + uint32_t* JXL_RESTRICT nibble) { + if ('a' <= c && c <= 'f') { + *nibble = 10 + c - 'a'; + } else if ('0' <= c && c <= '9') { + *nibble = c - '0'; + } else { + *nibble = 0; + return JXL_FAILURE("Invalid metadata nibble"); + } + JXL_ASSERT(*nibble < 16); + return true; + } + + // Parses a PNG text chunk with key of the form "Raw profile type ####", with + // #### a type. + // Returns whether it could successfully parse the content. + // We trust key and encoded are null-terminated because they come from + // LodePNG. + static Status MaybeDecodeBase16(const char* key, const char* encoded, + std::string* type, PaddedBytes* bytes) { + const char* encoded_end = encoded + strlen(encoded); + + const char* kKey = "Raw profile type "; + if (strncmp(key, kKey, strlen(kKey)) != 0) return false; + *type = key + strlen(kKey); + const size_t kMaxTypeLen = 20; + if (type->length() > kMaxTypeLen) return false; // Type too long + + // Header: freeform string and number of bytes + unsigned long bytes_to_decode; + int header_len; + std::vector description((encoded_end - encoded) + 1); + const int fields = sscanf(encoded, "\n%[^\n]\n%8lu%n", description.data(), + &bytes_to_decode, &header_len); + if (fields != 2) return false; // Failed to decode metadata header + JXL_ASSERT(bytes->empty()); + bytes->reserve(bytes_to_decode); + + // Encoding: base16 with newline after 72 chars. 
+ const char* pos = encoded + header_len; + for (size_t i = 0; i < bytes_to_decode; ++i) { + if (i % 36 == 0) { + if (pos + 1 >= encoded_end) return false; // Truncated base16 1 + if (*pos != '\n') return false; // Expected newline + ++pos; + } + + if (pos + 2 >= encoded_end) return false; // Truncated base16 2; + uint32_t nibble0, nibble1; + JXL_RETURN_IF_ERROR(DecodeNibble(pos[0], &nibble0)); + JXL_RETURN_IF_ERROR(DecodeNibble(pos[1], &nibble1)); + bytes->push_back(static_cast((nibble0 << 4) + nibble1)); + pos += 2; + } + if (pos + 1 != encoded_end) return false; // Too many encoded bytes + if (pos[0] != '\n') return false; // Incorrect metadata terminator + return true; + } +}; + +// Stores XMP and EXIF/IPTC into itext and text. +class BlobsWriterPNG { + public: + static Status Encode(const Blobs& blobs, LodePNGInfo* JXL_RESTRICT info) { + if (!blobs.exif.empty()) { + JXL_RETURN_IF_ERROR(EncodeBase16("exif", blobs.exif, info)); + } + if (!blobs.iptc.empty()) { + JXL_RETURN_IF_ERROR(EncodeBase16("iptc", blobs.iptc, info)); + } + + if (!blobs.xmp.empty()) { + JXL_RETURN_IF_ERROR(EncodeBase16("xmp", blobs.xmp, info)); + + // Below is the official way, but it does not seem to work in ImageMagick. + // Exiv2 and exiftool are OK with either way of encoding XMP. + if (/* DISABLES CODE */ (0)) { + const char* key = "XML:com.adobe.xmp"; + const std::string text(reinterpret_cast(blobs.xmp.data()), + blobs.xmp.size()); + if (lodepng_add_itext(info, key, "", "", text.c_str()) != 0) { + return JXL_FAILURE("Failed to add itext"); + } + } + } + + return true; + } + + private: + static JXL_INLINE char EncodeNibble(const uint8_t nibble) { + JXL_ASSERT(nibble < 16); + return (nibble < 10) ? '0' + nibble : 'a' + nibble - 10; + } + + static Status EncodeBase16(const std::string& type, const PaddedBytes& bytes, + LodePNGInfo* JXL_RESTRICT info) { + // Encoding: base16 with newline after 72 chars. 
+ const size_t base16_size = + 2 * bytes.size() + DivCeil(bytes.size(), size_t(36)) + 1; + std::string base16; + base16.reserve(base16_size); + for (size_t i = 0; i < bytes.size(); ++i) { + if (i % 36 == 0) base16.push_back('\n'); + base16.push_back(EncodeNibble(bytes[i] >> 4)); + base16.push_back(EncodeNibble(bytes[i] & 0x0F)); + } + base16.push_back('\n'); + JXL_ASSERT(base16.length() == base16_size); + + char key[30]; + snprintf(key, sizeof(key), "Raw profile type %s", type.c_str()); + + char header[30]; + snprintf(header, sizeof(header), "\n%s\n%8zu", type.c_str(), bytes.size()); + + const std::string& encoded = std::string(header) + base16; + if (lodepng_add_text(info, key, encoded.c_str()) != 0) { + return JXL_FAILURE("Failed to add text"); + } + + return true; + } +}; + +// Retrieves ColorEncoding from PNG chunks. +class ColorEncodingReaderPNG { + public: + // Fills original->color_encoding or returns false. + Status operator()(const Span bytes, const bool is_gray, + CodecInOut* io) { + ColorEncoding* c_original = &io->metadata.m.color_encoding; + JXL_RETURN_IF_ERROR(Decode(bytes, &io->blobs)); + + const ColorSpace color_space = + is_gray ? ColorSpace::kGray : ColorSpace::kRGB; + + if (have_pq_) { + c_original->SetColorSpace(color_space); + c_original->white_point = WhitePoint::kD65; + c_original->primaries = Primaries::k2100; + c_original->tf.SetTransferFunction(TransferFunction::kPQ); + c_original->rendering_intent = RenderingIntent::kRelative; + if (c_original->CreateICC()) return true; + JXL_WARNING("Failed to synthesize BT.2100 PQ"); + // Else: try the actual ICC profile. + } + + // ICC overrides anything else if present. + if (c_original->SetICC(std::move(icc_))) { + if (have_srgb_) { + JXL_WARNING("Invalid PNG with both sRGB and ICC; ignoring sRGB"); + } + if (is_gray != c_original->IsGray()) { + return JXL_FAILURE("Mismatch between ICC and PNG header grayscale"); + } + return true; // it's fine to ignore gAMA/cHRM. 
+ } + + // PNG requires that sRGB override gAMA/cHRM. + if (have_srgb_) { + return c_original->SetSRGB(color_space, rendering_intent_); + } + + // Try to create a custom profile: + + c_original->SetColorSpace(color_space); + + // Attempt to set whitepoint and primaries if there is a cHRM chunk, or else + // use default sRGB (the PNG then is device-dependent). + // In case of grayscale, do not attempt to set the primaries and ignore the + // ones the PNG image has (but still set the white point). + if (!have_chrm_ || !c_original->SetWhitePoint(white_point_) || + (!is_gray && !c_original->SetPrimaries(primaries_))) { +#if JXL_PNG_VERBOSE >= 1 + JXL_WARNING("No (valid) cHRM, assuming sRGB"); +#endif + c_original->white_point = WhitePoint::kD65; + c_original->primaries = Primaries::kSRGB; + } + + if (!have_gama_ || !c_original->tf.SetGamma(gamma_)) { +#if JXL_PNG_VERBOSE >= 1 + JXL_WARNING("No (valid) gAMA nor sRGB, assuming sRGB"); +#endif + c_original->tf.SetTransferFunction(TransferFunction::kSRGB); + } + + c_original->rendering_intent = RenderingIntent::kRelative; + if (c_original->CreateICC()) return true; + + JXL_WARNING( + "DATA LOSS: unable to create an ICC profile for PNG gAMA/cHRM.\n" + "Image pixels will be interpreted as sRGB. Please add an ICC \n" + "profile to the input image"); + return c_original->SetSRGB(color_space); + } + + // Whether the image has any color profile information (ICC chunk, sRGB + // chunk, cHRM chunk, and so on), or has no color information chunks at all. 
+ bool HaveColorProfile() const { + return have_pq_ || have_srgb_ || have_gama_ || have_chrm_ || have_icc_; + } + + private: + Status DecodeICC(const unsigned char* const payload, + const size_t payload_size) { + if (payload_size == 0) return JXL_FAILURE("Empty ICC payload"); + const unsigned char* pos = payload; + const unsigned char* end = payload + payload_size; + + // Profile name + if (*pos == '\0') return JXL_FAILURE("Expected ICC name"); + for (size_t i = 0;; ++i) { + if (i == 80) return JXL_FAILURE("ICC profile name too long"); + if (pos == end) return JXL_FAILURE("Not enough bytes for ICC name"); + if (*pos++ == '\0') break; + } + + // Special case for BT.2100 PQ (https://w3c.github.io/png-hdr-pq/) - try to + // synthesize the profile because table-based curves are less accurate. + // strcmp is safe because we already verified the string is 0-terminated. + if (!strcmp(reinterpret_cast(payload), "ITUR_2100_PQ_FULL")) { + have_pq_ = true; + } + + // Skip over compression method (only one is allowed) + if (pos == end) return JXL_FAILURE("Not enough bytes for ICC method"); + if (*pos++ != 0) return JXL_FAILURE("Unsupported ICC method"); + + // Decompress + unsigned char* icc_buf = nullptr; + size_t icc_size = 0; + LodePNGDecompressSettings settings; + lodepng_decompress_settings_init(&settings); + const unsigned err = lodepng_zlib_decompress( + &icc_buf, &icc_size, pos, payload_size - (pos - payload), &settings); + if (err == 0) { + icc_.resize(icc_size); + memcpy(icc_.data(), icc_buf, icc_size); + } + free(icc_buf); + have_icc_ = true; + return true; + } + + // Returns floating-point value from the PNG encoding (times 10^5). + static double F64FromU32(const uint32_t x) { + return static_cast(x) * 1E-5; + } + + Status DecodeSRGB(const unsigned char* payload, const size_t payload_size) { + if (payload_size != 1) return JXL_FAILURE("Wrong sRGB size"); + // (PNG uses the same values as ICC.) 
+ if (payload[0] >= 4) return JXL_FAILURE("Invalid Rendering Intent"); + rendering_intent_ = static_cast(payload[0]); + have_srgb_ = true; + return true; + } + + Status DecodeGAMA(const unsigned char* payload, const size_t payload_size) { + if (payload_size != 4) return JXL_FAILURE("Wrong gAMA size"); + gamma_ = F64FromU32(LoadBE32(payload)); + have_gama_ = true; + return true; + } + + Status DecodeCHRM(const unsigned char* payload, const size_t payload_size) { + if (payload_size != 32) return JXL_FAILURE("Wrong cHRM size"); + white_point_.x = F64FromU32(LoadBE32(payload + 0)); + white_point_.y = F64FromU32(LoadBE32(payload + 4)); + primaries_.r.x = F64FromU32(LoadBE32(payload + 8)); + primaries_.r.y = F64FromU32(LoadBE32(payload + 12)); + primaries_.g.x = F64FromU32(LoadBE32(payload + 16)); + primaries_.g.y = F64FromU32(LoadBE32(payload + 20)); + primaries_.b.x = F64FromU32(LoadBE32(payload + 24)); + primaries_.b.y = F64FromU32(LoadBE32(payload + 28)); + have_chrm_ = true; + return true; + } + + Status DecodeEXIF(const unsigned char* payload, const size_t payload_size, + Blobs* blobs) { + // If we already have EXIF, keep the larger one. + if (blobs->exif.size() > payload_size) return true; + blobs->exif.resize(payload_size); + memcpy(blobs->exif.data(), payload, payload_size); + return true; + } + + Status Decode(const Span bytes, Blobs* blobs) { + // Look for colorimetry and text chunks in the PNG image. The PNG chunks + // begin after the PNG magic header of 8 bytes. + const unsigned char* chunk = bytes.data() + 8; + const unsigned char* end = bytes.data() + bytes.size(); + for (;;) { + // chunk points to the first field of a PNG chunk. The chunk has + // respectively 4 bytes of length, 4 bytes type, length bytes of data, + // 4 bytes CRC. + if (chunk + 4 >= end) { + break; // Regular end reached. 
+ } + + char type_char[5]; + if (chunk + 8 >= end) { + JXL_NOTIFY_ERROR("PNG: malformed chunk"); + break; + } + lodepng_chunk_type(type_char, chunk); + std::string type = type_char; + + if (type == "acTL" || type == "fcTL" || type == "fdAT") { + // this is an APNG file, without proper handling we would just return + // the first frame, so for now codec_apng handles animation until the + // animation chunk handling is added here + return false; + } + if (type == "eXIf" || type == "iCCP" || type == "sRGB" || + type == "gAMA" || type == "cHRM") { + const unsigned char* payload = lodepng_chunk_data_const(chunk); + const size_t payload_size = lodepng_chunk_length(chunk); + // The entire chunk needs also 4 bytes of CRC after the payload. + if (payload + payload_size + 4 >= end) { + JXL_NOTIFY_ERROR("PNG: truncated chunk"); + break; + } + if (lodepng_chunk_check_crc(chunk) != 0) { + JXL_NOTIFY_ERROR("CRC mismatch in unknown PNG chunk"); + chunk = lodepng_chunk_next_const(chunk, end); + continue; + } + + if (type == "eXIf") { + JXL_RETURN_IF_ERROR(DecodeEXIF(payload, payload_size, blobs)); + } else if (type == "iCCP") { + JXL_RETURN_IF_ERROR(DecodeICC(payload, payload_size)); + } else if (type == "sRGB") { + JXL_RETURN_IF_ERROR(DecodeSRGB(payload, payload_size)); + } else if (type == "gAMA") { + JXL_RETURN_IF_ERROR(DecodeGAMA(payload, payload_size)); + } else if (type == "cHRM") { + JXL_RETURN_IF_ERROR(DecodeCHRM(payload, payload_size)); + } + } + + chunk = lodepng_chunk_next_const(chunk, end); + } + return true; + } + + PaddedBytes icc_; + + bool have_pq_ = false; + bool have_srgb_ = false; + bool have_gama_ = false; + bool have_chrm_ = false; + bool have_icc_ = false; + + // Only valid if have_srgb_: + RenderingIntent rendering_intent_; + + // Only valid if have_gama_: + double gamma_; + + // Only valid if have_chrm_: + CIExy white_point_; + PrimariesCIExy primaries_; +}; + +Status ApplyHints(const bool is_gray, CodecInOut* io) { + bool got_color_space = false; + + 
JXL_RETURN_IF_ERROR(io->dec_hints.Foreach( + [is_gray, io, &got_color_space](const std::string& key, + const std::string& value) -> Status { + ColorEncoding* c_original = &io->metadata.m.color_encoding; + if (key == "color_space") { + if (!ParseDescription(value, c_original) || + !c_original->CreateICC()) { + return JXL_FAILURE("PNG: Failed to apply color_space"); + } + + if (is_gray != io->metadata.m.color_encoding.IsGray()) { + return JXL_FAILURE( + "PNG: mismatch between file and color_space hint"); + } + + got_color_space = true; + } else if (key == "icc_pathname") { + PaddedBytes icc; + JXL_RETURN_IF_ERROR(ReadFile(value, &icc)); + JXL_RETURN_IF_ERROR(c_original->SetICC(std::move(icc))); + got_color_space = true; + } else { + JXL_WARNING("PNG decoder ignoring %s hint", key.c_str()); + } + return true; + })); + + if (!got_color_space) { + JXL_WARNING("PNG: no color_space/icc_pathname given, assuming sRGB"); + JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetSRGB( + is_gray ? ColorSpace::kGray : ColorSpace::kRGB)); + } + + return true; +} + +// Stores ColorEncoding into PNG chunks. +class ColorEncodingWriterPNG { + public: + static Status Encode(const ColorEncoding& c, LodePNGInfo* JXL_RESTRICT info) { + // Prefer to only write sRGB - smaller. + if (c.IsSRGB()) { + JXL_RETURN_IF_ERROR(AddSRGB(c, info)); + // PNG recommends not including both sRGB and iCCP, so skip the latter. + } else if (!c.HaveFields() || !c.tf.IsGamma()) { + // Having a gamma value means that the source was a PNG with gAMA and + // without iCCP. + JXL_ASSERT(!c.ICC().empty()); + JXL_RETURN_IF_ERROR(AddICC(c.ICC(), info)); + } + + // gAMA and cHRM are always allowed but will be overridden by sRGB/iCCP. 
+ JXL_RETURN_IF_ERROR(MaybeAddGAMA(c, info)); + JXL_RETURN_IF_ERROR(MaybeAddCHRM(c, info)); + return true; + } + + private: + static Status AddChunk(const char* type, const PaddedBytes& payload, + LodePNGInfo* JXL_RESTRICT info) { + // Ignore original location/order of chunks; place them in the first group. + if (lodepng_chunk_create(&info->unknown_chunks_data[0], + &info->unknown_chunks_size[0], payload.size(), + type, payload.data()) != 0) { + return JXL_FAILURE("Failed to add chunk"); + } + return true; + } + + static Status AddICC(const PaddedBytes& icc, LodePNGInfo* JXL_RESTRICT info) { + LodePNGCompressSettings settings; + lodepng_compress_settings_init(&settings); + unsigned char* out = nullptr; + size_t out_size = 0; + if (lodepng_zlib_compress(&out, &out_size, icc.data(), icc.size(), + &settings) != 0) { + return JXL_FAILURE("Failed to compress ICC"); + } + + PaddedBytes payload; + payload.resize(3 + out_size); + // TODO(janwas): use special name if PQ + payload[0] = '1'; // profile name + payload[1] = '\0'; + payload[2] = 0; // compression method (zlib) + memcpy(&payload[3], out, out_size); + free(out); + + return AddChunk("iCCP", payload, info); + } + + static Status AddSRGB(const ColorEncoding& c, + LodePNGInfo* JXL_RESTRICT info) { + PaddedBytes payload; + payload.push_back(static_cast(c.rendering_intent)); + return AddChunk("sRGB", payload, info); + } + + // Returns PNG encoding of floating-point value (times 10^5). 
+ static uint32_t U32FromF64(const double x) { + return static_cast(roundf(x * 1E5)); + } + + static Status MaybeAddGAMA(const ColorEncoding& c, + LodePNGInfo* JXL_RESTRICT info) { + double gamma; + if (c.tf.IsGamma()) { + gamma = c.tf.GetGamma(); + } else if (c.tf.IsLinear()) { + gamma = 1; + } else if (c.tf.IsSRGB()) { + gamma = 0.45455; + } else { + return true; + } + + PaddedBytes payload(4); + StoreBE32(U32FromF64(gamma), payload.data()); + return AddChunk("gAMA", payload, info); + } + + static Status MaybeAddCHRM(const ColorEncoding& c, + LodePNGInfo* JXL_RESTRICT info) { + CIExy white_point = c.GetWhitePoint(); + // A PNG image stores both whitepoint and primaries in the cHRM chunk, but + // for grayscale images we don't have primaries. It does not matter what + // values are stored in the PNG though (all colors are a multiple of the + // whitepoint), so choose default ones. See + // http://www.libpng.org/pub/png/spec/1.2/PNG-Chunks.html section 4.2.2.1. + PrimariesCIExy primaries = + c.IsGray() ? ColorEncoding().GetPrimaries() : c.GetPrimaries(); + + if (c.primaries == Primaries::kSRGB && c.white_point == WhitePoint::kD65) { + // For sRGB, the cHRM chunk is supposed to have very specific values which + // don't quite match the pre-quantized ones we have (red is off by + // 0.00010). Technically, this is only required for full sRGB, but for + // consistency, we might as well use them whenever the primaries and white + // point are sRGB's. 
+ white_point.x = 0.31270; + white_point.y = 0.32900; + primaries.r.x = 0.64000; + primaries.r.y = 0.33000; + primaries.g.x = 0.30000; + primaries.g.y = 0.60000; + primaries.b.x = 0.15000; + primaries.b.y = 0.06000; + } + + PaddedBytes payload(32); + StoreBE32(U32FromF64(white_point.x), &payload[0]); + StoreBE32(U32FromF64(white_point.y), &payload[4]); + StoreBE32(U32FromF64(primaries.r.x), &payload[8]); + StoreBE32(U32FromF64(primaries.r.y), &payload[12]); + StoreBE32(U32FromF64(primaries.g.x), &payload[16]); + StoreBE32(U32FromF64(primaries.g.y), &payload[20]); + StoreBE32(U32FromF64(primaries.b.x), &payload[24]); + StoreBE32(U32FromF64(primaries.b.y), &payload[28]); + return AddChunk("cHRM", payload, info); + } +}; + +// RAII - ensures state is freed even if returning early. +struct PNGState { + PNGState() { lodepng_state_init(&s); } + ~PNGState() { lodepng_state_cleanup(&s); } + + LodePNGState s; +}; + +Status CheckGray(const LodePNGColorMode& mode, bool has_icc, bool* is_gray) { + switch (mode.colortype) { + case LCT_GREY: + case LCT_GREY_ALPHA: + *is_gray = true; + return true; + + case LCT_RGB: + case LCT_RGBA: + *is_gray = false; + return true; + + case LCT_PALETTE: { + if (has_icc) { + // If an ICC profile is present, the PNG specification requires + // palette to be interpreted as RGB colored, not grayscale, so we must + // output color in that case and unfortunately can't optimize it to + // gray if the palette only has gray entries. + *is_gray = false; + return true; + } else { + *is_gray = true; + for (size_t i = 0; i < mode.palettesize; i++) { + if (mode.palette[i * 4] != mode.palette[i * 4 + 1] || + mode.palette[i * 4] != mode.palette[i * 4 + 2]) { + *is_gray = false; + break; + } + } + return true; + } + } + + default: + *is_gray = false; + return JXL_FAILURE("Unexpected PNG color type"); + } +} + +Status CheckAlpha(const LodePNGColorMode& mode, bool* has_alpha) { + if (mode.key_defined) { + // Color key marks a single color as transparent. 
+ *has_alpha = true; + return true; + } + + switch (mode.colortype) { + case LCT_GREY: + case LCT_RGB: + *has_alpha = false; + return true; + + case LCT_GREY_ALPHA: + case LCT_RGBA: + *has_alpha = true; + return true; + + case LCT_PALETTE: { + *has_alpha = false; + for (size_t i = 0; i < mode.palettesize; i++) { + // PNG palettes are always 8-bit. + if (mode.palette[i * 4 + 3] != 255) { + *has_alpha = true; + break; + } + } + return true; + } + + default: + *has_alpha = false; + return JXL_FAILURE("Unexpected PNG color type"); + } +} + +LodePNGColorType MakeType(const bool is_gray, const bool has_alpha) { + if (is_gray) { + return has_alpha ? LCT_GREY_ALPHA : LCT_GREY; + } + return has_alpha ? LCT_RGBA : LCT_RGB; +} + +// Inspects first chunk of the given type and updates state with the information +// when the chunk is relevant and present in the file. +Status InspectChunkType(const Span bytes, + const std::string& type, LodePNGState* state) { + const unsigned char* chunk = lodepng_chunk_find_const( + bytes.data(), bytes.data() + bytes.size(), type.c_str()); + if (chunk && lodepng_inspect_chunk(state, chunk - bytes.data(), bytes.data(), + bytes.size()) != 0) { + return JXL_FAILURE("Invalid chunk \"%s\" in PNG image", type.c_str()); + } + return true; +} + +} // namespace + +Status DecodeImagePNG(const Span bytes, ThreadPool* pool, + CodecInOut* io) { + unsigned w, h; + PNGState state; + if (lodepng_inspect(&w, &h, &state.s, bytes.data(), bytes.size()) != 0) { + return false; // not an error - just wrong format + } + JXL_RETURN_IF_ERROR(VerifyDimensions(&io->constraints, w, h)); + io->SetSize(w, h); + // Palette RGB values + if (!InspectChunkType(bytes, "PLTE", &state.s)) { + return false; + } + // Transparent color key, or palette transparency + if (!InspectChunkType(bytes, "tRNS", &state.s)) { + return false; + } + // ICC profile + if (!InspectChunkType(bytes, "iCCP", &state.s)) { + return false; + } + const LodePNGColorMode& color_mode = state.s.info_png.color; 
+ bool has_icc = state.s.info_png.iccp_defined; + + bool is_gray, has_alpha; + JXL_RETURN_IF_ERROR(CheckGray(color_mode, has_icc, &is_gray)); + JXL_RETURN_IF_ERROR(CheckAlpha(color_mode, &has_alpha)); + // We want LodePNG to promote 1/2/4 bit pixels to 8. + size_t bits_per_sample = std::max(color_mode.bitdepth, 8u); + if (bits_per_sample != 8 && bits_per_sample != 16) { + return JXL_FAILURE("Unexpected PNG bit depth"); + } + io->metadata.m.SetUintSamples(static_cast(bits_per_sample)); + io->metadata.m.SetAlphaBits( + has_alpha ? io->metadata.m.bit_depth.bits_per_sample : 0); + + // Always decode to 8/16-bit RGB/RGBA, not LCT_PALETTE. + state.s.info_raw.bitdepth = static_cast(bits_per_sample); + state.s.info_raw.colortype = MakeType(is_gray, has_alpha); + unsigned char* out = nullptr; + const unsigned err = + lodepng_decode(&out, &w, &h, &state.s, bytes.data(), bytes.size()); + // Automatically call free(out) on return. + std::unique_ptr out_ptr{out, free}; + if (err != 0) { + return JXL_FAILURE("PNG decode failed: %s", lodepng_error_text(err)); + } + + if (!BlobsReaderPNG::Decode(state.s.info_png, &io->blobs)) { + JXL_WARNING("PNG metadata may be incomplete"); + } + ColorEncodingReaderPNG reader; + JXL_RETURN_IF_ERROR(reader(bytes, is_gray, io)); +#if JXL_PNG_VERBOSE >= 1 + printf("PNG read %s\n", Description(io->metadata.m.color_encoding).c_str()); +#endif + + const size_t num_channels = (is_gray ? 
1 : 3) + has_alpha; + const size_t out_size = w * h * num_channels * bits_per_sample / kBitsPerByte; + + const JxlEndianness endianness = JXL_BIG_ENDIAN; // PNG requirement + const Span span(out, out_size); + const bool ok = + ConvertFromExternal(span, w, h, io->metadata.m.color_encoding, has_alpha, + /*alpha_is_premultiplied=*/false, + io->metadata.m.bit_depth.bits_per_sample, endianness, + /*flipped_y=*/false, pool, &io->Main()); + JXL_RETURN_IF_ERROR(ok); + io->dec_pixels = w * h; + io->metadata.m.bit_depth.bits_per_sample = io->Main().DetectRealBitdepth(); + io->metadata.m.xyb_encoded = false; + SetIntensityTarget(io); + if (!reader.HaveColorProfile()) { + JXL_RETURN_IF_ERROR(ApplyHints(is_gray, io)); + } else { + (void)io->dec_hints.Foreach( + [](const std::string& key, const std::string& /*value*/) { + JXL_WARNING("PNG decoder ignoring %s hint", key.c_str()); + return true; + }); + } + return true; +} + +Status EncodeImagePNG(const CodecInOut* io, const ColorEncoding& c_desired, + size_t bits_per_sample, ThreadPool* pool, + PaddedBytes* bytes) { + if (bits_per_sample > 8) { + bits_per_sample = 16; + } else if (bits_per_sample < 8) { + // PNG can also do 4, 2, and 1 bits per sample, but it isn't implemented + bits_per_sample = 8; + } + ImageBundle ib = io->Main().Copy(); + const size_t alpha_bits = ib.HasAlpha() ? bits_per_sample : 0; + ImageMetadata metadata = io->metadata.m; + ImageBundle store(&metadata); + const ImageBundle* transformed; + JXL_RETURN_IF_ERROR( + TransformIfNeeded(ib, c_desired, pool, &store, &transformed)); + size_t stride = ib.oriented_xsize() * + DivCeil(c_desired.Channels() * bits_per_sample + alpha_bits, + kBitsPerByte); + PaddedBytes raw_bytes(stride * ib.oriented_ysize()); + JXL_RETURN_IF_ERROR(ConvertToExternal( + *transformed, bits_per_sample, /*float_out=*/false, + c_desired.Channels() + (ib.HasAlpha() ? 
1 : 0), JXL_BIG_ENDIAN, stride, + pool, raw_bytes.data(), raw_bytes.size(), /*out_callback=*/nullptr, + /*out_opaque=*/nullptr, metadata.GetOrientation())); + + PNGState state; + // For maximum compatibility, still store 8-bit even if pixels are all zero. + state.s.encoder.auto_convert = 0; + + LodePNGInfo* info = &state.s.info_png; + info->color.bitdepth = bits_per_sample; + info->color.colortype = MakeType(ib.IsGray(), ib.HasAlpha()); + state.s.info_raw = info->color; + + JXL_RETURN_IF_ERROR(ColorEncodingWriterPNG::Encode(c_desired, info)); + JXL_RETURN_IF_ERROR(BlobsWriterPNG::Encode(io->blobs, info)); + + unsigned char* out = nullptr; + size_t out_size = 0; + const unsigned err = + lodepng_encode(&out, &out_size, raw_bytes.data(), ib.oriented_xsize(), + ib.oriented_ysize(), &state.s); + // Automatically call free(out) on return. + std::unique_ptr out_ptr{out, free}; + if (err != 0) { + return JXL_FAILURE("Failed to encode PNG: %s", lodepng_error_text(err)); + } + bytes->resize(out_size); + memcpy(bytes->data(), out, out_size); + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.h new file mode 100644 index 0000000000..ca5c76fadf --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_png.h @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_CODEC_PNG_H_ +#define LIB_EXTRAS_CODEC_PNG_H_ + +// Encodes/decodes PNG pixels and metadata in memory. 
+ +#include +#include + +// TODO(janwas): workaround for incorrect Win64 codegen (cause unknown) +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" + +namespace jxl { + +// Decodes `bytes` into `io`. io->dec_hints are ignored. +Status DecodeImagePNG(const Span bytes, ThreadPool* pool, + CodecInOut* io); + +// Transforms from io->c_current to `c_desired` and encodes into `bytes`. +Status EncodeImagePNG(const CodecInOut* io, const ColorEncoding& c_desired, + size_t bits_per_sample, ThreadPool* pool, + PaddedBytes* bytes); + +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_PNG_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc new file mode 100644 index 0000000000..3914924a25 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.cc @@ -0,0 +1,603 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/extras/codec_pnm.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/fields.h" // AllDefault +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/luminance.h" + +namespace jxl { +namespace { + +struct HeaderPNM { + size_t xsize; + size_t ysize; + bool is_bit; // PBM + bool is_gray; // PGM + int is_yuv; // Y4M: where 1 = 444, 2 = 422, 3 = 420 + size_t bits_per_sample; + bool floating_point; + bool big_endian; +}; + +class Parser { + public: + explicit Parser(const Span bytes) + : pos_(bytes.data()), end_(pos_ + bytes.size()) {} + + // Sets "pos" to the first non-header byte/pixel on success. + Status ParseHeader(HeaderPNM* header, const uint8_t** pos) { + // codec.cc ensures we have at least two bytes => no range check here. 
+ if (pos_[0] == 'Y' && pos_[1] == 'U') return ParseHeaderY4M(header, pos); + if (pos_[0] != 'P') return false; + const uint8_t type = pos_[1]; + pos_ += 2; + + header->is_bit = false; + header->is_yuv = 0; + + switch (type) { + case '4': + header->is_bit = true; + header->is_gray = true; + header->bits_per_sample = 1; + return ParseHeaderPNM(header, pos); + + case '5': + header->is_gray = true; + return ParseHeaderPNM(header, pos); + + case '6': + header->is_gray = false; + return ParseHeaderPNM(header, pos); + + // TODO(jon): P7 (PAM) + + case 'F': + header->is_gray = false; + return ParseHeaderPFM(header, pos); + + case 'f': + header->is_gray = true; + return ParseHeaderPFM(header, pos); + } + return false; + } + + // Exposed for testing + Status ParseUnsigned(size_t* number) { + if (pos_ == end_) return JXL_FAILURE("PNM: reached end before number"); + if (!IsDigit(*pos_)) return JXL_FAILURE("PNM: expected unsigned number"); + + *number = 0; + while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') { + *number *= 10; + *number += *pos_ - '0'; + ++pos_; + } + + return true; + } + + Status ParseSigned(double* number) { + if (pos_ == end_) return JXL_FAILURE("PNM: reached end before signed"); + + if (*pos_ != '-' && *pos_ != '+' && !IsDigit(*pos_)) { + return JXL_FAILURE("PNM: expected signed number"); + } + + // Skip sign + const bool is_neg = *pos_ == '-'; + if (is_neg || *pos_ == '+') { + ++pos_; + if (pos_ == end_) return JXL_FAILURE("PNM: reached end before digits"); + } + + // Leading digits + *number = 0.0; + while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') { + *number *= 10; + *number += *pos_ - '0'; + ++pos_; + } + + // Decimal places? 
+ if (pos_ < end_ && *pos_ == '.') { + ++pos_; + double place = 0.1; + while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') { + *number += (*pos_ - '0') * place; + place *= 0.1; + ++pos_; + } + } + + if (is_neg) *number = -*number; + return true; + } + + private: + static bool IsDigit(const uint8_t c) { return '0' <= c && c <= '9'; } + static bool IsLineBreak(const uint8_t c) { return c == '\r' || c == '\n'; } + static bool IsWhitespace(const uint8_t c) { + return IsLineBreak(c) || c == '\t' || c == ' '; + } + + Status SkipBlank() { + if (pos_ == end_) return JXL_FAILURE("PNM: reached end before blank"); + const uint8_t c = *pos_; + if (c != ' ' && c != '\n') return JXL_FAILURE("PNM: expected blank"); + ++pos_; + return true; + } + + Status SkipSingleWhitespace() { + if (pos_ == end_) return JXL_FAILURE("PNM: reached end before whitespace"); + if (!IsWhitespace(*pos_)) return JXL_FAILURE("PNM: expected whitespace"); + ++pos_; + return true; + } + + Status SkipWhitespace() { + if (pos_ == end_) return JXL_FAILURE("PNM: reached end before whitespace"); + if (!IsWhitespace(*pos_) && *pos_ != '#') { + return JXL_FAILURE("PNM: expected whitespace/comment"); + } + + while (pos_ < end_ && IsWhitespace(*pos_)) { + ++pos_; + } + + // Comment(s) + while (pos_ != end_ && *pos_ == '#') { + while (pos_ != end_ && !IsLineBreak(*pos_)) { + ++pos_; + } + // Newline(s) + while (pos_ != end_ && IsLineBreak(*pos_)) pos_++; + } + + while (pos_ < end_ && IsWhitespace(*pos_)) { + ++pos_; + } + return true; + } + + Status ExpectString(const char* str, size_t len) { + // Unlikely to happen. + if (pos_ + len < pos_) return JXL_FAILURE("Y4M: overflow"); + + if (pos_ + len > end_ || strncmp(str, (const char*)pos_, len) != 0) { + return JXL_FAILURE("Y4M: expected %s", str); + } + pos_ += len; + return true; + } + + Status ReadChar(char* out) { + // Unlikely to happen. 
+ if (pos_ + 1 < pos_) return JXL_FAILURE("Y4M: overflow"); + + if (pos_ >= end_) { + return JXL_FAILURE("Y4M: unexpected end of input"); + } + *out = *pos_; + pos_++; + return true; + } + + // TODO(jon): support multi-frame y4m + Status ParseHeaderY4M(HeaderPNM* header, const uint8_t** pos) { + JXL_RETURN_IF_ERROR(ExpectString("YUV4MPEG2", 9)); + header->is_gray = false; + header->is_yuv = 3; + // TODO(jon): check if 4:2:0 is indeed the default + header->bits_per_sample = 8; + // TODO(jon): check if there's a y4m convention for higher bit depths + while (pos_ < end_) { + char next = 0; + JXL_RETURN_IF_ERROR(ReadChar(&next)); + if (next == 0x0A) break; + if (next != ' ') continue; + char field = 0; + JXL_RETURN_IF_ERROR(ReadChar(&field)); + switch (field) { + case 'W': + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize)); + break; + case 'H': + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize)); + break; + case 'I': + JXL_RETURN_IF_ERROR(ReadChar(&next)); + if (next != 'p') { + return JXL_FAILURE( + "Y4M: only progressive (no frame interlacing) allowed"); + } + break; + case 'C': { + char c1 = 0; + JXL_RETURN_IF_ERROR(ReadChar(&c1)); + char c2 = 0; + JXL_RETURN_IF_ERROR(ReadChar(&c2)); + char c3 = 0; + JXL_RETURN_IF_ERROR(ReadChar(&c3)); + if (c1 != '4') return JXL_FAILURE("Y4M: invalid C param"); + if (c2 == '4') { + if (c3 != '4') return JXL_FAILURE("Y4M: invalid C param"); + header->is_yuv = 1; // 444 + } else if (c2 == '2') { + if (c3 == '2') { + header->is_yuv = 2; // 422 + } else if (c3 == '0') { + header->is_yuv = 3; // 420 + } else { + return JXL_FAILURE("Y4M: invalid C param"); + } + } else { + return JXL_FAILURE("Y4M: invalid C param"); + } + } + [[fallthrough]]; + // no break: fallthrough because this field can have values like + // "C420jpeg" (we are ignoring the chroma sample location and treat + // everything like C420jpeg) + case 'F': // Framerate in fps as numerator:denominator + // TODO(jon): actually read this and set corresponding jxl + // 
metadata + case 'A': // Pixel aspect ratio (ignoring it, could perhaps adjust + // intrinsic dimensions based on this?) + case 'X': // Comment, ignore + // ignore the field value and go to next one + while (pos_ < end_) { + if (pos_[0] == ' ' || pos_[0] == 0x0A) break; + pos_++; + } + break; + default: + return JXL_FAILURE("Y4M: parse error"); + } + } + JXL_RETURN_IF_ERROR(ExpectString("FRAME", 5)); + while (true) { + char next = 0; + JXL_RETURN_IF_ERROR(ReadChar(&next)); + if (next == 0x0A) { + *pos = pos_; + return true; + } + } + } + + Status ParseHeaderPNM(HeaderPNM* header, const uint8_t** pos) { + JXL_RETURN_IF_ERROR(SkipWhitespace()); + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize)); + + JXL_RETURN_IF_ERROR(SkipWhitespace()); + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize)); + + if (!header->is_bit) { + JXL_RETURN_IF_ERROR(SkipWhitespace()); + size_t max_val; + JXL_RETURN_IF_ERROR(ParseUnsigned(&max_val)); + if (max_val == 0 || max_val >= 65536) { + return JXL_FAILURE("PNM: bad MaxVal"); + } + header->bits_per_sample = CeilLog2Nonzero(max_val); + } + header->floating_point = false; + header->big_endian = true; + + JXL_RETURN_IF_ERROR(SkipSingleWhitespace()); + + *pos = pos_; + return true; + } + + Status ParseHeaderPFM(HeaderPNM* header, const uint8_t** pos) { + JXL_RETURN_IF_ERROR(SkipSingleWhitespace()); + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize)); + + JXL_RETURN_IF_ERROR(SkipBlank()); + JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize)); + + JXL_RETURN_IF_ERROR(SkipSingleWhitespace()); + // The scale has no meaning as multiplier, only its sign is used to + // indicate endianness. All software expects nominal range 0..1. 
+ double scale; + JXL_RETURN_IF_ERROR(ParseSigned(&scale)); + header->big_endian = scale >= 0.0; + header->bits_per_sample = 32; + header->floating_point = true; + + JXL_RETURN_IF_ERROR(SkipSingleWhitespace()); + + *pos = pos_; + return true; + } + + const uint8_t* pos_; + const uint8_t* const end_; +}; + +constexpr size_t kMaxHeaderSize = 200; + +Status EncodeHeader(const ImageBundle& ib, const size_t bits_per_sample, + const bool little_endian, char* header, + int* JXL_RESTRICT chars_written) { + if (ib.HasAlpha()) return JXL_FAILURE("PNM: can't store alpha"); + + if (bits_per_sample == 32) { // PFM + const char type = ib.IsGray() ? 'f' : 'F'; + const double scale = little_endian ? -1.0 : 1.0; + snprintf(header, kMaxHeaderSize, "P%c\n%zu %zu\n%.1f\n%n", type, + ib.oriented_xsize(), ib.oriented_ysize(), scale, chars_written); + } else if (bits_per_sample == 1) { // PBM + if (!ib.IsGray()) { + return JXL_FAILURE("Cannot encode color as PBM"); + } + snprintf(header, kMaxHeaderSize, "P4\n%zu %zu\n%n", ib.oriented_xsize(), + ib.oriented_ysize(), chars_written); + } else { // PGM/PPM + const uint32_t max_val = (1U << bits_per_sample) - 1; + if (max_val >= 65536) return JXL_FAILURE("PNM cannot have > 16 bits"); + const char type = ib.IsGray() ? 
'5' : '6'; + snprintf(header, kMaxHeaderSize, "P%c\n%zu %zu\n%u\n%n", type, + ib.oriented_xsize(), ib.oriented_ysize(), max_val, chars_written); + } + return true; +} + +Status ApplyHints(const bool is_gray, CodecInOut* io) { + bool got_color_space = false; + + JXL_RETURN_IF_ERROR(io->dec_hints.Foreach( + [is_gray, io, &got_color_space](const std::string& key, + const std::string& value) -> Status { + ColorEncoding* c_original = &io->metadata.m.color_encoding; + if (key == "color_space") { + if (!ParseDescription(value, c_original) || + !c_original->CreateICC()) { + return JXL_FAILURE("PNM: Failed to apply color_space"); + } + + if (is_gray != io->metadata.m.color_encoding.IsGray()) { + return JXL_FAILURE( + "PNM: mismatch between file and color_space hint"); + } + + got_color_space = true; + } else if (key == "icc_pathname") { + PaddedBytes icc; + JXL_RETURN_IF_ERROR(ReadFile(value, &icc)); + JXL_RETURN_IF_ERROR(c_original->SetICC(std::move(icc))); + got_color_space = true; + } else { + JXL_WARNING("PNM decoder ignoring %s hint", key.c_str()); + } + return true; + })); + + if (!got_color_space) { + JXL_WARNING("PNM: no color_space/icc_pathname given, assuming sRGB"); + JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetSRGB( + is_gray ? ColorSpace::kGray : ColorSpace::kRGB)); + } + + return true; +} + +Span MakeSpan(const char* str) { + return Span(reinterpret_cast(str), + strlen(str)); +} + +// Flip the image vertically for loading/saving PFM files which have the +// scanlines inverted. 
+void VerticallyFlipImage(Image3F* const image) { + for (int c = 0; c < 3; c++) { + for (size_t y = 0; y < image->ysize() / 2; y++) { + float* first_row = image->PlaneRow(c, y); + float* other_row = image->PlaneRow(c, image->ysize() - y - 1); + for (size_t x = 0; x < image->xsize(); ++x) { + float tmp = first_row[x]; + first_row[x] = other_row[x]; + other_row[x] = tmp; + } + } + } +} + +} // namespace + +Status DecodeImagePNM(const Span bytes, ThreadPool* pool, + CodecInOut* io) { + Parser parser(bytes); + HeaderPNM header = {}; + const uint8_t* pos = nullptr; + if (!parser.ParseHeader(&header, &pos)) return false; + JXL_RETURN_IF_ERROR( + VerifyDimensions(&io->constraints, header.xsize, header.ysize)); + + if (header.bits_per_sample == 0 || header.bits_per_sample > 32) { + return JXL_FAILURE("PNM: bits_per_sample invalid"); + } + + JXL_RETURN_IF_ERROR(ApplyHints(header.is_gray, io)); + if (header.floating_point) { + io->metadata.m.SetFloat32Samples(); + } else { + io->metadata.m.SetUintSamples(header.bits_per_sample); + } + io->metadata.m.SetAlphaBits(0); + io->dec_pixels = header.xsize * header.ysize; + + if (header.is_yuv > 0) { + Image3F yuvdata(header.xsize, header.ysize); + ImageBundle bundle(&io->metadata.m); + const int hshift[3][3] = {{0, 0, 0}, {0, 1, 1}, {0, 1, 1}}; + const int vshift[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 1, 1}}; + + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < header.ysize >> vshift[header.is_yuv - 1][c]; + ++y) { + float* const JXL_RESTRICT row = + yuvdata.PlaneRow((c == 2 ? 
2 : 1 - c), y); + if (pos + (header.xsize >> hshift[header.is_yuv - 1][c]) > + bytes.data() + bytes.size()) + return JXL_FAILURE("Not enough image data"); + for (size_t x = 0; x < header.xsize >> hshift[header.is_yuv - 1][c]; + ++x) { + row[x] = (1.f / 255.f) * ((*pos++) - 128.f); + } + } + } + bundle.SetFromImage(std::move(yuvdata), io->metadata.m.color_encoding); + bundle.color_transform = ColorTransform::kYCbCr; + + YCbCrChromaSubsampling subsampling; + uint8_t cssh[3] = { + 2, static_cast(hshift[header.is_yuv - 1][1] ? 1 : 2), + static_cast(hshift[header.is_yuv - 1][2] ? 1 : 2)}; + uint8_t cssv[3] = { + 2, static_cast(vshift[header.is_yuv - 1][1] ? 1 : 2), + static_cast(vshift[header.is_yuv - 1][2] ? 1 : 2)}; + + JXL_RETURN_IF_ERROR(subsampling.Set(cssh, cssv)); + + bundle.chroma_subsampling = subsampling; + + io->Main() = std::move(bundle); + } else { + const bool flipped_y = header.bits_per_sample == 32; // PFMs are flipped + const Span span(pos, bytes.data() + bytes.size() - pos); + JXL_RETURN_IF_ERROR(ConvertFromExternal( + span, header.xsize, header.ysize, io->metadata.m.color_encoding, + /*has_alpha=*/false, /*alpha_is_premultiplied=*/false, + io->metadata.m.bit_depth.bits_per_sample, + header.big_endian ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN, flipped_y, pool, + &io->Main())); + } + if (!header.floating_point) { + io->metadata.m.bit_depth.bits_per_sample = io->Main().DetectRealBitdepth(); + } + io->SetSize(header.xsize, header.ysize); + SetIntensityTarget(io); + return true; +} + +Status EncodeImagePNM(const CodecInOut* io, const ColorEncoding& c_desired, + size_t bits_per_sample, ThreadPool* pool, + PaddedBytes* bytes) { + const bool floating_point = bits_per_sample > 16; + // Choose native for PFM; PGM/PPM require big-endian (N/A for PBM) + const JxlEndianness endianness = + floating_point ? JXL_NATIVE_ENDIAN : JXL_BIG_ENDIAN; + + ImageMetadata metadata_copy = io->metadata.m; + // AllDefault sets all_default, which can cause a race condition. 
+ if (!Bundle::AllDefault(metadata_copy)) { + JXL_WARNING("PNM encoder ignoring metadata - use a different codec"); + } + if (!c_desired.IsSRGB()) { + JXL_WARNING( + "PNM encoder cannot store custom ICC profile; decoder\n" + "will need hint key=color_space to get the same values"); + } + + ImageBundle ib = io->Main().Copy(); + // In case of PFM the image must be flipped upside down since that format + // is designed that way. + const ImageBundle* to_color_transform = &ib; + ImageBundle flipped; + if (floating_point) { + flipped = ib.Copy(); + VerticallyFlipImage(flipped.color()); + to_color_transform = &flipped; + } + ImageMetadata metadata = io->metadata.m; + ImageBundle store(&metadata); + const ImageBundle* transformed; + JXL_RETURN_IF_ERROR(TransformIfNeeded(*to_color_transform, c_desired, pool, + &store, &transformed)); + size_t stride = ib.oriented_xsize() * + (c_desired.Channels() * bits_per_sample) / kBitsPerByte; + PaddedBytes pixels(stride * ib.oriented_ysize()); + JXL_RETURN_IF_ERROR(ConvertToExternal( + *transformed, bits_per_sample, floating_point, c_desired.Channels(), + endianness, stride, pool, pixels.data(), pixels.size(), + /*out_callback=*/nullptr, /*out_opaque=*/nullptr, + metadata.GetOrientation())); + + char header[kMaxHeaderSize]; + int header_size = 0; + bool is_little_endian = endianness == JXL_LITTLE_ENDIAN || + (endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()); + JXL_RETURN_IF_ERROR(EncodeHeader(*transformed, bits_per_sample, + is_little_endian, header, &header_size)); + + bytes->resize(static_cast(header_size) + pixels.size()); + memcpy(bytes->data(), header, static_cast(header_size)); + memcpy(bytes->data() + header_size, pixels.data(), pixels.size()); + + return true; +} + +void TestCodecPNM() { + size_t u = 77777; // Initialized to wrong value. + double d = 77.77; +// Failing to parse invalid strings results in a crash if `JXL_CRASH_ON_ERROR` +// is defined and hence the tests fail. 
Therefore we only run these tests if +// `JXL_CRASH_ON_ERROR` is not defined. +#ifndef JXL_CRASH_ON_ERROR + JXL_CHECK(false == Parser(MakeSpan("")).ParseUnsigned(&u)); + JXL_CHECK(false == Parser(MakeSpan("+")).ParseUnsigned(&u)); + JXL_CHECK(false == Parser(MakeSpan("-")).ParseUnsigned(&u)); + JXL_CHECK(false == Parser(MakeSpan("A")).ParseUnsigned(&u)); + + JXL_CHECK(false == Parser(MakeSpan("")).ParseSigned(&d)); + JXL_CHECK(false == Parser(MakeSpan("+")).ParseSigned(&d)); + JXL_CHECK(false == Parser(MakeSpan("-")).ParseSigned(&d)); + JXL_CHECK(false == Parser(MakeSpan("A")).ParseSigned(&d)); +#endif + JXL_CHECK(true == Parser(MakeSpan("1")).ParseUnsigned(&u)); + JXL_CHECK(u == 1); + + JXL_CHECK(true == Parser(MakeSpan("32")).ParseUnsigned(&u)); + JXL_CHECK(u == 32); + + JXL_CHECK(true == Parser(MakeSpan("1")).ParseSigned(&d)); + JXL_CHECK(d == 1.0); + JXL_CHECK(true == Parser(MakeSpan("+2")).ParseSigned(&d)); + JXL_CHECK(d == 2.0); + JXL_CHECK(true == Parser(MakeSpan("-3")).ParseSigned(&d)); + JXL_CHECK(std::abs(d - -3.0) < 1E-15); + JXL_CHECK(true == Parser(MakeSpan("3.141592")).ParseSigned(&d)); + JXL_CHECK(std::abs(d - 3.141592) < 1E-15); + JXL_CHECK(true == Parser(MakeSpan("-3.141592")).ParseSigned(&d)); + JXL_CHECK(std::abs(d - -3.141592) < 1E-15); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.h new file mode 100644 index 0000000000..9547ecc929 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_pnm.h @@ -0,0 +1,40 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_CODEC_PNM_H_ +#define LIB_EXTRAS_CODEC_PNM_H_ + +// Encodes/decodes PBM/PGM/PPM/PFM pixels in memory. 
+ +#include +#include + +// TODO(janwas): workaround for incorrect Win64 codegen (cause unknown) +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" + +namespace jxl { + +// Decodes `bytes` into `io`. io->dec_hints may specify "color_space", which +// defaults to sRGB. +Status DecodeImagePNM(const Span bytes, ThreadPool* pool, + CodecInOut* io); + +// Transforms from io->c_current to `c_desired` and encodes into `bytes`. +Status EncodeImagePNM(const CodecInOut* io, const ColorEncoding& c_desired, + size_t bits_per_sample, ThreadPool* pool, + PaddedBytes* bytes); + +void TestCodecPNM(); + +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_PNM_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc new file mode 100644 index 0000000000..37d3177e35 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.cc @@ -0,0 +1,609 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/extras/codec_psd.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/fields.h" // AllDefault +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/luminance.h" + +namespace jxl { +namespace { + +uint64_t get_be_int(int bytes, const uint8_t*& pos, const uint8_t* maxpos) { + uint64_t r = 0; + if (pos + bytes <= maxpos) { + if (bytes == 1) { + r = *pos; + } else if (bytes == 2) { + r = LoadBE16(pos); + } else if (bytes == 4) { + r = LoadBE32(pos); + } else if (bytes == 8) { + r = LoadBE64(pos); + } + } + pos += bytes; + return r; +} + +// Copies up to n bytes, without reading from maxpos (the STL-style end). +void safe_copy(const uint8_t* JXL_RESTRICT pos, + const uint8_t* JXL_RESTRICT maxpos, char* JXL_RESTRICT out, + size_t n) { + for (size_t i = 0; i < n; ++i) { + if (pos + i >= maxpos) return; + out[i] = pos[i]; + } +} + +// maxpos is the STL-style end! The valid range is up to [pos, maxpos). 
+int safe_strncmp(const uint8_t* pos, const uint8_t* maxpos, const char* s2, + size_t n) { + if (pos + n > maxpos) return 1; + return strncmp((const char*)pos, s2, n); +} +constexpr int PSD_VERBOSITY = 1; + +Status decode_layer(const uint8_t*& pos, const uint8_t* maxpos, + ImageBundle& layer, std::vector chans, + std::vector invert, int w, int h, int version, + int colormodel, bool is_layer, int depth) { + int compression_method = 2; + int nb_channels = chans.size(); + JXL_DEBUG_V(PSD_VERBOSITY, + "Trying to decode layer with dimensions %ix%i and %i channels", w, + h, nb_channels); + if (w <= 0 || h <= 0) return JXL_FAILURE("PSD: empty layer"); + for (int c = 0; c < nb_channels; c++) { + // skip nop byte padding + while (pos < maxpos && *pos == 128) pos++; + JXL_DEBUG_V(PSD_VERBOSITY, "Channel %i (pos %zu)", c, (size_t)pos); + // Merged image stores all channels together (same compression method) + // Layers store channel per channel + if (is_layer || c == 0) { + compression_method = get_be_int(2, pos, maxpos); + JXL_DEBUG_V(PSD_VERBOSITY, "compression method: %i", compression_method); + if (compression_method > 1 || compression_method < 0) { + return JXL_FAILURE("PSD: can't handle compression method %i", + compression_method); + } + } + + if (!is_layer && c < colormodel) { + // skip to the extra channels + if (compression_method == 0) { + pos += w * h * (depth >> 3) * colormodel; + c = colormodel - 1; + continue; + } + size_t skip_amount = 0; + for (int i = 0; i < nb_channels; i++) { + if (i < colormodel) { + for (int y = 0; y < h; y++) { + skip_amount += get_be_int(2 * version, pos, maxpos); + } + } else { + pos += h * 2 * version; + } + } + pos += skip_amount; + c = colormodel - 1; + continue; + } + if (is_layer || c == 0) { + // skip the line-counts, we don't need them + if (compression_method == 1) { + pos += h * (is_layer ? 
1 : nb_channels) * 2 * + version; // PSB uses 4 bytes per rowsize instead of 2 + } + } + int c_id = chans[c]; + if (c_id < 0) continue; // skip + if (static_cast(c_id) >= 3 + layer.extra_channels().size()) + return JXL_FAILURE("PSD: can't handle channel id %i", c_id); + ImageF& ch = (c_id < 3 ? layer.color()->Plane(c_id) + : layer.extra_channels()[c_id - 3]); + + for (int y = 0; y < h; y++) { + if (pos > maxpos) return JXL_FAILURE("PSD: premature end of input"); + float* const JXL_RESTRICT row = ch.Row(y); + if (compression_method == 0) { + // uncompressed is easy + if (depth == 8) { + for (int x = 0; x < w; x++) { + row[x] = get_be_int(1, pos, maxpos) * (1.f / 255.f); + } + } else if (depth == 16) { + for (int x = 0; x < w; x++) { + row[x] = get_be_int(2, pos, maxpos) * (1.f / 65535.f); + } + } else if (depth == 32) { + for (int x = 0; x < w; x++) { + uint32_t f = get_be_int(4, pos, maxpos); + memcpy(&row[x], &f, 4); + } + } + } else { + // RLE is not that hard + if (depth != 8) + return JXL_FAILURE("PSD: did not expect RLE with depth>1"); + for (int x = 0; x < w;) { + if (pos >= maxpos) return JXL_FAILURE("PSD: out of bounds"); + int8_t rle = *pos++; + if (rle <= 0) { + if (rle == -128) continue; // nop + int count = 1 - rle; + float v = get_be_int(1, pos, maxpos) * (1.f / 255.f); + while (count && x < w) { + row[x] = v; + count--; + x++; + } + if (count) return JXL_FAILURE("PSD: row overflow"); + } else { + int count = 1 + rle; + while (count && x < w) { + row[x] = get_be_int(1, pos, maxpos) * (1.f / 255.f); + count--; + x++; + } + if (count) return JXL_FAILURE("PSD: row overflow"); + } + } + } + if (invert[c]) { + // sometimes 0 means full ink + for (int x = 0; x < w; x++) { + row[x] = 1.f - row[x]; + } + } + } + JXL_DEBUG_V(PSD_VERBOSITY, "Channel %i read.", c); + } + + return true; +} + +} // namespace + +Status DecodeImagePSD(const Span bytes, ThreadPool* pool, + CodecInOut* io) { + const uint8_t* pos = bytes.data(); + const uint8_t* maxpos = bytes.data() + 
bytes.size(); + if (safe_strncmp(pos, maxpos, "8BPS", 4)) return false; // not a PSD file + JXL_DEBUG_V(PSD_VERBOSITY, "trying psd decode"); + pos += 4; + int version = get_be_int(2, pos, maxpos); + JXL_DEBUG_V(PSD_VERBOSITY, "Version=%i", version); + if (version < 1 || version > 2) + return JXL_FAILURE("PSD: unknown format version"); + // PSD = version 1, PSB = version 2 + pos += 6; + int nb_channels = get_be_int(2, pos, maxpos); + size_t ysize = get_be_int(4, pos, maxpos); + size_t xsize = get_be_int(4, pos, maxpos); + const SizeConstraints* constraints = &io->constraints; + JXL_RETURN_IF_ERROR(VerifyDimensions(constraints, xsize, ysize)); + uint64_t total_pixel_count = static_cast(xsize) * ysize; + int bitdepth = get_be_int(2, pos, maxpos); + if (bitdepth != 8 && bitdepth != 16 && bitdepth != 32) { + return JXL_FAILURE("PSD: bit depth %i invalid or not supported", bitdepth); + } + if (bitdepth == 32) { + io->metadata.m.SetFloat32Samples(); + } else { + io->metadata.m.SetUintSamples(bitdepth); + } + int colormodel = get_be_int(2, pos, maxpos); + // 1 = Grayscale, 3 = RGB, 4 = CMYK + if (colormodel != 1 && colormodel != 3 && colormodel != 4) + return JXL_FAILURE("PSD: unsupported color model"); + + int real_nb_channels = colormodel; + std::vector> spotcolor; + + if (get_be_int(4, pos, maxpos)) + return JXL_FAILURE("PSD: Unsupported color mode section"); + + bool hasmergeddata = true; + bool have_alpha = false; + bool merged_has_alpha = false; + size_t metalength = get_be_int(4, pos, maxpos); + const uint8_t* metaoffset = pos; + while (pos < metaoffset + metalength) { + char header[5] = "????"; + safe_copy(pos, maxpos, header, 4); + if (memcmp(header, "8BIM", 4) != 0) { + return JXL_FAILURE("PSD: Unexpected image resource header: %s", header); + } + pos += 4; + int id = get_be_int(2, pos, maxpos); + int namelength = get_be_int(1, pos, maxpos); + pos += namelength; + if (!(namelength & 1)) pos++; // padding to even length + size_t blocklength = get_be_int(4, pos, 
maxpos); + // JXL_DEBUG_V(PSD_VERBOSITY, "block id: %i | block length: %zu",id, + // blocklength); + if (pos > maxpos) return JXL_FAILURE("PSD: Unexpected end of file"); + if (id == 1039) { // ICC profile + size_t delta = maxpos - pos; + if (delta < blocklength) { + return JXL_FAILURE("PSD: Invalid block length"); + } + PaddedBytes icc; + icc.resize(blocklength); + memcpy(icc.data(), pos, blocklength); + if (!io->metadata.m.color_encoding.SetICC(std::move(icc))) { + return JXL_FAILURE("PSD: Invalid color profile"); + } + } else if (id == 1057) { // compatibility mode or not? + if (get_be_int(4, pos, maxpos) != 1) { + return JXL_FAILURE("PSD: expected version=1 in id=1057 resource block"); + } + hasmergeddata = get_be_int(1, pos, maxpos); + pos++; + blocklength -= 6; // already skipped these bytes + } else if (id == 1077) { // spot colors + int version = get_be_int(4, pos, maxpos); + if (version != 1) { + return JXL_FAILURE( + "PSD: expected DisplayInfo version 1, got version %i", version); + } + int spotcolorcount = nb_channels - colormodel; + JXL_DEBUG_V(PSD_VERBOSITY, "Reading %i spot colors. 
%zu", spotcolorcount, + blocklength); + for (int k = 0; k < spotcolorcount; k++) { + int colorspace = get_be_int(2, pos, maxpos); + if ((colormodel == 3 && colorspace != 0) || + (colormodel == 4 && colorspace != 2)) { + return JXL_FAILURE( + "PSD: cannot handle spot colors in different color spaces than " + "image itself"); + } + if (colorspace == 2) JXL_WARNING("PSD: K ignored in CMYK spot color"); + std::vector color; + color.push_back(get_be_int(2, pos, maxpos) / 65535.f); // R or C + color.push_back(get_be_int(2, pos, maxpos) / 65535.f); // G or M + color.push_back(get_be_int(2, pos, maxpos) / 65535.f); // B or Y + color.push_back(get_be_int(2, pos, maxpos) / 65535.f); // ignored or K + color.push_back(get_be_int(2, pos, maxpos) / + 100.f); // solidity (alpha, basically) + int kind = get_be_int(1, pos, maxpos); + JXL_DEBUG_V(PSD_VERBOSITY, "Kind=%i", kind); + color.push_back(kind); + spotcolor.push_back(color); + if (kind == 2) { + JXL_DEBUG_V(PSD_VERBOSITY, "Actual spot color"); + } else if (kind == 1) { + JXL_DEBUG_V(PSD_VERBOSITY, "Mask (alpha) channel"); + } else if (kind == 0) { + JXL_DEBUG_V(PSD_VERBOSITY, "Selection (alpha) channel"); + } else { + return JXL_FAILURE("PSD: Unknown extra channel type"); + } + } + if (blocklength & 1) pos++; + blocklength = 0; + } + pos += blocklength; + if (blocklength & 1) pos++; // padding again + } + + size_t layerlength = get_be_int(4 * version, pos, maxpos); + const uint8_t* after_layers_pos = pos + layerlength; + if (after_layers_pos < pos) return JXL_FAILURE("PSD: invalid layer length"); + if (layerlength) { + pos += 4 * version; // don't care about layerinfolength + JXL_DEBUG_V(PSD_VERBOSITY, "Layer section length: %zu", layerlength); + int layercount = static_cast(get_be_int(2, pos, maxpos)); + JXL_DEBUG_V(PSD_VERBOSITY, "Layer count: %i", layercount); + io->frames.clear(); + + if (layercount == 0) { + if (get_be_int(2, pos, maxpos) != 0) { + return JXL_FAILURE( + "PSD: Expected zero padding before additional 
layer info"); + } + while (pos < after_layers_pos) { + if (safe_strncmp(pos, maxpos, "8BIM", 4) && + safe_strncmp(pos, maxpos, "8B64", 4)) + return JXL_FAILURE("PSD: Unexpected layer info signature"); + pos += 4; + const uint8_t* tpos = pos; + pos += 4; + size_t blocklength = get_be_int(4 * version, pos, maxpos); + JXL_DEBUG_V(PSD_VERBOSITY, "Length=%zu", blocklength); + if (blocklength > 0) { + if (pos >= maxpos) return JXL_FAILURE("PSD: Unexpected end of file"); + size_t delta = maxpos - pos; + if (delta < blocklength) { + return JXL_FAILURE("PSD: Invalid block length"); + } + } + if (!safe_strncmp(tpos, maxpos, "Layr", 4) || + !safe_strncmp(tpos, maxpos, "Lr16", 4) || + !safe_strncmp(tpos, maxpos, "Lr32", 4)) { + layercount = static_cast(get_be_int(2, pos, maxpos)); + if (layercount < 0) { + return JXL_FAILURE("PSD: Invalid layer count"); + } + JXL_DEBUG_V(PSD_VERBOSITY, "Real layer count: %i", layercount); + if (layercount > 1) have_alpha = true; + break; + } + if (!safe_strncmp(tpos, maxpos, "Mtrn", 4) || + !safe_strncmp(tpos, maxpos, "Mt16", 4) || + !safe_strncmp(tpos, maxpos, "Mt32", 4)) { + JXL_DEBUG_V(PSD_VERBOSITY, "Merged layer has transparency channel"); + if (nb_channels > real_nb_channels) { + have_alpha = true; + merged_has_alpha = true; + } + } + pos += blocklength; + } + } else if (layercount < 0) { + // negative layer count indicates merged has alpha and it is to be shown + if (nb_channels > real_nb_channels) { + have_alpha = true; + merged_has_alpha = true; + } + layercount = -layercount; + } else { + // multiple layers implies there is alpha + have_alpha = true; + } + + ExtraChannelInfo info; + info.bit_depth.bits_per_sample = bitdepth; + info.dim_shift = 0; + + if (colormodel == 4) { // cmyk + info.type = ExtraChannel::kBlack; + io->metadata.m.extra_channel_info.push_back(info); + } + if (have_alpha) { + JXL_DEBUG_V(PSD_VERBOSITY, "Have alpha"); + real_nb_channels++; + info.type = ExtraChannel::kAlpha; + info.alpha_associated = + false; // 
true? PSD is not consistent with this, need to check + io->metadata.m.extra_channel_info.push_back(info); + } + if (merged_has_alpha && !spotcolor.empty() && spotcolor[0][5] == 1) { + // first alpha channel + spotcolor.erase(spotcolor.begin()); + } + for (size_t i = 0; i < spotcolor.size(); i++) { + real_nb_channels++; + if (spotcolor[i][5] == 2) { + info.type = ExtraChannel::kSpotColor; + info.spot_color[0] = spotcolor[i][0]; + info.spot_color[1] = spotcolor[i][1]; + info.spot_color[2] = spotcolor[i][2]; + info.spot_color[3] = spotcolor[i][4]; + } else if (spotcolor[i][5] == 1) { + info.type = ExtraChannel::kAlpha; + } else if (spotcolor[i][5] == 0) { + info.type = ExtraChannel::kSelectionMask; + } else + return JXL_FAILURE("PSD: unhandled extra channel"); + io->metadata.m.extra_channel_info.push_back(info); + } + std::vector> layer_chan_id; + std::vector layer_offsets(layercount + 1, 0); + std::vector is_real_layer(layercount, false); + for (int l = 0; l < layercount; l++) { + ImageBundle layer(&io->metadata.m); + layer.duration = 0; + layer.blend = (l > 0); + + layer.use_for_next_frame = (l + 1 < layercount); + layer.origin.y0 = get_be_int(4, pos, maxpos); + layer.origin.x0 = get_be_int(4, pos, maxpos); + size_t height = get_be_int(4, pos, maxpos) - layer.origin.y0; + size_t width = get_be_int(4, pos, maxpos) - layer.origin.x0; + JXL_DEBUG_V(PSD_VERBOSITY, "Layer %i: %zu x %zu at origin (%i, %i)", l, + width, height, layer.origin.x0, layer.origin.y0); + int nb_chs = get_be_int(2, pos, maxpos); + JXL_DEBUG_V(PSD_VERBOSITY, " channels: %i", nb_chs); + std::vector chan_ids; + layer_offsets[l + 1] = layer_offsets[l]; + for (int lc = 0; lc < nb_chs; lc++) { + int id = get_be_int(2, pos, maxpos); + JXL_DEBUG_V(PSD_VERBOSITY, " id=%i", id); + if (id == 65535) { + chan_ids.push_back(colormodel); // alpha + } else if (id == 65534) { + chan_ids.push_back(-1); // layer mask, ignored + } else { + chan_ids.push_back(id); // color channel + } + layer_offsets[l + 1] += 
get_be_int(4 * version, pos, maxpos); + } + layer_chan_id.push_back(chan_ids); + if (safe_strncmp(pos, maxpos, "8BIM", 4)) + return JXL_FAILURE("PSD: Layer %i: Unexpected signature (not 8BIM)", l); + pos += 4; + if (safe_strncmp(pos, maxpos, "norm", 4)) { + return JXL_FAILURE( + "PSD: Layer %i: Cannot handle non-default blend mode", l); + } + pos += 4; + int opacity = get_be_int(1, pos, maxpos); + if (opacity < 100) { + JXL_WARNING( + "PSD: ignoring opacity of semi-transparent layer %i (opacity=%i)", + l, opacity); + } + pos++; // clipping + int flags = get_be_int(1, pos, maxpos); + pos++; + bool invisible = (flags & 2); + if (invisible) { + if (l + 1 < layercount) { + layer.blend = false; + layer.use_for_next_frame = false; + } else { + // TODO: instead add dummy last frame? + JXL_WARNING("PSD: invisible top layer was made visible"); + } + } + size_t extradata = get_be_int(4, pos, maxpos); + JXL_DEBUG_V(PSD_VERBOSITY, " extradata: %zu bytes", extradata); + const uint8_t* after_extra = pos + extradata; + // TODO: deal with non-empty layer masks + pos += get_be_int(4, pos, maxpos); // skip layer mask data + pos += get_be_int(4, pos, maxpos); // skip layer blend range data + size_t namelength = get_be_int(1, pos, maxpos); + size_t delta = maxpos - pos; + if (delta < namelength) return JXL_FAILURE("PSD: Invalid block length"); + char lname[256] = {}; + memcpy(lname, pos, namelength); + lname[namelength] = 0; + JXL_DEBUG_V(PSD_VERBOSITY, " name: %s", lname); + pos = after_extra; + if (width == 0 || height == 0) { + JXL_DEBUG_V(PSD_VERBOSITY, + " NOT A REAL LAYER"); // probably layer group + continue; + } + is_real_layer[l] = true; + JXL_RETURN_IF_ERROR(VerifyDimensions(constraints, width, height)); + uint64_t pixel_count = static_cast(width) * height; + if (!SafeAdd(total_pixel_count, pixel_count, total_pixel_count)) { + return JXL_FAILURE("Image too big"); + } + if (total_pixel_count > constraints->dec_max_pixels) { + return JXL_FAILURE("Image too big"); + } + Image3F 
rgb(width, height); + layer.SetFromImage(std::move(rgb), io->metadata.m.color_encoding); + std::vector ec; + for (const auto& ec_meta : layer.metadata()->extra_channel_info) { + ImageF extra(width, height); + if (ec_meta.type == ExtraChannel::kAlpha) { + FillPlane(1.0f, &extra, Rect(extra)); // opaque + } else { + ZeroFillPlane(&extra, Rect(extra)); // zeroes + } + ec.push_back(std::move(extra)); + } + if (!ec.empty()) layer.SetExtraChannels(std::move(ec)); + layer.name = lname; + io->dec_pixels += layer.xsize() * layer.ysize(); + io->frames.push_back(std::move(layer)); + } + + std::vector invert(real_nb_channels, false); + int il = 0; + const uint8_t* bpos = pos; + for (int l = 0; l < layercount; l++) { + if (!is_real_layer[l]) continue; + pos = bpos + layer_offsets[l]; + if (pos < bpos) return JXL_FAILURE("PSD: invalid layer offset"); + JXL_DEBUG_V(PSD_VERBOSITY, "At position %i (%zu)", + (int)(pos - bytes.data()), (size_t)pos); + ImageBundle& layer = io->frames[il++]; + std::vector& chan_id = layer_chan_id[l]; + if (chan_id.size() > invert.size()) invert.resize(chan_id.size(), false); + JXL_RETURN_IF_ERROR(decode_layer(pos, maxpos, layer, chan_id, invert, + layer.xsize(), layer.ysize(), version, + colormodel, true, bitdepth)); + } + } else + return JXL_FAILURE("PSD: no layer data found"); + + if (!hasmergeddata && !spotcolor.empty()) { + return JXL_FAILURE("PSD: extra channel data declared but not found"); + } + + if (!spotcolor.empty() || (hasmergeddata && io->frames.empty())) { + // PSD only has spot colors / extra alpha/mask data in the merged image + // We don't redundantly store the merged image, so we put it in the first + // layer (the next layers will kAdd zeroes to it) + pos = after_layers_pos; + bool have_only_merged = false; + if (io->frames.empty()) { + // There is only the merged image, no layers + ImageBundle nlayer(&io->metadata.m); + Image3F rgb(xsize, ysize); + nlayer.SetFromImage(std::move(rgb), io->metadata.m.color_encoding); + std::vector ec; 
+ for (const auto& ec_meta : nlayer.metadata()->extra_channel_info) { + ImageF extra(xsize, ysize); + if (ec_meta.type == ExtraChannel::kAlpha) { + FillPlane(1.0f, &extra, Rect(extra)); // opaque + } else { + ZeroFillPlane(&extra, Rect(extra)); // zeroes + } + ec.push_back(std::move(extra)); + } + if (!ec.empty()) nlayer.SetExtraChannels(std::move(ec)); + io->dec_pixels += nlayer.xsize() * nlayer.ysize(); + io->frames.push_back(std::move(nlayer)); + have_only_merged = true; + } + ImageBundle& layer = io->frames[0]; + std::vector chan_id(real_nb_channels); + std::iota(chan_id.begin(), chan_id.end(), 0); + std::vector invert(real_nb_channels, false); + if (static_cast(spotcolor.size()) + colormodel + 1 < + real_nb_channels) { + return JXL_FAILURE("Inconsistent layer configuration"); + } + if (!merged_has_alpha) { + if (colormodel >= real_nb_channels) { + return JXL_FAILURE("Inconsistent layer configuration"); + } + chan_id.erase(chan_id.begin() + colormodel); + invert.erase(invert.begin() + colormodel); + } else { + colormodel++; + } + for (size_t i = colormodel; i < invert.size(); i++) { + if (spotcolor[i - colormodel][5] == 2) invert[i] = true; + if (spotcolor[i - colormodel][5] == 0) invert[i] = true; + } + JXL_RETURN_IF_ERROR(decode_layer( + pos, maxpos, layer, chan_id, invert, layer.xsize(), layer.ysize(), + version, (have_only_merged ? 
0 : colormodel), false, bitdepth)); + } + + if (io->frames.empty()) return JXL_FAILURE("PSD: no layers"); + + io->SetSize(xsize, ysize); + + SetIntensityTarget(io); + + return true; +} + +Status EncodeImagePSD(const CodecInOut* io, const ColorEncoding& c_desired, + size_t bits_per_sample, ThreadPool* pool, + PaddedBytes* bytes) { + return JXL_FAILURE("PSD encoding not yet implemented"); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.h new file mode 100644 index 0000000000..11a9fb882e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_psd.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_CODEC_PSD_H_ +#define LIB_EXTRAS_CODEC_PSD_H_ + +// Decodes Photoshop PSD/PSB, preserving the layers + +#include +#include + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" + +namespace jxl { + +// Decodes `bytes` into `io`. +Status DecodeImagePSD(const Span bytes, ThreadPool* pool, + CodecInOut* io); + +// Not implemented yet +Status EncodeImagePSD(const CodecInOut* io, const ColorEncoding& c_desired, + size_t bits_per_sample, ThreadPool* pool, + PaddedBytes* bytes); + +} // namespace jxl + +#endif // LIB_EXTRAS_CODEC_PSD_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_test.cc new file mode 100644 index 0000000000..24426444fb --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/codec_test.cc @@ -0,0 +1,375 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/codec.h" + +#include +#include + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/extras/codec_pgx.h" +#include "lib/extras/codec_pnm.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/luminance.h" +#include "lib/jxl/testdata.h" + +namespace jxl { +namespace { + +CodecInOut CreateTestImage(const size_t xsize, const size_t ysize, + const bool is_gray, const bool add_alpha, + const size_t bits_per_sample, + const ColorEncoding& c_native) { + Image3F image(xsize, ysize); + std::mt19937_64 rng(129); + std::uniform_real_distribution dist(0.0f, 1.0f); + if (is_gray) { + for (size_t y = 0; y < ysize; ++y) { + float* JXL_RESTRICT row0 = image.PlaneRow(0, y); + float* JXL_RESTRICT row1 = image.PlaneRow(1, y); + float* JXL_RESTRICT row2 = image.PlaneRow(2, y); + for (size_t x = 0; x < xsize; ++x) { + row0[x] = row1[x] = row2[x] = dist(rng); + } + } + } else { + RandomFillImage(&image, 1.0f); + } + CodecInOut io; + + if (bits_per_sample == 32) { + io.metadata.m.SetFloat32Samples(); + } else { + io.metadata.m.SetUintSamples(bits_per_sample); + } + io.metadata.m.color_encoding = c_native; + io.SetFromImage(std::move(image), c_native); + if (add_alpha) { + ImageF alpha(xsize, ysize); + RandomFillImage(&alpha, 1.f); + io.metadata.m.SetAlphaBits(bits_per_sample <= 8 ? 8 : 16); + io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false); + } + return io; +} + +// Ensures reading a newly written file leads to the same image pixels. +void TestRoundTrip(Codec codec, const size_t xsize, const size_t ysize, + const bool is_gray, const bool add_alpha, + const size_t bits_per_sample, ThreadPool* pool) { + // JPEG encoding is not lossless. 
+ if (codec == Codec::kJPG) return; + if (codec == Codec::kPNM && add_alpha) return; + // Our EXR codec always uses 16-bit premultiplied alpha, does not support + // grayscale, and somehow does not have sufficient precision for this test. + if (codec == Codec::kEXR) return; + printf("Codec %s bps:%zu gr:%d al:%d\n", + ExtensionFromCodec(codec, is_gray, bits_per_sample).c_str(), + bits_per_sample, is_gray, add_alpha); + + ColorEncoding c_native; + c_native.SetColorSpace(is_gray ? ColorSpace::kGray : ColorSpace::kRGB); + // Note: this must not be wider than c_external, otherwise gamut clipping + // will cause large round-trip errors. + c_native.primaries = Primaries::kP3; + c_native.tf.SetTransferFunction(TransferFunction::kLinear); + JXL_CHECK(c_native.CreateICC()); + + // Generally store same color space to reduce round trip errors.. + ColorEncoding c_external = c_native; + // .. unless we have enough precision for some transforms. + if (bits_per_sample >= 16) { + c_external.white_point = WhitePoint::kE; + c_external.primaries = Primaries::k2100; + c_external.tf.SetTransferFunction(TransferFunction::kSRGB); + } + JXL_CHECK(c_external.CreateICC()); + + const CodecInOut io = CreateTestImage(xsize, ysize, is_gray, add_alpha, + bits_per_sample, c_native); + const ImageBundle& ib1 = io.Main(); + + PaddedBytes encoded; + JXL_CHECK(Encode(io, codec, c_external, bits_per_sample, &encoded, pool)); + + CodecInOut io2; + io2.target_nits = io.metadata.m.IntensityTarget(); + // Only for PNM because PNG will warn about ignoring them. + if (codec == Codec::kPNM) { + io2.dec_hints.Add("color_space", Description(c_external)); + } + JXL_CHECK(SetFromBytes(Span(encoded), &io2, pool)); + ImageBundle& ib2 = io2.Main(); + + EXPECT_EQ(Description(c_external), + Description(io2.metadata.m.color_encoding)); + + // See c_external above - for low bits_per_sample the encoded space is + // already the same. 
+ if (bits_per_sample < 16) { + EXPECT_EQ(Description(ib1.c_current()), Description(ib2.c_current())); + } + + if (add_alpha) { + EXPECT_TRUE(SamePixels(ib1.alpha(), *ib2.alpha())); + } + + JXL_CHECK(ib2.TransformTo(ib1.c_current(), pool)); + + double max_l1, max_rel; + // Round-trip tolerances must be higher than in external_image_test because + // codecs do not support unbounded ranges. +#if JPEGXL_ENABLE_SKCMS + if (bits_per_sample <= 12) { + max_l1 = 0.5; + max_rel = 6E-3; + } else { + max_l1 = 1E-3; + max_rel = 5E-4; + } +#else // JPEGXL_ENABLE_SKCMS + if (bits_per_sample <= 12) { + max_l1 = 0.5; + max_rel = 6E-3; + } else if (bits_per_sample == 16) { + max_l1 = 3E-3; + max_rel = 1E-4; + } else { +#ifdef __ARM_ARCH + // pow() implementation in arm is a bit less precise than in x86 and + // therefore we need a bigger error margin in this case. + max_l1 = 1E-7; + max_rel = 1E-4; +#else + max_l1 = 1E-7; + max_rel = 1E-5; +#endif + } +#endif // JPEGXL_ENABLE_SKCMS + + VerifyRelativeError(ib1.color(), *ib2.color(), max_l1, max_rel); +} + +#if 0 +TEST(CodecTest, TestRoundTrip) { + ThreadPoolInternal pool(12); + + const size_t xsize = 7; + const size_t ysize = 4; + + for (Codec codec : Values()) { + for (int bits_per_sample : {8, 10, 12, 16, 32}) { + for (bool is_gray : {false, true}) { + for (bool add_alpha : {false, true}) { + TestRoundTrip(codec, xsize, ysize, is_gray, add_alpha, + static_cast(bits_per_sample), &pool); + } + } + } + } +} +#endif + +CodecInOut DecodeRoundtrip(const std::string& pathname, Codec expected_codec, + ThreadPool* pool, + const DecoderHints& dec_hints = DecoderHints()) { + CodecInOut io; + io.dec_hints = dec_hints; + const PaddedBytes orig = ReadTestData(pathname); + JXL_CHECK(SetFromBytes(Span(orig), &io, pool)); + const ImageBundle& ib1 = io.Main(); + + // Encode/Decode again to make sure Encode carries through all metadata. 
+ PaddedBytes encoded; + JXL_CHECK(Encode(io, expected_codec, io.metadata.m.color_encoding, + io.metadata.m.bit_depth.bits_per_sample, &encoded, pool)); + + CodecInOut io2; + io2.dec_hints = dec_hints; + JXL_CHECK(SetFromBytes(Span(encoded), &io2, pool)); + const ImageBundle& ib2 = io2.Main(); + EXPECT_EQ(Description(ib1.metadata()->color_encoding), + Description(ib2.metadata()->color_encoding)); + EXPECT_EQ(Description(ib1.c_current()), Description(ib2.c_current())); + + size_t bits_per_sample = io2.metadata.m.bit_depth.bits_per_sample; + + // "Same" pixels? + double max_l1 = bits_per_sample <= 12 ? 1.3 : 2E-3; + double max_rel = bits_per_sample <= 12 ? 6E-3 : 1E-4; + if (ib1.metadata()->color_encoding.IsGray()) { + max_rel *= 2.0; + } else if (ib1.metadata()->color_encoding.primaries != Primaries::kSRGB) { + // Need more tolerance for large gamuts (anything but sRGB) + max_l1 *= 1.5; + max_rel *= 3.0; + } + VerifyRelativeError(ib1.color(), ib2.color(), max_l1, max_rel); + + // Simulate the encoder removing profile and decoder restoring it. 
+ if (!ib2.metadata()->color_encoding.WantICC()) { + io2.metadata.m.color_encoding.InternalRemoveICC(); + EXPECT_TRUE(io2.metadata.m.color_encoding.CreateICC()); + } + + return io2; +} + +#if 0 +TEST(CodecTest, TestMetadataSRGB) { + ThreadPoolInternal pool(12); + + const char* paths[] = {"raw.pixls/DJI-FC6310-16bit_srgb8_v4_krita.png", + "raw.pixls/Google-Pixel2XL-16bit_srgb8_v4_krita.png", + "raw.pixls/HUAWEI-EVA-L09-16bit_srgb8_dt.png", + "raw.pixls/Nikon-D300-12bit_srgb8_dt.png", + "raw.pixls/Sony-DSC-RX1RM2-14bit_srgb8_v4_krita.png"}; + for (const char* relative_pathname : paths) { + const CodecInOut io = + DecodeRoundtrip(relative_pathname, Codec::kPNG, &pool); + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + + EXPECT_EQ(64, io.xsize()); + EXPECT_EQ(64, io.ysize()); + EXPECT_FALSE(io.metadata.m.HasAlpha()); + + const ColorEncoding& c_original = io.metadata.m.color_encoding; + EXPECT_FALSE(c_original.ICC().empty()); + EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace()); + EXPECT_EQ(WhitePoint::kD65, c_original.white_point); + EXPECT_EQ(Primaries::kSRGB, c_original.primaries); + EXPECT_TRUE(c_original.tf.IsSRGB()); + } +} + +TEST(CodecTest, TestMetadataLinear) { + ThreadPoolInternal pool(12); + + const char* paths[3] = { + "raw.pixls/Google-Pixel2XL-16bit_acescg_g1_v4_krita.png", + "raw.pixls/HUAWEI-EVA-L09-16bit_709_g1_dt.png", + "raw.pixls/Nikon-D300-12bit_2020_g1_dt.png", + }; + const WhitePoint white_points[3] = {WhitePoint::kCustom, WhitePoint::kD65, + WhitePoint::kD65}; + const Primaries primaries[3] = {Primaries::kCustom, Primaries::kSRGB, + Primaries::k2100}; + + for (size_t i = 0; i < 3; ++i) { + const CodecInOut io = DecodeRoundtrip(paths[i], Codec::kPNG, &pool); + EXPECT_EQ(16, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, 
io.metadata.m.bit_depth.exponent_bits_per_sample); + + EXPECT_EQ(64, io.xsize()); + EXPECT_EQ(64, io.ysize()); + EXPECT_FALSE(io.metadata.m.HasAlpha()); + + const ColorEncoding& c_original = io.metadata.m.color_encoding; + EXPECT_FALSE(c_original.ICC().empty()); + EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace()); + EXPECT_EQ(white_points[i], c_original.white_point); + EXPECT_EQ(primaries[i], c_original.primaries); + EXPECT_TRUE(c_original.tf.IsLinear()); + } +} + +TEST(CodecTest, TestMetadataICC) { + ThreadPoolInternal pool(12); + + const char* paths[] = { + "raw.pixls/DJI-FC6310-16bit_709_v4_krita.png", + "raw.pixls/Sony-DSC-RX1RM2-14bit_709_v4_krita.png", + }; + for (const char* relative_pathname : paths) { + const CodecInOut io = + DecodeRoundtrip(relative_pathname, Codec::kPNG, &pool); + EXPECT_GE(16, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_LE(14, io.metadata.m.bit_depth.bits_per_sample); + + EXPECT_EQ(64, io.xsize()); + EXPECT_EQ(64, io.ysize()); + EXPECT_FALSE(io.metadata.m.HasAlpha()); + + const ColorEncoding& c_original = io.metadata.m.color_encoding; + EXPECT_FALSE(c_original.ICC().empty()); + EXPECT_EQ(RenderingIntent::kPerceptual, c_original.rendering_intent); + EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace()); + EXPECT_EQ(WhitePoint::kD65, c_original.white_point); + EXPECT_EQ(Primaries::kSRGB, c_original.primaries); + EXPECT_EQ(TransferFunction::k709, c_original.tf.GetTransferFunction()); + } +} + +TEST(CodecTest, TestPNGSuite) { + ThreadPoolInternal pool(12); + + // Ensure we can load PNG with text, japanese UTF-8, compressed text. 
+ (void)DecodeRoundtrip("pngsuite/ct1n0g04.png", Codec::kPNG, &pool); + (void)DecodeRoundtrip("pngsuite/ctjn0g04.png", Codec::kPNG, &pool); + (void)DecodeRoundtrip("pngsuite/ctzn0g04.png", Codec::kPNG, &pool); + + // Extract gAMA + const CodecInOut b1 = + DecodeRoundtrip("pngsuite/g10n3p04.png", Codec::kPNG, &pool); + EXPECT_TRUE(b1.metadata.color_encoding.tf.IsLinear()); + + // Extract cHRM + const CodecInOut b_p = + DecodeRoundtrip("pngsuite/ccwn2c08.png", Codec::kPNG, &pool); + EXPECT_EQ(Primaries::kSRGB, b_p.metadata.color_encoding.primaries); + EXPECT_EQ(WhitePoint::kD65, b_p.metadata.color_encoding.white_point); + + // Extract EXIF from (new-style) dedicated chunk + const CodecInOut b_exif = + DecodeRoundtrip("pngsuite/exif2c08.png", Codec::kPNG, &pool); + EXPECT_EQ(978, b_exif.blobs.exif.size()); +} +#endif + +void VerifyWideGamutMetadata(const std::string& relative_pathname, + const Primaries primaries, ThreadPool* pool) { + const CodecInOut io = DecodeRoundtrip(relative_pathname, Codec::kPNG, pool); + + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + + const ColorEncoding& c_original = io.metadata.m.color_encoding; + EXPECT_FALSE(c_original.ICC().empty()); + EXPECT_EQ(RenderingIntent::kAbsolute, c_original.rendering_intent); + EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace()); + EXPECT_EQ(WhitePoint::kD65, c_original.white_point); + EXPECT_EQ(primaries, c_original.primaries); +} + +TEST(CodecTest, TestWideGamut) { + ThreadPoolInternal pool(12); + // VerifyWideGamutMetadata("wide-gamut-tests/P3-sRGB-color-bars.png", + // Primaries::kP3, &pool); + VerifyWideGamutMetadata("wide-gamut-tests/P3-sRGB-color-ring.png", + Primaries::kP3, &pool); + // VerifyWideGamutMetadata("wide-gamut-tests/R2020-sRGB-color-bars.png", + // Primaries::k2100, &pool); + // VerifyWideGamutMetadata("wide-gamut-tests/R2020-sRGB-color-ring.png", 
+ // Primaries::k2100, &pool); +} + +TEST(CodecTest, TestPNM) { TestCodecPNM(); } +TEST(CodecTest, TestPGX) { TestCodecPGX(); } + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/time.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/time.cc new file mode 100644 index 0000000000..73d1b8f260 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/time.cc @@ -0,0 +1,60 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/time.h" + +#include +#include +#include + +#include + +#include "lib/jxl/base/os_macros.h" // for JXL_OS_* + +#if JXL_OS_WIN +#ifndef NOMINMAX +#define NOMINMAX +#endif // NOMINMAX +#include +#endif // JXL_OS_WIN + +#if JXL_OS_MAC +#include +#include +#endif // JXL_OS_MAC + +#if JXL_OS_HAIKU +#include +#endif // JXL_OS_HAIKU + +namespace jxl { + +double Now() { +#if JXL_OS_WIN + LARGE_INTEGER counter; + (void)QueryPerformanceCounter(&counter); + LARGE_INTEGER freq; + (void)QueryPerformanceFrequency(&freq); + return double(counter.QuadPart) / freq.QuadPart; +#elif JXL_OS_MAC + const auto t = mach_absolute_time(); + // On OSX/iOS platform the elapsed time is cpu time unit + // We have to query the time base information to convert it back + // See https://developer.apple.com/library/mac/qa/qa1398/_index.html + static mach_timebase_info_data_t timebase; + if (timebase.denom == 0) { + (void)mach_timebase_info(&timebase); + } + return double(t) * timebase.numer / timebase.denom * 1E-9; +#elif JXL_OS_HAIKU + return double(system_time_nsecs()) * 1E-9; +#else + timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return t.tv_sec + t.tv_nsec * 1E-9; +#endif +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/time.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/time.h new file mode 100644 index 0000000000..c71414b877 --- 
/dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/time.h @@ -0,0 +1,19 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_EXTRAS_TIME_H_ +#define LIB_EXTRAS_TIME_H_ + +// OS-specific function for timing. + +namespace jxl { + +// Returns current time [seconds] from a monotonic clock with unspecified +// starting point - only suitable for computing elapsed time. +double Now(); + +} // namespace jxl + +#endif // LIB_EXTRAS_TIME_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping.cc new file mode 100644 index 0000000000..9bb1c0559c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping.cc @@ -0,0 +1,160 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/extras/tone_mapping.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/extras/tone_mapping.cc" +#include +#include + +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +Status ToneMapFrame(const std::pair display_nits, + ImageBundle* const ib, ThreadPool* const pool) { + // Perform tone mapping as described in Report ITU-R BT.2390-8, section 5.4 + // (pp. 23-25). 
+ // https://www.itu.int/pub/R-REP-BT.2390-8-2020 + + HWY_FULL(float) df; + using V = decltype(Zero(df)); + + ColorEncoding linear_rec2020; + linear_rec2020.SetColorSpace(ColorSpace::kRGB); + linear_rec2020.primaries = Primaries::k2100; + linear_rec2020.white_point = WhitePoint::kD65; + linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear); + JXL_RETURN_IF_ERROR(linear_rec2020.CreateICC()); + JXL_RETURN_IF_ERROR(ib->TransformTo(linear_rec2020, pool)); + + const auto eotf_inv = [&df](const V luminance) -> V { + return TF_PQ().EncodedFromDisplay(df, luminance * Set(df, 1. / 10000)); + }; + + const V pq_mastering_min = + eotf_inv(Set(df, ib->metadata()->tone_mapping.min_nits)); + const V pq_mastering_max = + eotf_inv(Set(df, ib->metadata()->tone_mapping.intensity_target)); + const V pq_mastering_range = pq_mastering_max - pq_mastering_min; + const V inv_pq_mastering_range = + Set(df, 1) / (pq_mastering_max - pq_mastering_min); + const V min_lum = (eotf_inv(Set(df, display_nits.first)) - pq_mastering_min) * + inv_pq_mastering_range; + const V max_lum = + (eotf_inv(Set(df, display_nits.second)) - pq_mastering_min) * + inv_pq_mastering_range; + const V ks = MulAdd(Set(df, 1.5f), max_lum, Set(df, -0.5f)); + const V b = min_lum; + + const V inv_one_minus_ks = Set(df, 1) / Max(Set(df, 1e-6f), Set(df, 1) - ks); + const auto T = [ks, inv_one_minus_ks](const V a) { + return (a - ks) * inv_one_minus_ks; + }; + const auto P = [&T, &df, ks, max_lum](const V b) { + const V t_b = T(b); + const V t_b_2 = t_b * t_b; + const V t_b_3 = t_b_2 * t_b; + return MulAdd( + MulAdd(Set(df, 2), t_b_3, MulAdd(Set(df, -3), t_b_2, Set(df, 1))), ks, + MulAdd(t_b_3 + MulAdd(Set(df, -2), t_b_2, t_b), Set(df, 1) - ks, + MulAdd(Set(df, -2), t_b_3, Set(df, 3) * t_b_2) * max_lum)); + }; + + const V inv_max_display_nits = Set(df, 1 / display_nits.second); + + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, ib->ysize(), ThreadPool::SkipInit(), + [&](const int y, const int thread) { + float* const 
JXL_RESTRICT row_r = ib->color()->PlaneRow(0, y); + float* const JXL_RESTRICT row_g = ib->color()->PlaneRow(1, y); + float* const JXL_RESTRICT row_b = ib->color()->PlaneRow(2, y); + for (size_t x = 0; x < ib->xsize(); x += Lanes(df)) { + V red = Load(df, row_r + x); + V green = Load(df, row_g + x); + V blue = Load(df, row_b + x); + const V luminance = Set(df, ib->metadata()->IntensityTarget()) * + (MulAdd(Set(df, 0.2627f), red, + MulAdd(Set(df, 0.6780f), green, + Set(df, 0.0593f) * blue))); + const V normalized_pq = + Min(Set(df, 1.f), (eotf_inv(luminance) - pq_mastering_min) * + inv_pq_mastering_range); + const V e2 = + IfThenElse(normalized_pq < ks, normalized_pq, P(normalized_pq)); + const V one_minus_e2 = Set(df, 1) - e2; + const V one_minus_e2_2 = one_minus_e2 * one_minus_e2; + const V one_minus_e2_4 = one_minus_e2_2 * one_minus_e2_2; + const V e3 = MulAdd(b, one_minus_e2_4, e2); + const V e4 = MulAdd(e3, pq_mastering_range, pq_mastering_min); + const V new_luminance = + Min(Set(df, display_nits.second), + ZeroIfNegative(Set(df, 10000) * + TF_PQ().DisplayFromEncoded(df, e4))); + + const V ratio = new_luminance / luminance; + const V multiplier = ratio * + Set(df, ib->metadata()->IntensityTarget()) * + inv_max_display_nits; + + red *= multiplier; + green *= multiplier; + blue *= multiplier; + + const V gray = new_luminance * inv_max_display_nits; + + // Desaturate out-of-gamut pixels. 
+ V gray_mix = Zero(df); + for (const V val : {red, green, blue}) { + const V inv_val_minus_gray = Set(df, 1) / (val - gray); + const V bound1 = val * inv_val_minus_gray; + const V bound2 = bound1 - inv_val_minus_gray; + const V min_bound = Min(bound1, bound2); + const V max_bound = Max(bound1, bound2); + gray_mix = Clamp(gray_mix, min_bound, max_bound); + } + gray_mix = Clamp(gray_mix, Zero(df), Set(df, 1)); + for (V* const val : {&red, &green, &blue}) { + *val = IfThenElse(luminance < Set(df, 1e-6), gray, + MulAdd(gray_mix, gray - *val, *val)); + } + + Store(red, df, row_r + x); + Store(green, df, row_g + x); + Store(blue, df, row_b + x); + } + }, + "ToneMap")); + + return true; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +namespace { +HWY_EXPORT(ToneMapFrame); +} + +Status ToneMapTo(const std::pair display_nits, + CodecInOut* const io, ThreadPool* const pool) { + const auto tone_map_frame = HWY_DYNAMIC_DISPATCH(ToneMapFrame); + for (ImageBundle& ib : io->frames) { + JXL_RETURN_IF_ERROR(tone_map_frame(display_nits, &ib, pool)); + } + io->metadata.m.SetIntensityTarget(display_nits.second); + return true; +} + +} // namespace jxl +#endif diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping.h b/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping.h new file mode 100644 index 0000000000..4f9feeccc6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping.h @@ -0,0 +1,18 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_EXTRAS_TONE_MAPPING_H_ +#define LIB_EXTRAS_TONE_MAPPING_H_ + +#include "lib/jxl/codec_in_out.h" + +namespace jxl { + +Status ToneMapTo(std::pair display_nits, CodecInOut* io, + ThreadPool* pool = nullptr); + +} // namespace jxl + +#endif // LIB_EXTRAS_TONE_MAPPING_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping_gbench.cc b/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping_gbench.cc new file mode 100644 index 0000000000..c87c9fcc21 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/extras/tone_mapping_gbench.cc @@ -0,0 +1,45 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark/benchmark.h" +#include "lib/extras/codec.h" +#include "lib/extras/tone_mapping.h" +#include "lib/jxl/testdata.h" + +namespace jxl { + +static void BM_ToneMapping(benchmark::State& state) { + CodecInOut image; + const PaddedBytes image_bytes = + ReadTestData("imagecompression.info/flower_foveon.png"); + JXL_CHECK(SetFromBytes(Span(image_bytes), &image)); + + // Convert to linear Rec. 2020 so that `ToneMapTo` doesn't have to and we + // mainly measure the tone mapping itself. 
+ ColorEncoding linear_rec2020; + linear_rec2020.SetColorSpace(ColorSpace::kRGB); + linear_rec2020.primaries = Primaries::k2100; + linear_rec2020.white_point = WhitePoint::kD65; + linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear); + JXL_CHECK(linear_rec2020.CreateICC()); + JXL_CHECK(image.TransformTo(linear_rec2020)); + + for (auto _ : state) { + state.PauseTiming(); + CodecInOut tone_mapping_input; + tone_mapping_input.SetFromImage(CopyImage(*image.Main().color()), + image.Main().c_current()); + tone_mapping_input.metadata.m.SetIntensityTarget( + image.metadata.m.IntensityTarget()); + state.ResumeTiming(); + + JXL_CHECK(ToneMapTo({0.1, 100}, &tone_mapping_input)); + } + + state.SetItemsProcessed(state.iterations() * image.xsize() * image.ysize()); +} +BENCHMARK(BM_ToneMapping); + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/butteraugli.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/butteraugli.h new file mode 100644 index 0000000000..f543413b8c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/butteraugli.h @@ -0,0 +1,156 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @file butteraugli.h + * @brief Butteraugli API for JPEG XL. + */ + +#ifndef JXL_BUTTERAUGLI_H_ +#define JXL_BUTTERAUGLI_H_ + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#include "jxl/jxl_export.h" +#include "jxl/memory_manager.h" +#include "jxl/parallel_runner.h" +#include "jxl/types.h" + +/** + * Opaque structure that holds a butteraugli API. + * + * Allocated and initialized with JxlButteraugliApiCreate(). + * Cleaned up and deallocated with JxlButteraugliApiDestroy(). + */ +typedef struct JxlButteraugliApiStruct JxlButteraugliApi; + +/** + * Opaque structure that holds intermediary butteraugli results. 
+ * + * Allocated and initialized with JxlButteraugliCompute(). + * Cleaned up and deallocated with JxlButteraugliResultDestroy(). + */ +typedef struct JxlButteraugliResultStruct JxlButteraugliResult; + +/** + * Deinitializes and frees JxlButteraugliResult instance. + * + * @param result instance to be cleaned up and deallocated. + */ +JXL_EXPORT void JxlButteraugliResultDestroy(JxlButteraugliResult* result); + +/** + * Creates an instance of JxlButteraugliApi and initializes it. + * + * @p memory_manager will be used for all the library dynamic allocations made + * from this instance. The parameter may be NULL, in which case the default + * allocator will be used. See jxl/memory_manager.h for details. + * + * @param memory_manager custom allocator function. It may be NULL. The memory + * manager will be copied internally. + * @return @c NULL if the instance can not be allocated or initialized + * @return pointer to initialized JxlEncoder otherwise + */ +JXL_EXPORT JxlButteraugliApi* JxlButteraugliApiCreate( + const JxlMemoryManager* memory_manager); + +/** + * Set the parallel runner for multithreading. + * + * @param api api instance. + * @param parallel_runner function pointer to runner for multithreading. A + * multithreaded runner should be set to reach fast performance. + * @param parallel_runner_opaque opaque pointer for parallel_runner. + */ +JXL_EXPORT void JxlButteraugliApiSetParallelRunner( + JxlButteraugliApi* api, JxlParallelRunner parallel_runner, + void* parallel_runner_opaque); + +/** + * Set the hf_asymmetry option for butteraugli. + * + * @param api api instance. + * @param v new hf_asymmetry value. + */ +JXL_EXPORT void JxlButteraugliApiSetHFAsymmetry(JxlButteraugliApi* api, + float v); + +/** + * Set the intensity_target option for butteraugli. + * + * @param api api instance. + * @param v new intensity_target value. 
+ */ +JXL_EXPORT void JxlButteraugliApiSetIntensityTarget(JxlButteraugliApi* api, + float v); + +/** + * Deinitializes and frees JxlButteraugliApi instance. + * + * @param api instance to be cleaned up and deallocated. + */ +JXL_EXPORT void JxlButteraugliApiDestroy(JxlButteraugliApi* api); + +/** + * Computes intermediary butteraugli result between an original image and a + * distortion. + * + * @param api api instance for this computation. + * @param xsize width of the compared images. + * @param ysize height of the compared images. + * @param pixel_format_orig pixel format for original image. + * @param buffer_orig pixel data for original image. + * @param size_orig size of buffer_orig in bytes. + * @param pixel_format_dist pixel format for distortion. + * @param buffer_dist pixel data for distortion. + * @param size_dist size of buffer_dist in bytes. + * @return @c NULL if the results can not be computed or initialized. + * @return pointer to initialized and computed intermediary result. + */ +JXL_EXPORT JxlButteraugliResult* JxlButteraugliCompute( + const JxlButteraugliApi* api, uint32_t xsize, uint32_t ysize, + const JxlPixelFormat* pixel_format_orig, const void* buffer_orig, + size_t size_orig, const JxlPixelFormat* pixel_format_dist, + const void* buffer_dist, size_t size_dist); + +/** + * Computes butteraugli max distance based on an intermediary butteraugli + * result. + * + * @param result intermediary result instance. + * @return max distance. + */ +JXL_EXPORT float JxlButteraugliResultGetMaxDistance( + const JxlButteraugliResult* result); + +/** + * Computes a butteraugli distance based on an intermediary butteraugli result. + * + * @param result intermediary result instance. + * @param pnorm pnorm to calculate. + * @return distance using the given pnorm. + */ +JXL_EXPORT float JxlButteraugliResultGetDistance( + const JxlButteraugliResult* result, float pnorm); + +/** + * Get a pointer to the distmap in the result. 
+ * + * @param result intermediary result instance. + * @param buffer will be set to the distmap. The distance value for (x,y) will + * be available at buffer + y * row_stride + x. + * @param row_stride will be set to the row stride of the distmap. + */ +JXL_EXPORT void JxlButteraugliResultGetDistmap( + const JxlButteraugliResult* result, const float** buffer, + uint32_t* row_stride); + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_BUTTERAUGLI_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/butteraugli_cxx.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/butteraugli_cxx.h new file mode 100644 index 0000000000..c0e93ad74a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/butteraugli_cxx.h @@ -0,0 +1,55 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/// @file butteraugli_cxx.h +/// @brief C++ header-only helper for @ref butteraugli.h. +/// +/// There's no binary library associated with the header since this is a header +/// only library. + +#ifndef JXL_BUTTERAUGLI_CXX_H_ +#define JXL_BUTTERAUGLI_CXX_H_ + +#include + +#include "jxl/butteraugli.h" + +#if !(defined(__cplusplus) || defined(c_plusplus)) +#error "This a C++ only header. Use jxl/butteraugli.h from C sources." +#endif + +/// Struct to call JxlButteraugliApiDestroy from the JxlButteraugliApiPtr +/// unique_ptr. +struct JxlButteraugliApiDestroyStruct { + /// Calls @ref JxlButteraugliApiDestroy() on the passed api. + void operator()(JxlButteraugliApi* api) { JxlButteraugliApiDestroy(api); } +}; + +/// std::unique_ptr<> type that calls JxlButteraugliApiDestroy() when releasing +/// the pointer. +/// +/// Use this helper type from C++ sources to ensure the api is destroyed and +/// their internal resources released. 
+typedef std::unique_ptr + JxlButteraugliApiPtr; + +/// Struct to call JxlButteraugliResultDestroy from the JxlButteraugliResultPtr +/// unique_ptr. +struct JxlButteraugliResultDestroyStruct { + /// Calls @ref JxlButteraugliResultDestroy() on the passed result object. + void operator()(JxlButteraugliResult* result) { + JxlButteraugliResultDestroy(result); + } +}; + +/// std::unique_ptr<> type that calls JxlButteraugliResultDestroy() when +/// releasing the pointer. +/// +/// Use this helper type from C++ sources to ensure the result object is +/// destroyed and their internal resources released. +typedef std::unique_ptr + JxlButteraugliResultPtr; + +#endif // JXL_BUTTERAUGLI_CXX_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/codestream_header.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/codestream_header.h new file mode 100644 index 0000000000..04e40d11d9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/codestream_header.h @@ -0,0 +1,311 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @file codestream_header.h + * @brief Definitions of structs and enums for the metadata from the JPEG XL + * codestream headers (signature, metadata, preview dimensions, ...), excluding + * color encoding which is in color_encoding.h. + */ + +#ifndef JXL_CODESTREAM_HEADER_H_ +#define JXL_CODESTREAM_HEADER_H_ + +#include +#include + +#include "jxl/color_encoding.h" +#include "jxl/types.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** Image orientation metadata. + * Values 1..8 match the EXIF definitions. + * The name indicates the operation to perform to transform from the encoded + * image to the display image. 
+ */ +typedef enum { + JXL_ORIENT_IDENTITY = 1, + JXL_ORIENT_FLIP_HORIZONTAL = 2, + JXL_ORIENT_ROTATE_180 = 3, + JXL_ORIENT_FLIP_VERTICAL = 4, + JXL_ORIENT_TRANSPOSE = 5, + JXL_ORIENT_ROTATE_90_CW = 6, + JXL_ORIENT_ANTI_TRANSPOSE = 7, + JXL_ORIENT_ROTATE_90_CCW = 8, +} JxlOrientation; + +/** Given type of an extra channel. + */ +typedef enum { + JXL_CHANNEL_ALPHA, + JXL_CHANNEL_DEPTH, + JXL_CHANNEL_SPOT_COLOR, + JXL_CHANNEL_SELECTION_MASK, + JXL_CHANNEL_BLACK, + JXL_CHANNEL_CFA, + JXL_CHANNEL_THERMAL, + JXL_CHANNEL_RESERVED0, + JXL_CHANNEL_RESERVED1, + JXL_CHANNEL_RESERVED2, + JXL_CHANNEL_RESERVED3, + JXL_CHANNEL_RESERVED4, + JXL_CHANNEL_RESERVED5, + JXL_CHANNEL_RESERVED6, + JXL_CHANNEL_RESERVED7, + JXL_CHANNEL_UNKNOWN, + JXL_CHANNEL_OPTIONAL +} JxlExtraChannelType; + +/** The codestream preview header */ +typedef struct { + /** Preview width in pixels */ + uint32_t xsize; + + /** Preview height in pixels */ + uint32_t ysize; +} JxlPreviewHeader; + +/** The codestream animation header, optionally present in the beginning of + * the codestream, and if it is it applies to all animation frames, unlike + * JxlFrameHeader which applies to an individual frame. + */ +typedef struct { + /** Numerator of ticks per second of a single animation frame time unit */ + uint32_t tps_numerator; + + /** Denominator of ticks per second of a single animation frame time unit */ + uint32_t tps_denominator; + + /** Amount of animation loops, or 0 to repeat infinitely */ + uint32_t num_loops; + + /** Whether animation time codes are present at animation frames in the + * codestream */ + JXL_BOOL have_timecodes; +} JxlAnimationHeader; + +/** Basic image information. This information is available from the file + * signature and first part of the codestream header. + */ +typedef struct JxlBasicInfo { + /* TODO(lode): need additional fields for (transcoded) JPEG? For reusable + * fields orientation must be read from Exif APP1. 
For has_icc_profile: must + * look up where ICC profile is guaranteed to be in a JPEG file to be able to + * indicate this. */ + + /* TODO(lode): make struct packed, and/or make this opaque struct with getter + * functions (still separate struct from opaque decoder) */ + + /** Whether the codestream is embedded in the container format. If true, + * metadata information and extensions may be available in addition to the + * codestream. + */ + JXL_BOOL have_container; + + /** Width of the image in pixels, before applying orientation. + */ + uint32_t xsize; + + /** Height of the image in pixels, before applying orientation. + */ + uint32_t ysize; + + /** Original image color channel bit depth. + */ + uint32_t bits_per_sample; + + /** Original image color channel floating point exponent bits, or 0 if they + * are unsigned integer. For example, if the original data is half-precision + * (binary16) floating point, bits_per_sample is 16 and + * exponent_bits_per_sample is 5, and so on for other floating point + * precisions. + */ + uint32_t exponent_bits_per_sample; + + /** Upper bound on the intensity level present in the image in nits. For + * unsigned integer pixel encodings, this is the brightness of the largest + * representable value. The image does not necessarily contain a pixel + * actually this bright. An encoder is allowed to set 255 for SDR images + * without computing a histogram. + */ + float intensity_target; + + /** Lower bound on the intensity level present in the image. This may be + * loose, i.e. lower than the actual darkest pixel. When tone mapping, a + * decoder will map [min_nits, intensity_target] to the display range. + */ + float min_nits; + + /** See the description of @see linear_below. + */ + JXL_BOOL relative_to_max_display; + + /** The tone mapping will leave unchanged (linear mapping) any pixels whose + * brightness is strictly below this. The interpretation depends on + * relative_to_max_display. 
If true, this is a ratio [0, 1] of the maximum + * display brightness [nits], otherwise an absolute brightness [nits]. + */ + float linear_below; + + /** Whether the data in the codestream is encoded in the original color + * profile that is attached to the codestream metadata header, or is + * encoded in an internally supported absolute color space (which the decoder + * can always convert to linear or non-linear sRGB or to XYB). If the original + * profile is used, the decoder outputs pixel data in the color space matching + * that profile, but doesn't convert it to any other color space. If the + * original profile is not used, the decoder only outputs the data as sRGB + * (linear if outputting to floating point, nonlinear with standard sRGB + * transfer function if outputting to unsigned integers) but will not convert + * it to the original color profile. The decoder also does not convert to + * the target display color profile, but instead will always indicate which + * color profile the returned pixel data is encoded in when using @see + * JXL_COLOR_PROFILE_TARGET_DATA so that a CMS can be used to convert the + * data. + */ + JXL_BOOL uses_original_profile; + + /** Indicates a preview image exists near the beginning of the codestream. + * The preview pixels are not in the basic info; see preview for dimensions. + */ + JXL_BOOL have_preview; + + /** Indicates animation frames exist in the codestream. Per-frame animation + * data is not in the basic info; see animation for the global header. + */ + JXL_BOOL have_animation; + + /** Image orientation, value 1-8 matching the values used by JEITA CP-3451C + * (Exif version 2.3). + */ + JxlOrientation orientation; + + /** Number of color channels encoded in the image, this is either 1 for + * grayscale data, or 3 for colored data. This count does not include + * the alpha channel or other extra channels. To check presence of an alpha + * channel, such as in the case of RGBA color, check alpha_bits != 0.
+ * If and only if this is 1, the JxlColorSpace in the JxlColorEncoding is + * JXL_COLOR_SPACE_GRAY. + */ + uint32_t num_color_channels; + + /** Number of additional image channels. This includes the main alpha channel, + * but can also include additional channels such as depth, additional alpha + * channels, spot colors, and so on. Information about the extra channels + * can be queried with JxlDecoderGetExtraChannelInfo. The main alpha channel, + * if it exists, also has its information available in the alpha_bits, + * alpha_exponent_bits and alpha_premultiplied fields in this JxlBasicInfo. + */ + uint32_t num_extra_channels; + + /** Bit depth of the encoded alpha channel, or 0 if there is no alpha channel. + */ + uint32_t alpha_bits; + + /** Alpha channel floating point exponent bits, or 0 if they are unsigned + * integer. + */ + uint32_t alpha_exponent_bits; + + /** Whether the alpha channel is premultiplied + */ + JXL_BOOL alpha_premultiplied; + + /** Dimensions of encoded preview image, only used if have_preview is + * JXL_TRUE. + */ + JxlPreviewHeader preview; + + /** Animation header with global animation properties for all frames, only + * used if have_animation is JXL_TRUE. + */ + JxlAnimationHeader animation; +} JxlBasicInfo; + +/** Information for a single extra channel. + */ +typedef struct { + /** Given type of an extra channel. + */ + JxlExtraChannelType type; + + /** Total bits per sample for this channel. + */ + uint32_t bits_per_sample; + + /** Floating point exponent bits per channel, or 0 if they are unsigned + * integer. + */ + uint32_t exponent_bits_per_sample; + + /** The exponent the channel is downsampled by on each axis. + * TODO(lode): expand this comment to match the JPEG XL specification, + * specify how to upscale, how to round the size computation, and to which + * extra channels this field applies. + */ + uint32_t dim_shift; + + /** Length of the extra channel name in bytes, or 0 if no name. + * Excludes null termination character. 
+ */ + uint32_t name_length; + + /** Whether alpha channel uses premultiplied alpha. Only applicable if + * type is JXL_CHANNEL_ALPHA. + */ + JXL_BOOL alpha_associated; + + /** Spot color of the current spot channel in linear RGBA. Only applicable if + * type is JXL_CHANNEL_SPOT_COLOR. + */ + float spot_color[4]; + + /** Only applicable if type is JXL_CHANNEL_CFA. + * TODO(lode): add comment about the meaning of this field. + */ + uint32_t cfa_channel; +} JxlExtraChannelInfo; + +/* TODO(lode): add API to get the codestream header extensions. */ +/** Extensions in the codestream header. */ +typedef struct { + /** Extension bits. */ + uint64_t extensions; +} JxlHeaderExtensions; + +/** The header of one displayed frame. */ +typedef struct { + /** How long to wait after rendering in ticks. The duration in seconds of a + * tick is given by tps_numerator and tps_denominator in JxlAnimationHeader. + */ + uint32_t duration; + + /** SMPTE timecode of the current frame in form 0xHHMMSSFF, or 0. The bits are + * interpreted from most-significant to least-significant as hour, minute, + * second, and frame. If timecode is nonzero, it is strictly larger than that + * of a previous frame with nonzero duration. These values are only available + * if have_timecodes in JxlAnimationHeader is JXL_TRUE. + * This value is only used if have_timecodes in JxlAnimationHeader is + * JXL_TRUE. + */ + uint32_t timecode; + + /** Length of the frame name in bytes, or 0 if no name. + * Excludes null termination character. + */ + uint32_t name_length; + + /** Indicates this is the last animation frame. 
+ */ + JXL_BOOL is_last; +} JxlFrameHeader; + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_CODESTREAM_HEADER_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/color_encoding.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/color_encoding.h new file mode 100644 index 0000000000..e86dae369a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/color_encoding.h @@ -0,0 +1,145 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @file color_encoding.h + * @brief Color Encoding definitions used by JPEG XL. + * All CIE units are for the standard 1931 2 degree observer. + */ + +#ifndef JXL_COLOR_ENCODING_H_ +#define JXL_COLOR_ENCODING_H_ + +#include + +#include "jxl/types.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** Color space of the image data. */ +typedef enum { + /** Tristimulus RGB */ + JXL_COLOR_SPACE_RGB, + /** Luminance based, the primaries in JxlColorEncoding must be ignored. This + * value implies that num_color_channels in JxlBasicInfo is 1, any other value + * implies num_color_channels is 3. */ + JXL_COLOR_SPACE_GRAY, + /** XYB (opsin) color space */ + JXL_COLOR_SPACE_XYB, + /** None of the other table entries describe the color space appropriately */ + JXL_COLOR_SPACE_UNKNOWN, +} JxlColorSpace; + +/** Built-in whitepoints for color encoding. Numeric values match CICP (Rec. + * ITU-T H.273 | ISO/IEC 23091-2:2019(E)). */ +typedef enum { + /** CIE Standard Illuminant D65: 0.3127, 0.3290 */ + JXL_WHITE_POINT_D65 = 1, + /** Custom white point stored in JxlColorEncoding white_point. 
*/ + JXL_WHITE_POINT_CUSTOM = 2, + /** CIE Standard Illuminant E (equal-energy): 1/3, 1/3 */ + JXL_WHITE_POINT_E = 10, + /** DCI-P3 from SMPTE RP 431-2: 0.314, 0.351 */ + JXL_WHITE_POINT_DCI = 11, +} JxlWhitePoint; + +/** Built-in primaries for color encoding. Numeric values match CICP (Rec. ITU-T + * H.273 | ISO/IEC 23091-2:2019(E)). */ +typedef enum { + /** The CIE xy values of the red, green and blue primaries are: 0.639998686, + 0.330010138; 0.300003784, 0.600003357; 0.150002046, 0.059997204 */ + JXL_PRIMARIES_SRGB = 1, + /** Custom primaries stored in JxlColorEncoding primaries_red_xy, + primaries_green_xy and primaries_blue_xy. */ + JXL_PRIMARIES_CUSTOM = 2, + /** As specified in Rec. ITU-R BT.2100-1 */ + JXL_PRIMARIES_2100 = 9, + /** As specified in SMPTE RP 431-2 */ + JXL_PRIMARIES_P3 = 11, +} JxlPrimaries; + +/** Built-in transfer functions for color encoding. Numeric values match CICP + * (Rec. ITU-T H.273 | ISO/IEC 23091-2:2019(E)) unless specified otherwise. */ +typedef enum { + /** As specified in Rec. ITU-R BT.709-6 */ + JXL_TRANSFER_FUNCTION_709 = 1, + /** None of the other table entries describe the transfer function. */ + JXL_TRANSFER_FUNCTION_UNKNOWN = 2, + /** The gamma exponent is 1 */ + JXL_TRANSFER_FUNCTION_LINEAR = 8, + /** As specified in IEC 61966-2-1 sRGB */ + JXL_TRANSFER_FUNCTION_SRGB = 13, + /** As specified in SMPTE ST 2084 */ + JXL_TRANSFER_FUNCTION_PQ = 16, + /** As specified in SMPTE ST 428-1 */ + JXL_TRANSFER_FUNCTION_DCI = 17, + /** As specified in Rec. ITU-R BT.2100-1 (HLG) */ + JXL_TRANSFER_FUNCTION_HLG = 18, + /** Transfer function follows power law given by the gamma value in + JxlColorEncoding. Not a CICP value.
*/ + JXL_TRANSFER_FUNCTION_GAMMA = 65535, +} JxlTransferFunction; + +/** Rendering intent for color encoding, as specified in ISO 15076-1:2010 */ +typedef enum { + /** vendor-specific */ + JXL_RENDERING_INTENT_PERCEPTUAL = 0, + /** media-relative */ + JXL_RENDERING_INTENT_RELATIVE, + /** vendor-specific */ + JXL_RENDERING_INTENT_SATURATION, + /** ICC-absolute */ + JXL_RENDERING_INTENT_ABSOLUTE, +} JxlRenderingIntent; + +/** Color encoding of the image as structured information. + */ +typedef struct { + /** Color space of the image data. + */ + JxlColorSpace color_space; + + /** Built-in white point. If this value is JXL_WHITE_POINT_CUSTOM, must + * use the numerical whitepoint values from white_point_xy. + */ + JxlWhitePoint white_point; + + /** Numerical whitepoint values in CIE xy space. */ + double white_point_xy[2]; + + /** Built-in RGB primaries. If this value is JXL_PRIMARIES_CUSTOM, must + * use the numerical primaries values below. This field and the custom values + * below are unused and must be ignored if the color space is + * JXL_COLOR_SPACE_GRAY or JXL_COLOR_SPACE_XYB. + */ + JxlPrimaries primaries; + + /** Numerical red primary values in CIE xy space. */ + double primaries_red_xy[2]; + + /** Numerical green primary values in CIE xy space. */ + double primaries_green_xy[2]; + + /** Numerical blue primary values in CIE xy space. */ + double primaries_blue_xy[2]; + + /** Transfer function. If it is JXL_TRANSFER_FUNCTION_GAMMA, see gamma. */ + JxlTransferFunction transfer_function; + + /** Gamma value used when transfer_function is JXL_TRANSFER_FUNCTION_GAMMA + */ + double gamma; + + /** Rendering intent defined for the color profile.
*/ + JxlRenderingIntent rendering_intent; +} JxlColorEncoding; + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_COLOR_ENCODING_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/decode.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/decode.h new file mode 100644 index 0000000000..888058682e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/decode.h @@ -0,0 +1,888 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @file decode.h + * @brief Decoding API for JPEG XL. + */ + +#ifndef JXL_DECODE_H_ +#define JXL_DECODE_H_ + +#include +#include + +#include "jxl/codestream_header.h" +#include "jxl/color_encoding.h" +#include "jxl/jxl_export.h" +#include "jxl/memory_manager.h" +#include "jxl/parallel_runner.h" +#include "jxl/types.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** + * Decoder library version. + * + * @return the decoder library version as an integer: + * MAJOR_VERSION * 1000000 + MINOR_VERSION * 1000 + PATCH_VERSION. For example, + * version 1.2.3 would return 1002003. + */ +JXL_EXPORT uint32_t JxlDecoderVersion(void); + +/** The result of JxlSignatureCheck. + */ +typedef enum { + /** Not enough bytes were passed to determine if a valid signature was found. + */ + JXL_SIG_NOT_ENOUGH_BYTES = 0, + + /** No valid JPEGXL header was found. */ + JXL_SIG_INVALID = 1, + + /** A valid JPEG XL codestream signature was found, that is a JPEG XL image + * without container. + */ + JXL_SIG_CODESTREAM = 2, + + /** A valid container signature was found, that is a JPEG XL image embedded + * in a box format container. + */ + JXL_SIG_CONTAINER = 3, +} JxlSignature; + +/** + * JPEG XL signature identification. + * + * Checks if the passed buffer contains a valid JPEG XL signature. 
The passed @p + * buf of size + * @p len doesn't need to be a full image, only the beginning of the file. + * + * @return a flag indicating if a JPEG XL signature was found and what type. + * - JXL_SIG_NOT_ENOUGH_BYTES: not enough bytes were passed to determine + * if a valid signature is there. + * - JXL_SIG_INVALID: no valid signature found for JPEG XL decoding. + * - JXL_SIG_CODESTREAM: a valid JPEG XL codestream signature was found. + * - JXL_SIG_CONTAINER: a valid JPEG XL container signature was found. + */ +JXL_EXPORT JxlSignature JxlSignatureCheck(const uint8_t* buf, size_t len); + +/** + * Opaque structure that holds the JPEGXL decoder. + * + * Allocated and initialized with JxlDecoderCreate(). + * Cleaned up and deallocated with JxlDecoderDestroy(). + */ +typedef struct JxlDecoderStruct JxlDecoder; + +/** + * Creates an instance of JxlDecoder and initializes it. + * + * @p memory_manager will be used for all the library dynamic allocations made + * from this instance. The parameter may be NULL, in which case the default + * allocator will be used. See jxl/memory_manager.h for details. + * + * @param memory_manager custom allocator function. It may be NULL. The memory + * manager will be copied internally. + * @return @c NULL if the instance can not be allocated or initialized + * @return pointer to initialized JxlDecoder otherwise + */ +JXL_EXPORT JxlDecoder* JxlDecoderCreate(const JxlMemoryManager* memory_manager); + +/** + * Re-initializes a JxlDecoder instance, so it can be re-used for decoding + * another image. All state and settings are reset as if the object was + * newly created with JxlDecoderCreate, but the memory manager is kept. + * + * @param dec instance to be re-initialized. + */ +JXL_EXPORT void JxlDecoderReset(JxlDecoder* dec); + +/** + * Deinitializes and frees JxlDecoder instance. + * + * @param dec instance to be cleaned up and deallocated.
+ */ +JXL_EXPORT void JxlDecoderDestroy(JxlDecoder* dec); + +/** + * Return value for JxlDecoderProcessInput. + * The values above 0x40 are optional informal events that can be subscribed to, + * they are never returned if they have not been registered with + * JxlDecoderSubscribeEvents. + */ +typedef enum { + /** Function call finished successfully, or decoding is finished and there is + * nothing more to be done. + */ + JXL_DEC_SUCCESS = 0, + + /** An error occurred, for example invalid input file or out of memory. + * TODO(lode): add function to get error information from decoder. + */ + JXL_DEC_ERROR = 1, + + /** The decoder needs more input bytes to continue. Before the next + * JxlDecoderProcessInput call, more input data must be set, by calling + * JxlDecoderReleaseInput (if input was set previously) and then calling + * JxlDecoderSetInput. JxlDecoderReleaseInput returns how many bytes are + * not yet processed, before a next call to JxlDecoderProcessInput all + * unprocessed bytes must be provided again (the address need not match, but + * the contents must), and more bytes must be concatenated after the + * unprocessed bytes. + */ + JXL_DEC_NEED_MORE_INPUT = 2, + + /** The decoder is able to decode a preview image and requests setting a + * preview output buffer using JxlDecoderSetPreviewOutBuffer. This occurs if + * JXL_DEC_PREVIEW_IMAGE is requested and it is possible to decode a preview + * image from the codestream and the preview out buffer was not yet set. There + * is maximum one preview image in a codestream. + */ + JXL_DEC_NEED_PREVIEW_OUT_BUFFER = 3, + + /** The decoder is able to decode a DC image and requests setting a DC output + * buffer using JxlDecoderSetDCOutBuffer. This occurs if JXL_DEC_DC_IMAGE is + * requested and it is possible to decode a DC image from the codestream and + * the DC out buffer was not yet set. This event re-occurs for new frames + * if there are multiple animation frames. 
+ * DEPRECATED: the DC feature in this form will be removed. You can use + * JxlDecoderFlushImage for progressive rendering. + */ + JXL_DEC_NEED_DC_OUT_BUFFER = 4, + + /** The decoder requests an output buffer to store the full resolution image, + * which can be set with JxlDecoderSetImageOutBuffer or with + * JxlDecoderSetImageOutCallback. This event re-occurs for new frames if there + * are multiple animation frames and requires setting an output again. + */ + JXL_DEC_NEED_IMAGE_OUT_BUFFER = 5, + + /** Informative event by JxlDecoderProcessInput: JPEG reconstruction buffer is + * too small for reconstructed JPEG codestream to fit. + * JxlDecoderSetJPEGBuffer must be called again to make room for remaining + * bytes. This event may occur multiple times after + * JXL_DEC_JPEG_RECONSTRUCTION + */ + JXL_DEC_JPEG_NEED_MORE_OUTPUT = 6, + + /** Informative event by JxlDecoderProcessInput: basic information such as + * image dimensions and extra channels. This event occurs max once per image. + */ + JXL_DEC_BASIC_INFO = 0x40, + + /** Informative event by JxlDecoderProcessInput: user extensions of the + * codestream header. This event occurs max once per image and always later + * than JXL_DEC_BASIC_INFO and earlier than any pixel data. + */ + JXL_DEC_EXTENSIONS = 0x80, + + /** Informative event by JxlDecoderProcessInput: color encoding or ICC + * profile from the codestream header. This event occurs max once per image + * and always later than JXL_DEC_BASIC_INFO and earlier than any pixel + * data. + */ + JXL_DEC_COLOR_ENCODING = 0x100, + + /** Informative event by JxlDecoderProcessInput: Preview image, a small + * frame, decoded. This event can only happen if the image has a preview + * frame encoded. This event occurs max once for the codestream and always + * later than JXL_DEC_COLOR_ENCODING and before JXL_DEC_FRAME. + */ + JXL_DEC_PREVIEW_IMAGE = 0x200, + + /** Informative event by JxlDecoderProcessInput: Beginning of a frame. 
+ * JxlDecoderGetFrameHeader can be used at this point. A note on frames: + * a JPEG XL image can have internal frames that are not intended to be + * displayed (e.g. used for compositing a final frame), but this only returns + * displayed frames. A displayed frame either has an animation duration or is + * the only or last frame in the image. This event occurs max once per + * displayed frame, always later than JXL_DEC_COLOR_ENCODING, and always + * earlier than any pixel data. While JPEG XL supports encoding a single frame + * as the composition of multiple internal sub-frames also called frames, this + * event is not indicated for the internal frames. + */ + JXL_DEC_FRAME = 0x400, + + /** Informative event by JxlDecoderProcessInput: DC image, 8x8 sub-sampled + * frame, decoded. It is not guaranteed that the decoder will always return DC + * separately, but when it does it will do so before outputting the full + * frame. JxlDecoderSetDCOutBuffer must be used after getting the basic + * image information to be able to get the DC pixels, if not this return + * status only indicates we're past this point in the codestream. This event + * occurs max once per frame and always later than JXL_DEC_FRAME + * and other header events and earlier than full resolution pixel data. + * DEPRECATED: the DC feature in this form will be removed. You can use + * JxlDecoderFlushImage for progressive rendering. + */ + JXL_DEC_DC_IMAGE = 0x800, + + /** Informative event by JxlDecoderProcessInput: full frame decoded. + * JxlDecoderSetImageOutBuffer must be used after getting the basic image + * information to be able to get the image pixels, if not this return status + * only indicates we're past this point in the codestream. This event occurs + * max once per frame and always later than JXL_DEC_DC_IMAGE. + */ + JXL_DEC_FULL_IMAGE = 0x1000, + + /** Informative event by JxlDecoderProcessInput: JPEG reconstruction data + * decoded.
JxlDecoderSetJPEGBuffer may be used to set a JPEG + * reconstruction buffer after getting the JPEG reconstruction data. If a JPEG + * reconstruction buffer is set a byte stream identical to the JPEG codestream + * used to encode the image will be written to the JPEG reconstruction buffer + * instead of pixels to the image out buffer. This event occurs max once per + * image and always before JXL_DEC_FULL_IMAGE. + */ + JXL_DEC_JPEG_RECONSTRUCTION = 0x2000, +} JxlDecoderStatus; + +/** Rewinds decoder to the beginning. The same input must be given again from + * the beginning of the file and the decoder will emit events from the beginning + * again. When rewinding (as opposed to JxlDecoderReset), the decoder can keep + * state about the image, which it can use to skip to a requested frame more + * efficiently with JxlDecoderSkipFrames. After rewind, + * JxlDecoderSubscribeEvents can be used again, and it is feasible to leave out + * events that were already handled before, such as JXL_DEC_BASIC_INFO and + * JXL_DEC_COLOR_ENCODING, since they will provide the same information as + * before. + * @param dec decoder object + */ +JXL_EXPORT void JxlDecoderRewind(JxlDecoder* dec); + +/** Makes the decoder skip the next `amount` frames. It still needs to process + * the input, but will not output the frame events. It can be more efficient + * when skipping frames, and even more so when using this after + * JxlDecoderRewind. If the decoder is already processing a frame (could + * have emitted JXL_DEC_FRAME but not yet JXL_DEC_FULL_IMAGE), it starts + * skipping from the next frame. If the amount is larger than the amount of + * frames remaining in the image, all remaining frames are skipped. Calling this + * function multiple times adds the amount to skip to the already existing + * amount. 
+ * A frame here is defined as a frame that without skipping emits events such as + * JXL_DEC_FRAME and JXL_DEC_FULL_IMAGE; frames that are internal to the file + * format but are not rendered as part of an animation, or are not the final + * still frame of a still image, are not counted. + * @param dec decoder object + * @param amount the amount of frames to skip + */ +JXL_EXPORT void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount); + +/** + * Get the default pixel format for this decoder. + * + * Requires that the decoder can produce JxlBasicInfo. + * + * @param dec JxlDecoder to query when creating the recommended pixel format. + * @param format JxlPixelFormat to populate with the recommended settings for + * the data loaded into this decoder. + * @return JXL_DEC_SUCCESS if no error, JXL_DEC_NEED_MORE_INPUT if the + * basic info isn't yet available, and JXL_DEC_ERROR otherwise. + */ +JXL_EXPORT JxlDecoderStatus +JxlDecoderDefaultPixelFormat(const JxlDecoder* dec, JxlPixelFormat* format); + +/** + * Set the parallel runner for multithreading. May only be set before starting + * decoding. + * + * @param dec decoder object + * @param parallel_runner function pointer to runner for multithreading. It may + * be NULL to use the default, single-threaded, runner. A multithreaded + * runner should be set to reach fast performance. + * @param parallel_runner_opaque opaque pointer for parallel_runner. + * @return JXL_DEC_SUCCESS if the runner was set, JXL_DEC_ERROR + * otherwise (the previous runner remains set). + */ +JXL_EXPORT JxlDecoderStatus +JxlDecoderSetParallelRunner(JxlDecoder* dec, JxlParallelRunner parallel_runner, + void* parallel_runner_opaque); + +/** + * Returns a hint indicating how many more bytes the decoder is expected to + * need to make JxlDecoderGetBasicInfo available after the next + * JxlDecoderProcessInput call.
This is a suggested large enough value for + * the amount of bytes to provide in the next JxlDecoderSetInput call, but it is + * not guaranteed to be an upper bound nor a lower bound. + * Can be used before the first JxlDecoderProcessInput call, and is correct + * the first time in most cases. If not, JxlDecoderSizeHintBasicInfo can be + * called again to get an updated hint. + * + * @param dec decoder object + * @return the size hint in bytes if the basic info is not yet fully decoded. + * @return 0 when the basic info is already available. + */ +JXL_EXPORT size_t JxlDecoderSizeHintBasicInfo(const JxlDecoder* dec); + +/** Select for which informative events (JXL_DEC_BASIC_INFO, etc...) the + * decoder should return with a status. It is not required to subscribe to any + * events; data can still be requested from the decoder once it is available. + * By default, the decoder is subscribed to no events (events_wanted == 0), and + * the decoder will then only return when it cannot continue because it needs + * more input data or more output buffer. This function may only be called + * before using JxlDecoderProcessInput. + * + * @param dec decoder object + * @param events_wanted bitfield of desired events. + * @return JXL_DEC_SUCCESS if no error, JXL_DEC_ERROR otherwise. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderSubscribeEvents(JxlDecoder* dec, + int events_wanted); + +/** Enables or disables preserving of original orientation. Some images are + * encoded with an orientation tag indicating the image is rotated and/or + * mirrored (here called the original orientation). + * + * *) If keep_orientation is JXL_FALSE (the default): the decoder will perform + * work to undo the transformation. This ensures the decoded pixels will not + * be rotated or mirrored. The decoder will always set the orientation field + * of the JxlBasicInfo to JXL_ORIENT_IDENTITY to match the returned pixel data.
+ * The decoder may also swap xsize and ysize in the JxlBasicInfo compared to the + * values inside of the codestream, to correctly match the decoded pixel data, + * e.g. when a 90 degree rotation was performed. + * + * *) If this option is JXL_TRUE: then the image is returned as-is, which may be + * rotated or mirrored, and the user must check the orientation field in + * JxlBasicInfo after decoding to correctly interpret the decoded pixel data. + * This may be faster to decode since the decoder doesn't have to apply the + * transformation, but can cause wrong display of the image if the orientation + * tag is not correctly taken into account by the user. + * + * By default, this option is disabled, and the decoder automatically corrects + * the orientation. + * + * This function must be called at the beginning, before decoding is performed. + * + * @see JxlBasicInfo for the orientation field, and @see JxlOrientation for the + * possible values. + * + * @param dec decoder object + * @param keep_orientation JXL_TRUE to enable, JXL_FALSE to disable. + * @return JXL_DEC_SUCCESS if no error, JXL_DEC_ERROR otherwise. + */ +JXL_EXPORT JxlDecoderStatus +JxlDecoderSetKeepOrientation(JxlDecoder* dec, JXL_BOOL keep_orientation); + +/** + * Decodes JPEG XL file using the available bytes. Requires input has been + * set with JxlDecoderSetInput. After JxlDecoderProcessInput, input can + * optionally be released with JxlDecoderReleaseInput and then set again to + * next bytes in the stream. JxlDecoderReleaseInput returns how many bytes are + * not yet processed, before a next call to JxlDecoderProcessInput all + * unprocessed bytes must be provided again (the address need not match, but the + * contents must), and more bytes may be concatenated after the unprocessed + * bytes. + * + * The returned status indicates whether the decoder needs more input bytes, or + * more output buffer for a certain type of output data. 
No matter what the + * returned status is (other than JXL_DEC_ERROR), new information, such as + * JxlDecoderGetBasicInfo, may have become available after this call. When + * the return value is not JXL_DEC_ERROR or JXL_DEC_SUCCESS, the decoding + * requires more JxlDecoderProcessInput calls to continue. + * + * @param dec decoder object + * @return JXL_DEC_SUCCESS when decoding finished and all events handled. + * @return JXL_DEC_ERROR when decoding failed, e.g. invalid codestream. + * TODO(lode) document the input data mechanism + * @return JXL_DEC_NEED_MORE_INPUT more input data is necessary. + * @return JXL_DEC_BASIC_INFO when basic info such as image dimensions is + * available and this informative event is subscribed to. + * @return JXL_DEC_EXTENSIONS when JPEG XL codestream user extensions are + * available and this informative event is subscribed to. + * @return JXL_DEC_COLOR_ENCODING when color profile information is + * available and this informative event is subscribed to. + * @return JXL_DEC_PREVIEW_IMAGE when preview pixel information is available and + * output in the preview buffer. + * @return JXL_DEC_DC_IMAGE when DC pixel information (8x8 downscaled version + * of the image) is available and output in the DC buffer. + * @return JXL_DEC_FULL_IMAGE when all pixel information at highest detail is + * available and has been output in the pixel buffer. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec); + +/** + * Sets input data for JxlDecoderProcessInput. The data is owned by the caller + * and may be used by the decoder until JxlDecoderReleaseInput is called or + * the decoder is destroyed or reset so must be kept alive until then. 
+ * @param dec decoder object + * @param data pointer to next bytes to read from + * @param size amount of bytes available starting from data + * @return JXL_DEC_ERROR if input was already set without releasing, + * JXL_DEC_SUCCESS otherwise + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderSetInput(JxlDecoder* dec, + const uint8_t* data, + size_t size); + +/** + * Releases input which was provided with JxlDecoderSetInput. Between + * JxlDecoderProcessInput and JxlDecoderReleaseInput, the user may not alter + * the data in the buffer. Calling JxlDecoderReleaseInput is required whenever + * any input is already set and new input needs to be added with + * JxlDecoderSetInput, but is not required before JxlDecoderDestroy or + * JxlDecoderReset. Calling JxlDecoderReleaseInput when no input is set is + * not an error and returns 0. + * @param dec decoder object + * @return the amount of bytes the decoder has not yet processed that are + * still remaining in the data set by JxlDecoderSetInput, or 0 if no input is + * set or JxlDecoderReleaseInput was already called. For a next call to + * JxlDecoderProcessInput, the buffer must start with these unprocessed bytes. + * This value doesn't provide information about how many bytes the decoder + * truly processed internally or how large the original JPEG XL codestream or + * file are. + */ +JXL_EXPORT size_t JxlDecoderReleaseInput(JxlDecoder* dec); + +/** + * Outputs the basic image information, such as image dimensions, bit depth and + * all other JxlBasicInfo fields, if available. + * + * @param dec decoder object + * @param info struct to copy the information into, or NULL to only check + * whether the information is available through the return value. + * @return JXL_DEC_SUCCESS if the value is available, + * JXL_DEC_NEED_MORE_INPUT if not yet available, JXL_DEC_ERROR in case + * of other error conditions. 
+ */ +JXL_EXPORT JxlDecoderStatus JxlDecoderGetBasicInfo(const JxlDecoder* dec, + JxlBasicInfo* info); + +/** + * Outputs information for extra channel at the given index. The index must be + * smaller than num_extra_channels in the associated JxlBasicInfo. + * + * @param dec decoder object + * @param index index of the extra channel to query. + * @param info struct to copy the information into, or NULL to only check + * whether the information is available through the return value. + * @return JXL_DEC_SUCCESS if the value is available, + * JXL_DEC_NEED_MORE_INPUT if not yet available, JXL_DEC_ERROR in case + * of other error conditions. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelInfo( + const JxlDecoder* dec, size_t index, JxlExtraChannelInfo* info); + +/** + * Outputs name for extra channel at the given index in UTF-8. The index must be + * smaller than num_extra_channels in the associated JxlBasicInfo. The buffer + * for name must have at least name_length + 1 bytes allocated, gotten from + * the associated JxlExtraChannelInfo. + * + * @param dec decoder object + * @param index index of the extra channel to query. + * @param name buffer to copy the name into + * @param size size of the name buffer in bytes + * @return JXL_DEC_SUCCESS if the value is available, + * JXL_DEC_NEED_MORE_INPUT if not yet available, JXL_DEC_ERROR in case + * of other error conditions. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelName(const JxlDecoder* dec, + size_t index, + char* name, + size_t size); + +/** Defines which color profile to get: the profile from the codestream + * metadata header, which represents the color profile of the original image, + * or the color profile from the pixel data received by the decoder. Both are + * the same if the basic info has uses_original_profile set. + */ +typedef enum { + /** Get the color profile of the original image from the metadata. 
+ */ + JXL_COLOR_PROFILE_TARGET_ORIGINAL = 0, + + /** Get the color profile of the pixel data the decoder outputs. */ + JXL_COLOR_PROFILE_TARGET_DATA = 1, +} JxlColorProfileTarget; + +/** + * Outputs the color profile as JPEG XL encoded structured data, if available. + * This is an alternative to an ICC Profile, which can represent a more limited + * amount of color spaces, but represents them exactly through enum values. + * + * It is often possible to use JxlDecoderGetColorAsICCProfile as an + * alternative anyway. The following scenarios are possible: + * - The JPEG XL image has an attached ICC Profile, in that case, the encoded + * structured data is not available, this function will return an error status + * and you must use JxlDecoderGetColorAsICCProfile instead. + * - The JPEG XL image has an encoded structured color profile, and it + * represents an RGB or grayscale color space. This function will return it. + * You can still use JxlDecoderGetColorAsICCProfile as well as an + * alternative if desired, though depending on which RGB color space is + * represented, the ICC profile may be a close approximation. It is also not + * always feasible to deduce from an ICC profile which named color space it + * exactly represents, if any, as it can represent any arbitrary space. + * - The JPEG XL image has an encoded structured color profile, and it indicates + * an unknown or xyb color space. In that case, + * JxlDecoderGetColorAsICCProfile is not available. + * + * If you wish to render the image using a system that supports ICC profiles, + * use JxlDecoderGetColorAsICCProfile first. If you're looking for a specific + * color space possibly indicated in the JPEG XL image, use + * JxlDecoderGetColorAsEncodedProfile first. + * + * @param dec decoder object + * @param format pixel format to output the data to. Only used for + * JXL_COLOR_PROFILE_TARGET_DATA, may be nullptr otherwise. 
+ * @param target whether to get the original color profile from the metadata + * or the color profile of the decoded pixels. + * @param color_encoding struct to copy the information into, or NULL to only + * check whether the information is available through the return value. + * @return JXL_DEC_SUCCESS if the data is available and returned, + * JXL_DEC_NEED_MORE_INPUT if not yet available, JXL_DEC_ERROR in case + * the encoded structured color profile does not exist in the codestream. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile( + const JxlDecoder* dec, const JxlPixelFormat* format, + JxlColorProfileTarget target, JxlColorEncoding* color_encoding); + +/** + * Outputs the size in bytes of the ICC profile returned by + * JxlDecoderGetColorAsICCProfile, if available, or indicates there is none + * available. In most cases, the image will have an ICC profile available, but + * if it does not, JxlDecoderGetColorAsEncodedProfile must be used instead. + * @see JxlDecoderGetColorAsEncodedProfile for more information. The ICC + * profile is either the exact ICC profile attached to the codestream metadata, + * or a close approximation generated from JPEG XL encoded structured data, + * depending on what is encoded in the codestream. + * + * @param dec decoder object + * @param format pixel format to output the data to. Only used for + * JXL_COLOR_PROFILE_TARGET_DATA, may be nullptr otherwise. + * @param target whether to get the original color profile from the metadata + * or the color profile of the decoded pixels. + * @param size variable to output the size into, or NULL to only check the + * return status. + * @return JXL_DEC_SUCCESS if the ICC profile is available, + * JXL_DEC_NEED_MORE_INPUT if the decoder has not yet received enough + * input data to determine whether an ICC profile is available or what its + * size is, JXL_DEC_ERROR in case the ICC profile is not available and + * cannot be generated. 
+ */ +JXL_EXPORT JxlDecoderStatus +JxlDecoderGetICCProfileSize(const JxlDecoder* dec, const JxlPixelFormat* format, + JxlColorProfileTarget target, size_t* size); + +/** + * Outputs ICC profile if available. The profile is only available if + * JxlDecoderGetICCProfileSize returns success. The output buffer must have + * at least as many bytes as given by JxlDecoderGetICCProfileSize. + * + * @param dec decoder object + * @param format pixel format to output the data to. Only used for + * JXL_COLOR_PROFILE_TARGET_DATA, may be nullptr otherwise. + * @param target whether to get the original color profile from the metadata + * or the color profile of the decoded pixels. + * @param icc_profile buffer to copy the ICC profile into + * @param size size of the icc_profile buffer in bytes + * @return JXL_DEC_SUCCESS if the profile is available and was successfully + * returned, JXL_DEC_NEED_MORE_INPUT if not yet available, + * JXL_DEC_ERROR if the profile doesn't exist or the output size is not + * large enough. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderGetColorAsICCProfile( + const JxlDecoder* dec, const JxlPixelFormat* format, + JxlColorProfileTarget target, uint8_t* icc_profile, size_t size); + +/** Sets the color profile to use for JXL_COLOR_PROFILE_TARGET_DATA for the + * special case when the decoder has a choice. This only has effect for a JXL + * image where uses_original_profile is false, and the original color profile is + * encoded as an ICC color profile rather than a JxlColorEncoding with known + * enum values. In most other cases (uses_original_profile is true, or the + * color profile is already given as a JxlColorEncoding), this setting is + * ignored and the decoder uses a profile related to the image. + * No matter what, the JXL_COLOR_PROFILE_TARGET_DATA must still be queried to + * know the actual data format of the decoded pixels after decoding. 
+ * + * The intended use case of this function is for cases where you are using + * a color management system to parse the original ICC color profile + * (JXL_COLOR_PROFILE_TARGET_ORIGINAL), from this you know that the ICC + * profile represents one of the color profiles supported by JxlColorEncoding + * (such as sRGB, PQ or HLG): in that case it is beneficial (but not necessary) + * to use JxlDecoderSetPreferredColorProfile to match the parsed profile. The + * JXL decoder has no color management system built in, but can convert XYB + * color to any of the ones supported by JxlColorEncoding. + * + * Can only be set after the JXL_DEC_COLOR_ENCODING event occurred and before + * any other event occurred, and can affect the result of + * JXL_COLOR_PROFILE_TARGET_DATA (but not of JXL_COLOR_PROFILE_TARGET_ORIGINAL), + * so should be used after getting JXL_COLOR_PROFILE_TARGET_ORIGINAL but before + * getting JXL_COLOR_PROFILE_TARGET_DATA. The color_encoding must be grayscale + * if num_color_channels from the basic info is 1, RGB if num_color_channels + * from the basic info is 3. + * + * If JxlDecoderSetPreferredColorProfile is not used, then for images for which + * uses_original_profile is false and with ICC color profile, the decoder will + * choose linear sRGB for color images, linear grayscale for grayscale images. + * This function only sets a preference, since for other images the decoder has + * no choice what color profile to use, it is determined by the image. + * + * @param dec decoder object + * @param color_encoding the default color encoding to set + * @return JXL_DEC_SUCCESS if the preference was set successfully, JXL_DEC_ERROR + * otherwise. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreferredColorProfile( + JxlDecoder* dec, const JxlColorEncoding* color_encoding); + +/** + * Returns the minimum size in bytes of the preview image output pixel buffer + * for the given format. This is the buffer for JxlDecoderSetPreviewOutBuffer. 
+ * Requires the preview header information is available in the decoder. + * + * @param dec decoder object + * @param format format of pixels + * @param size output value, buffer size in bytes + * @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error, such as + * information not available yet. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size); + +/** + * Sets the buffer to write the small resolution preview image + * to. The size of the buffer must be at least as large as given by + * JxlDecoderPreviewOutBufferSize. The buffer follows the format described by + * JxlPixelFormat. The preview image dimensions are given by the + * JxlPreviewHeader. The buffer is owned by the caller. + * + * @param dec decoder object + * @param format format of pixels. Object owned by user and its contents are + * copied internally. + * @param buffer buffer type to output the pixel data to + * @param size size of buffer in bytes + * @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error, such as + * size too small. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreviewOutBuffer( + JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size); + +/** + * Outputs the information from the frame, such as duration when have_animation. + * This function can be called when JXL_DEC_FRAME occurred for the current + * frame, even when have_animation in the JxlBasicInfo is JXL_FALSE. + * + * @param dec decoder object + * @param header struct to copy the information into, or NULL to only check + * whether the information is available through the return value. + * @return JXL_DEC_SUCCESS if the value is available, + * JXL_DEC_NEED_MORE_INPUT if not yet available, JXL_DEC_ERROR in case + * of other error conditions. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec, + JxlFrameHeader* header); + +/** + * Outputs name for the current frame. 
The buffer + * for name must have at least name_length + 1 bytes allocated, gotten from + * the associated JxlFrameHeader. + * + * @param dec decoder object + * @param name buffer to copy the name into + * @param size size of the name buffer in bytes, including zero termination + * character, so this must be at least JxlFrameHeader.name_length + 1. + * @return JXL_DEC_SUCCESS if the value is available, + * JXL_DEC_NEED_MORE_INPUT if not yet available, JXL_DEC_ERROR in case + * of other error conditions. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec, + char* name, size_t size); + +/** + * Returns the minimum size in bytes of the DC image output buffer + * for the given format. This is the buffer for JxlDecoderSetDCOutBuffer. + * Requires the basic image information is available in the decoder. + * + * @param dec decoder object + * @param format format of pixels + * @param size output value, buffer size in bytes + * @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error, such as + * information not available yet. + * + * DEPRECATED: the DC feature in this form will be removed. You can use + * JxlDecoderFlushImage for progressive rendering. + */ +JXL_EXPORT JXL_DEPRECATED JxlDecoderStatus JxlDecoderDCOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size); + +/** + * Sets the buffer to write the lower resolution (8x8 sub-sampled) DC image + * to. The size of the buffer must be at least as large as given by + * JxlDecoderDCOutBufferSize. The buffer follows the format described by + * JxlPixelFormat. The DC image has dimensions ceil(xsize / 8) * ceil(ysize / + * 8). The buffer is owned by the caller. + * + * @param dec decoder object + * @param format format of pixels. Object owned by user and its contents are + * copied internally. 
+ * @param buffer buffer type to output the pixel data to + * @param size size of buffer in bytes + * @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error, such as + * size too small. + * + * DEPRECATED: the DC feature in this form will be removed. You can use + * JxlDecoderFlushImage for progressive rendering. + */ +JXL_EXPORT JXL_DEPRECATED JxlDecoderStatus JxlDecoderSetDCOutBuffer( + JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size); + +/** + * Returns the minimum size in bytes of the image output pixel buffer for the + * given format. This is the buffer for JxlDecoderSetImageOutBuffer. Requires + * the basic image information is available in the decoder. + * + * @param dec decoder object + * @param format format of the pixels. + * @param size output value, buffer size in bytes + * @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error, such as + * information not available yet. + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size); + +/** + * Sets the buffer to write the full resolution image to. This can be set when + * the JXL_DEC_FRAME event occurs, must be set when the + * JXL_DEC_NEED_IMAGE_OUT_BUFFER event occurs, and applies only for the current + * frame. The size of the buffer must be at least as large as given by + * JxlDecoderImageOutBufferSize. The buffer follows the format described by + * JxlPixelFormat. The buffer is owned by the caller. + * + * @param dec decoder object + * @param format format of the pixels. Object owned by user and its contents + * are copied internally. + * @param buffer buffer type to output the pixel data to + * @param size size of buffer in bytes + * @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error, such as + * size too small. 
+ */ +JXL_EXPORT JxlDecoderStatus JxlDecoderSetImageOutBuffer( + JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size); + +/** + * Callback function type for JxlDecoderSetImageOutCallback. @see + * JxlDecoderSetImageOutCallback for usage. + * + * The callback may be called simultaneously by different threads when using a + * threaded parallel runner, on different pixels. + * + * @param opaque optional user data, as given to JxlDecoderSetImageOutCallback. + * @param x horizontal position of leftmost pixel of the pixel data. + * @param y vertical position of the pixel data. + * @param num_pixels amount of pixels included in the pixel data, horizontally. + * This is not the same as xsize of the full image, it may be smaller. + * @param pixels pixel data as a horizontal stripe, in the format passed to + * JxlDecoderSetImageOutCallback. The memory is not owned by the user, and is + * only valid during the time the callback is running. + */ +typedef void (*JxlImageOutCallback)(void* opaque, size_t x, size_t y, + size_t num_pixels, const void* pixels); + +/** + * Sets pixel output callback. This is an alternative to + * JxlDecoderSetImageOutBuffer. This can be set when the JXL_DEC_FRAME event + * occurs, must be set when the JXL_DEC_NEED_IMAGE_OUT_BUFFER event occurs, and + * applies only for the current frame. Only one of JxlDecoderSetImageOutBuffer + * or JxlDecoderSetImageOutCallback may be used for the same frame, not both at + * the same time. + * + * The callback will be called multiple times, to receive the image + * data in small chunks. The callback receives a horizontal stripe of pixel + * data, 1 pixel high, xsize pixels wide, called a scanline. The xsize here is + * not the same as the full image width, the scanline may be a partial section, + * and xsize may differ between calls. The user can then process and/or copy the + * partial scanline to an image buffer. 
The callback may be called + * simultaneously by different threads when using a threaded parallel runner, on + * different pixels. + * + * If JxlDecoderFlushImage is not used, then each pixel will be visited exactly + * once by the different callback calls, during processing with one or more + * JxlDecoderProcessInput calls. These pixels are decoded to full detail, they + * are not part of a lower resolution or lower quality progressive pass, but the + * final pass. + * + * If JxlDecoderFlushImage is used, then in addition each pixel will be visited + * zero or one times during the blocking JxlDecoderFlushImage call. Pixels + * visited as a result of JxlDecoderFlushImage may represent a lower resolution + * or lower quality intermediate progressive pass of the image. Any visited + * pixel will be of a quality at least as good or better than previous visits of + * this pixel. A pixel may be visited zero times if it cannot be decoded yet + * or if it was already decoded to full precision (this behavior is not + * guaranteed). + * + * @param dec decoder object + * @param format format of the pixels. Object owned by user and its contents + * are copied internally. + * @param callback the callback function receiving partial scanlines of pixel + * data. + * @param opaque optional user data, which will be passed on to the callback, + * may be NULL. + * @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error, such as + * JxlDecoderSetImageOutBuffer already set. + */ +JXL_EXPORT JxlDecoderStatus +JxlDecoderSetImageOutCallback(JxlDecoder* dec, const JxlPixelFormat* format, + JxlImageOutCallback callback, void* opaque); + +/** + * Sets output buffer for reconstructed JPEG codestream. + * + * The data is owned by the caller + * and may be used by the decoder until JxlDecoderReleaseJPEGBuffer is called or + * the decoder is destroyed or reset so must be kept alive until then. 
+ * + * @param dec decoder object + * @param data pointer to next bytes to write to + * @param size amount of bytes available starting from data + * @return JXL_DEC_ERROR if a buffer was already set without releasing, + * JXL_DEC_SUCCESS otherwise + */ +JXL_EXPORT JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec, + uint8_t* data, size_t size); + +/** + * Releases buffer which was provided with JxlDecoderSetJPEGBuffer. + * + * Calling JxlDecoderReleaseJPEGBuffer is required whenever + * a buffer is already set and a new buffer needs to be added with + * JxlDecoderSetJPEGBuffer, but is not required before JxlDecoderDestroy or + * JxlDecoderReset. + * + * Calling JxlDecoderReleaseJPEGBuffer when no buffer is set is + * not an error and returns 0. + * + * @param dec decoder object + * @return the amount of bytes the decoder has not yet written to of the data + * set by JxlDecoderSetJPEGBuffer, or 0 if no buffer is set or + * JxlDecoderReleaseJPEGBuffer was already called. + */ +JXL_EXPORT size_t JxlDecoderReleaseJPEGBuffer(JxlDecoder* dec); + +/* TODO(lode): add way to output extra channels */ + +/** + * Outputs progressive step towards the decoded image so far when only partial + * input was received. If the flush was successful, the buffer set with + * JxlDecoderSetImageOutBuffer will contain partial image data. + * + * Can be called when JxlDecoderProcessInput returns JXL_DEC_NEED_MORE_INPUT, + * after the JXL_DEC_FRAME event already occurred and before the + * JXL_DEC_FULL_IMAGE event occurred for a frame. + * + * @param dec decoder object + * @return JXL_DEC_SUCCESS if image data was flushed to the output buffer, or + * JXL_DEC_ERROR when no flush was done, e.g. if not enough image data was + * available yet even for flush, or no output buffer was set yet. An error is + * not fatal, it only indicates no flushed image is available now; regular + * decoding can still be performed. 
+ */ +JXL_EXPORT JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec); + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_DECODE_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/decode_cxx.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/decode_cxx.h new file mode 100644 index 0000000000..4e7315289c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/decode_cxx.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/// @file decode_cxx.h +/// @brief C++ header-only helper for @ref decode.h. +/// +/// There's no binary library associated with the header since this is a header +/// only library. + +#ifndef JXL_DECODE_CXX_H_ +#define JXL_DECODE_CXX_H_ + +#include + +#include "jxl/decode.h" + +#if !(defined(__cplusplus) || defined(c_plusplus)) +#error "This a C++ only header. Use jxl/decode.h from C sources." +#endif + +/// Struct to call JxlDecoderDestroy from the JxlDecoderPtr unique_ptr. +struct JxlDecoderDestroyStruct { + /// Calls @ref JxlDecoderDestroy() on the passed decoder. + void operator()(JxlDecoder* decoder) { JxlDecoderDestroy(decoder); } +}; + +/// std::unique_ptr<> type that calls JxlDecoderDestroy() when releasing the +/// decoder. +/// +/// Use this helper type from C++ sources to ensure the decoder is destroyed and +/// their internal resources released. +typedef std::unique_ptr JxlDecoderPtr; + +/// Creates an instance of JxlDecoder into a JxlDecoderPtr and initializes it. +/// +/// This function returns a unique_ptr that will call JxlDecoderDestroy() when +/// releasing the pointer. See @ref JxlDecoderCreate for details on the +/// instance creation. +/// +/// @param memory_manager custom allocator function. It may be NULL. The memory +/// manager will be copied internally. 
+/// @return a @c NULL JxlDecoderPtr if the instance can not be allocated or +/// initialized +/// @return initialized JxlDecoderPtr instance otherwise. +static inline JxlDecoderPtr JxlDecoderMake( + const JxlMemoryManager* memory_manager) { + return JxlDecoderPtr(JxlDecoderCreate(memory_manager)); +} + +#endif // JXL_DECODE_CXX_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/encode.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/encode.h new file mode 100644 index 0000000000..3e0882cc76 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/encode.h @@ -0,0 +1,379 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @file encode.h + * @brief Encoding API for JPEG XL. + */ + +#ifndef JXL_ENCODE_H_ +#define JXL_ENCODE_H_ + +#include "jxl/decode.h" +#include "jxl/jxl_export.h" +#include "jxl/memory_manager.h" +#include "jxl/parallel_runner.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** + * Encoder library version. + * + * @return the encoder library version as an integer: + * MAJOR_VERSION * 1000000 + MINOR_VERSION * 1000 + PATCH_VERSION. For example, + * version 1.2.3 would return 1002003. + */ +JXL_EXPORT uint32_t JxlEncoderVersion(void); + +/** + * Opaque structure that holds the JPEG XL encoder. + * + * Allocated and initialized with JxlEncoderCreate(). + * Cleaned up and deallocated with JxlEncoderDestroy(). + */ +typedef struct JxlEncoderStruct JxlEncoder; + +/** + * Opaque structure that holds frame specific encoding options for a JPEG XL + * encoder. + * + * Allocated and initialized with JxlEncoderOptionsCreate(). + * Cleaned up and deallocated when the encoder is destroyed with + * JxlEncoderDestroy(). + */ +typedef struct JxlEncoderOptionsStruct JxlEncoderOptions; + +/** + * Return value for multiple encoder functions. 
+ */ +typedef enum { + /** Function call finished successfully, or encoding is finished and there is + * nothing more to be done. + */ + JXL_ENC_SUCCESS = 0, + + /** An error occurred, for example out of memory. + */ + JXL_ENC_ERROR = 1, + + /** The encoder needs more output buffer to continue encoding. + */ + JXL_ENC_NEED_MORE_OUTPUT = 2, + + /** The encoder doesn't (yet) support this. + */ + JXL_ENC_NOT_SUPPORTED = 3, + +} JxlEncoderStatus; + +/** + * Creates an instance of JxlEncoder and initializes it. + * + * @p memory_manager will be used for all the library dynamic allocations made + * from this instance. The parameter may be NULL, in which case the default + * allocator will be used. See jpegxl/memory_manager.h for details. + * + * @param memory_manager custom allocator function. It may be NULL. The memory + * manager will be copied internally. + * @return @c NULL if the instance can not be allocated or initialized + * @return pointer to initialized JxlEncoder otherwise + */ +JXL_EXPORT JxlEncoder* JxlEncoderCreate(const JxlMemoryManager* memory_manager); + +/** + * Re-initializes a JxlEncoder instance, so it can be re-used for encoding + * another image. All state and settings are reset as if the object was + * newly created with JxlEncoderCreate, but the memory manager is kept. + * + * @param enc instance to be re-initialized. + */ +JXL_EXPORT void JxlEncoderReset(JxlEncoder* enc); + +/** + * Deinitializes and frees JxlEncoder instance. + * + * @param enc instance to be cleaned up and deallocated. + */ +JXL_EXPORT void JxlEncoderDestroy(JxlEncoder* enc); + +/** + * Set the parallel runner for multithreading. May only be set before starting + * encoding. + * + * @param enc encoder object. + * @param parallel_runner function pointer to runner for multithreading. It may + * be NULL to use the default, single-threaded, runner. A multithreaded + * runner should be set to reach fast performance. 
+ * @param parallel_runner_opaque opaque pointer for parallel_runner. + * @return JXL_ENC_SUCCESS if the runner was set, JXL_ENC_ERROR + * otherwise (the previous runner remains set). + */ +JXL_EXPORT JxlEncoderStatus +JxlEncoderSetParallelRunner(JxlEncoder* enc, JxlParallelRunner parallel_runner, + void* parallel_runner_opaque); + +/** + * Encodes JPEG XL file using the available bytes. @p *avail_out indicates how + * many output bytes are available, and @p *next_out points to the output bytes. + * *avail_out will be decremented by the amount of bytes that have been + * processed by the encoder and *next_out will be incremented by the same + * amount, so *next_out will now point at the amount of *avail_out unprocessed + * bytes. + * + * The returned status indicates whether the encoder needs more output bytes. + * When the return value is not JXL_ENC_ERROR or JXL_ENC_SUCCESS, the encoding + * requires more JxlEncoderProcessOutput calls to continue. + * + * @param enc encoder object. + * @param next_out pointer to next bytes to write to. + * @param avail_out amount of bytes available starting from *next_out. + * @return JXL_ENC_SUCCESS when encoding finished and all events handled. + * @return JXL_ENC_ERROR when encoding failed, e.g. invalid input. + * @return JXL_ENC_NEED_MORE_OUTPUT more output buffer is necessary. + */ +JXL_EXPORT JxlEncoderStatus JxlEncoderProcessOutput(JxlEncoder* enc, + uint8_t** next_out, + size_t* avail_out); + +/** + * Sets the buffer to read JPEG encoded bytes from for the next frame to encode. + * + * If JxlEncoderSetBasicInfo has not yet been called, calling + * JxlEncoderAddJPEGFrame will implicitly call it with the parameters of the + * added JPEG frame. + * + * If JxlEncoderSetColorEncoding or JxlEncoderSetICCProfile has not yet been + * called, calling JxlEncoderAddJPEGFrame will implicitly call it with the + * parameters of the added JPEG frame. 
+ * + * If the encoder is set to store JPEG reconstruction metadata using @ref + * JxlEncoderStoreJPEGMetadata and a single JPEG frame is added, it will be + * possible to losslessly reconstruct the JPEG codestream. + * + * @param options set of encoder options to use when encoding the frame. + * @param buffer bytes to read JPEG from. Owned by the caller and its contents + * are copied internally. + * @param size size of buffer in bytes. + * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error + */ +JXL_EXPORT JxlEncoderStatus JxlEncoderAddJPEGFrame( + const JxlEncoderOptions* options, const uint8_t* buffer, size_t size); + +/** + * Sets the buffer to read pixels from for the next image to encode. Must call + * JxlEncoderSetBasicInfo before JxlEncoderAddImageFrame. + * + * Currently only some pixel formats are supported: + * - JXL_TYPE_UINT8 + * - JXL_TYPE_UINT16 + * - JXL_TYPE_FLOAT, with nominal range 0..1 + * + * The color profile of the pixels depends on the value of uses_original_profile + * in the JxlBasicInfo. If true, the pixels are assumed to be encoded in the + * original profile that is set with JxlEncoderSetColorEncoding or + * JxlEncoderSetICCProfile. If false, the pixels are assumed to be nonlinear + * sRGB for integer data types (JXL_TYPE_UINT8 and JXL_TYPE_UINT16), and linear + * sRGB for floating point data types (JXL_TYPE_FLOAT). + * + * @param options set of encoder options to use when encoding the frame. + * @param pixel_format format for pixels. Object owned by the caller and its + * contents are copied internally. + * @param buffer buffer type to input the pixel data from. Owned by the caller + * and its contents are copied internally. + * @param size size of buffer in bytes. 
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error + */ +JXL_EXPORT JxlEncoderStatus JxlEncoderAddImageFrame( + const JxlEncoderOptions* options, const JxlPixelFormat* pixel_format, + const void* buffer, size_t size); + +/** + * Declares that this encoder will not encode anything further. + * + * Must be called between JxlEncoderAddImageFrame/JPEGFrame of the last frame + * and the next call to JxlEncoderProcessOutput, or JxlEncoderProcessOutput + * won't output the last frame correctly. + * + * @param enc encoder object. + */ +JXL_EXPORT void JxlEncoderCloseInput(JxlEncoder* enc); + +/** + * Sets the original color encoding of the image encoded by this encoder. This + * is an alternative to JxlEncoderSetICCProfile and only one of these two must + * be used. This one sets the color encoding as a @ref JxlColorEncoding, while + * the other sets it as ICC binary data. + * + * @param enc encoder object. + * @param color color encoding. Object owned by the caller and its contents are + * copied internally. + * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR or + * JXL_ENC_NOT_SUPPORTED otherwise + */ +JXL_EXPORT JxlEncoderStatus +JxlEncoderSetColorEncoding(JxlEncoder* enc, const JxlColorEncoding* color); + +/** + * Sets the original color encoding of the image encoded by this encoder as an + * ICC color profile. This is an alternative to JxlEncoderSetColorEncoding and + * only one of these two must be used. This one sets the color encoding as ICC + * binary data, while the other defines it as a @ref JxlColorEncoding. + * + * @param enc encoder object. 
+ * @param icc_profile bytes of the original ICC profile + * @param size size of the icc_profile buffer in bytes + * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR or + * JXL_ENC_NOT_SUPPORTED otherwise + */ +JXL_EXPORT JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc, + const uint8_t* icc_profile, + size_t size); + +/** + * Sets the global metadata of the image encoded by this encoder. + * + * @param enc encoder object. + * @param info global image metadata. Object owned by the caller and its + * contents are copied internally. + * @return JXL_ENC_SUCCESS if the operation was successful, + * JXL_ENC_ERROR or JXL_ENC_NOT_SUPPORTED otherwise + */ +JXL_EXPORT JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc, + const JxlBasicInfo* info); + +/** + * Configure the encoder to store JPEG reconstruction metadata in the JPEG XL + * container. + * + * The encoder must be configured to use the JPEG XL container format using @ref + * JxlEncoderUseContainer for this to have any effect. + * + * If this is set to true and a single JPEG frame is added, it will be + * possible to losslessly reconstruct the JPEG codestream. + * + * @param enc encoder object. + * @param store_jpeg_metadata true if the encoder should store JPEG metadata. + * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR + * otherwise. + */ +JXL_EXPORT JxlEncoderStatus +JxlEncoderStoreJPEGMetadata(JxlEncoder* enc, JXL_BOOL store_jpeg_metadata); + +/** + * Configure the encoder to use the JPEG XL container format. + * + * Using the JPEG XL container format allows to store metadata such as JPEG + * reconstruction (@ref JxlEncoderStoreJPEGMetadata) or other metadata like + * EXIF; but it adds a few bytes to the encoded file for container headers even + * if there is no extra metadata. + * + * @param enc encoder object. + * @param use_container true if the encoder should output the JPEG XL container + * format. 
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR + * otherwise. + */ +JXL_EXPORT JxlEncoderStatus JxlEncoderUseContainer(JxlEncoder* enc, + JXL_BOOL use_container); + +/** + * Sets lossless/lossy mode for the provided options. Default is lossy. + * + * @param options set of encoder options to update with the new mode + * @param lossless whether the options should be lossless + * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR + * otherwise. + */ +JXL_EXPORT JxlEncoderStatus +JxlEncoderOptionsSetLossless(JxlEncoderOptions* options, JXL_BOOL lossless); + +/** + * Set the decoding speed tier for the provided options. Minimum is 0 (highest + * quality), and maximum is 4 (lowest quality). Default is 0. + * + * @param options set of encoder options to update with the new decoding speed + * tier. + * @param tier the decoding speed tier to set. + * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR + * otherwise. + */ +JXL_EXPORT JxlEncoderStatus +JxlEncoderOptionsSetDecodingSpeed(JxlEncoderOptions* options, int tier); + +/** + * Sets encoder effort/speed level without affecting decoding speed. Valid + * values are, from faster to slower speed: 3:falcon 4:cheetah 5:hare 6:wombat + * 7:squirrel 8:kitten 9:tortoise Default: squirrel (7). + * + * @param options set of encoder options to update with the new mode. + * @param effort the effort value to set. + * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR + * otherwise. + */ +JXL_EXPORT JxlEncoderStatus +JxlEncoderOptionsSetEffort(JxlEncoderOptions* options, int effort); + +/** + * Sets the distance level for lossy compression: target max butteraugli + * distance, lower = higher quality. Range: 0 .. 15. + * 0.0 = mathematically lossless (however, use JxlEncoderOptionsSetLossless to + * use true lossless). + * 1.0 = visually lossless. + * Recommended range: 0.5 .. 3.0. + * Default value: 1.0. 
+ * If JxlEncoderOptionsSetLossless is used, this value is unused and implied + * to be 0. + * + * @param options set of encoder options to update with the new mode. + * @param distance the distance value to set. + * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR + * otherwise. + */ +JXL_EXPORT JxlEncoderStatus +JxlEncoderOptionsSetDistance(JxlEncoderOptions* options, float distance); + +/** + * Create a new set of encoder options, with all values initially copied from + * the @p source options, or set to default if @p source is NULL. + * + * The returned pointer is an opaque struct tied to the encoder and it will be + * deallocated by the encoder when JxlEncoderDestroy() is called. For functions + * taking both a @ref JxlEncoder and a @ref JxlEncoderOptions, only + * JxlEncoderOptions created with this function for the same encoder instance + * can be used. + * + * @param enc encoder object. + * @param source source options to copy initial values from, or NULL to get + * options initialized to defaults. + * @return the opaque struct pointer identifying a new set of encoder options. + */ +JXL_EXPORT JxlEncoderOptions* JxlEncoderOptionsCreate( + JxlEncoder* enc, const JxlEncoderOptions* source); + +/** + * Sets a color encoding to be sRGB. + * + * @param color_encoding color encoding instance. + * @param is_gray whether the color encoding should be gray scale or color. + */ +JXL_EXPORT void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding, + JXL_BOOL is_gray); + +/** + * Sets a color encoding to be linear sRGB. + * + * @param color_encoding color encoding instance. + * @param is_gray whether the color encoding should be gray scale or color. 
+ */ +JXL_EXPORT void JxlColorEncodingSetToLinearSRGB( + JxlColorEncoding* color_encoding, JXL_BOOL is_gray); + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_ENCODE_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/encode_cxx.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/encode_cxx.h new file mode 100644 index 0000000000..841528f57c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/encode_cxx.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/// @file encode_cxx.h +/// @brief C++ header-only helper for @ref encode.h. +/// +/// There's no binary library associated with the header since this is a header +/// only library. + +#ifndef JXL_ENCODE_CXX_H_ +#define JXL_ENCODE_CXX_H_ + +#include <memory> + +#include "jxl/encode.h" + +#if !(defined(__cplusplus) || defined(c_plusplus)) +#error "This a C++ only header. Use jxl/encode.h from C sources." +#endif + +/// Struct to call JxlEncoderDestroy from the JxlEncoderPtr unique_ptr. +struct JxlEncoderDestroyStruct { + /// Calls @ref JxlEncoderDestroy() on the passed encoder. + void operator()(JxlEncoder* encoder) { JxlEncoderDestroy(encoder); } +}; + +/// std::unique_ptr<> type that calls JxlEncoderDestroy() when releasing the +/// encoder. +/// +/// Use this helper type from C++ sources to ensure the encoder is destroyed and +/// their internal resources released. +typedef std::unique_ptr<JxlEncoder, JxlEncoderDestroyStruct> JxlEncoderPtr; + +/// Creates an instance of JxlEncoder into a JxlEncoderPtr and initializes it. +/// +/// This function returns a unique_ptr that will call JxlEncoderDestroy() when +/// releasing the pointer. See @ref JxlEncoderCreate for details on the +/// instance creation. +/// +/// @param memory_manager custom allocator function. It may be NULL. The memory +/// manager will be copied internally. 
+/// @return a @c NULL JxlEncoderPtr if the instance can not be allocated or +/// initialized +/// @return initialized JxlEncoderPtr instance otherwise. +static inline JxlEncoderPtr JxlEncoderMake( + const JxlMemoryManager* memory_manager) { + return JxlEncoderPtr(JxlEncoderCreate(memory_manager)); +} + +#endif // JXL_ENCODE_CXX_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/memory_manager.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/memory_manager.h new file mode 100644 index 0000000000..30e6f9000d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/memory_manager.h @@ -0,0 +1,67 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @file memory_manager.h + * @brief Abstraction functions used by JPEG XL to allocate memory. + */ + +#ifndef JXL_MEMORY_MANAGER_H_ +#define JXL_MEMORY_MANAGER_H_ + +#include + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** + * Allocating function for a memory region of a given size. + * + * Allocates a contiguous memory region of size @p size bytes. The returned + * memory may not be aligned to a specific size or initialized at all. + * + * @param opaque custom memory manager handle provided by the caller. + * @param size in bytes of the requested memory region. + * @returns @c 0 if the memory can not be allocated, + * @returns pointer to the memory otherwise. + */ +typedef void* (*jpegxl_alloc_func)(void* opaque, size_t size); + +/** + * Deallocating function pointer type. + * + * This function @b MUST do nothing if @p address is @c 0. + * + * @param opaque custom memory manager handle provided by the caller. + * @param address memory region pointer returned by ::jpegxl_alloc_func, or @c 0 + */ +typedef void (*jpegxl_free_func)(void* opaque, void* address); + +/** + * Memory Manager struct. 
+ * These functions, when provided by the caller, will be used to handle memory + * allocations. + */ +typedef struct JxlMemoryManagerStruct { + /** The opaque pointer that will be passed as the first parameter to all the + * functions in this struct. */ + void* opaque; + + /** Memory allocation function. This can be NULL if and only if also the + * free() member in this class is NULL. All dynamic memory will be allocated + * and freed with these functions if they are not NULL. */ + jpegxl_alloc_func alloc; + /** Free function matching the alloc() member. */ + jpegxl_free_func free; + + /* TODO(deymo): Add cache-aligned alloc/free functions here. */ +} JxlMemoryManager; + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_MEMORY_MANAGER_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/parallel_runner.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/parallel_runner.h new file mode 100644 index 0000000000..3411c994d1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/parallel_runner.h @@ -0,0 +1,151 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** + * @file parallel_runner.h + */ + +/** API for running data operations in parallel in a multi-threaded environment. + * This module allows the JPEG XL caller to define their own way of creating and + * assigning threads. + * + * The JxlParallelRunner function type defines a parallel data processing + * runner that may be implemented by the caller to allow the library to process + * in multiple threads. The multi-threaded processing in this library only + * requires to run the same function over each number of a range, possibly + * running each call in a different thread. The JPEG XL caller is responsible + * for implementing this logic using the thread APIs available in their system. 
+ * For convenience, a C++ implementation based on std::thread is provided in + * jpegxl/parallel_runner_thread.h (part of the jpegxl_threads library). + * + * Thread pools usually store small numbers of heterogeneous tasks in a queue. + * When tasks are identical or differ only by an integer input parameter, it is + * much faster to store just one function of an integer parameter and call it + * for each value. Conventional vector-of-tasks can be run in parallel using a + * lambda function adapter that simply calls task_funcs[task]. + * + * If no multi-threading is desired, a @c NULL value of JxlParallelRunner + * will use an internal implementation without multi-threading. + */ + +#ifndef JXL_PARALLEL_RUNNER_H_ +#define JXL_PARALLEL_RUNNER_H_ + +#include +#include + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** Return code used in the JxlParallel* functions as return value. A value + * of 0 means success and any other value means error. The special value + * JXL_PARALLEL_RET_RUNNER_ERROR can be used by the runner to indicate any + * other error. + */ +typedef int JxlParallelRetCode; + +/** + * General error returned by the JxlParallelRunInit function to indicate + * an error. + */ +#define JXL_PARALLEL_RET_RUNNER_ERROR (-1) + +/** + * Parallel run initialization callback. See JxlParallelRunner for details. + * + * This function MUST be called by the JxlParallelRunner only once, on the + * same thread that called JxlParallelRunner, before any parallel execution. + * The purpose of this call is to provide the maximum number of threads that the + * JxlParallelRunner will use, which can be used by JPEG XL to allocate + * per-thread storage if needed. + * + * @param jpegxl_opaque the @p jpegxl_opaque handle provided to + * JxlParallelRunner() must be passed here. + * @param num_threads the maximum number of threads. This value must be + * positive. + * @returns 0 if the initialization process was successful. 
+ * @returns an error code if there was an error, which should be returned by + * JxlParallelRunner(). + */ +typedef JxlParallelRetCode (*JxlParallelRunInit)(void* jpegxl_opaque, + size_t num_threads); + +/** + * Parallel run data processing callback. See JxlParallelRunner for details. + * + * This function MUST be called once for every number in the range [start_range, + * end_range) (including start_range but not including end_range) passing this + * number as the @p value. Calls for different value may be executed from + * different threads in parallel. + * + * @param jpegxl_opaque the @p jpegxl_opaque handle provided to + * JxlParallelRunner() must be passed here. + * @param value the number in the range [start_range, end_range) of the call. + * @param thread_id the thread number where this function is being called from. + * This must be lower than the @p num_threads value passed to + * JxlParallelRunInit. + */ +typedef void (*JxlParallelRunFunction)(void* jpegxl_opaque, uint32_t value, + size_t thread_id); + +/** + * JxlParallelRunner function type. A parallel runner implementation can be + * provided by a JPEG XL caller to allow running computations in multiple + * threads. This function must call the initialization function @p init in the + * same thread that called it and then call the passed @p func once for every + * number in the range [start_range, end_range) (including start_range but not + * including end_range) possibly from different multiple threads in parallel. + * + * The JxlParallelRunner function does not need to be re-entrant. This means + * that the same JxlParallelRunner function with the same runner_opaque + * provided parameter will not be called from the library from either @p init or + * @p func in the same decoder or encoder instance. However, a single decoding + * or encoding instance may call the provided JxlParallelRunner multiple + * times for different parts of the decoding or encoding process. 
+ * + * @returns 0 if the @p init call succeeded (returned 0) and no other error + * occurred in the runner code. + * @returns JXL_PARALLEL_RET_RUNNER_ERROR if an error occurred in the runner + * code, for example, setting up the threads. + * @return the return value of @p init() if non-zero. + */ +typedef JxlParallelRetCode (*JxlParallelRunner)( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range); + +/* The following is an example of a JxlParallelRunner that doesn't use any + * multi-threading. Note that this implementation doesn't store any state + * between multiple calls of the ExampleSequentialRunner function, so the + * runner_opaque value is not used. + + JxlParallelRetCode ExampleSequentialRunner(void* runner_opaque, + void* jpegxl_opaque, + JxlParallelRunInit init, + JxlParallelRunFunction func, + uint32_t start_range, + uint32_t end_range) { + // We only use one thread (the currently running thread). + JxlParallelRetCode init_ret = (*init)(jpegxl_opaque, 1); + if (init_ret != 0) return init_ret; + + // In case of other initialization error (for example when initializing the + // threads) one can return JXL_PARALLEL_RET_RUNNER_ERROR. + + for (uint32_t i = start_range; i < end_range; i++) { + // Every call is in the thread number 0. These don't need to be in any + // order. + (*func)(jpegxl_opaque, i, 0); + } + return 0; + } + */ + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_PARALLEL_RUNNER_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/resizable_parallel_runner.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/resizable_parallel_runner.h new file mode 100644 index 0000000000..88a315dca2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/resizable_parallel_runner.h @@ -0,0 +1,75 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @file resizable_parallel_runner.h + * @brief implementation using std::thread of a resizable ::JxlParallelRunner. + */ + +/** Implementation of JxlParallelRunner that can be used to enable + * multithreading when using the JPEG XL library. This uses std::thread + * internally and related synchronization functions. The number of threads + * created can be changed after creation of the thread pool; the threads + * (including the main thread) are re-used for every + * ResizableParallelRunner::Runner call. Only one concurrent + * JxlResizableParallelRunner call per instance is allowed at a time. + * + * This is a scalable, lower-overhead thread pool runner, especially suitable + * for data-parallel computations in the fork-join model, where clients need to + * know when all tasks have completed. + * + * Compared to the implementation in @ref thread_parallel_runner.h, this + * implementation is tuned for execution on lower-powered systems, including + * for example ARM CPUs with big.LITTLE computation models. + */ + +#ifndef JXL_RESIZABLE_PARALLEL_RUNNER_H_ +#define JXL_RESIZABLE_PARALLEL_RUNNER_H_ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "jxl/jxl_threads_export.h" +#include "jxl/memory_manager.h" +#include "jxl/parallel_runner.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** Parallel runner internally using std::thread. Use as JxlParallelRunner. + */ +JXL_THREADS_EXPORT JxlParallelRetCode JxlResizableParallelRunner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range); + +/** Creates the runner for JxlResizableParallelRunner. Use as the opaque + * runner. The runner will execute tasks on the calling thread until + * @ref JxlResizableParallelRunnerSetThreads is called. 
+ */ +JXL_THREADS_EXPORT void* JxlResizableParallelRunnerCreate( + const JxlMemoryManager* memory_manager); + +/** Changes the number of threads for JxlResizableParallelRunner. + */ +JXL_THREADS_EXPORT void JxlResizableParallelRunnerSetThreads( + void* runner_opaque, size_t num_threads); + +/** Suggests a number of threads to use for an image of given size. + */ +JXL_THREADS_EXPORT uint32_t +JxlResizableParallelRunnerSuggestThreads(uint64_t xsize, uint64_t ysize); + +/** Destroys the runner created by JxlResizableParallelRunnerCreate. + */ +JXL_THREADS_EXPORT void JxlResizableParallelRunnerDestroy(void* runner_opaque); + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_RESIZABLE_PARALLEL_RUNNER_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/resizable_parallel_runner_cxx.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/resizable_parallel_runner_cxx.h new file mode 100644 index 0000000000..54b8b95a57 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/resizable_parallel_runner_cxx.h @@ -0,0 +1,59 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/// @file resizable_parallel_runner_cxx.h +/// @brief C++ header-only helper for @ref resizable_parallel_runner.h. +/// +/// There's no binary library associated with the header since this is a header +/// only library. + +#ifndef JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_ +#define JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_ + +#include + +#include "jxl/resizable_parallel_runner.h" + +#if !(defined(__cplusplus) || defined(c_plusplus)) +#error \ + "This a C++ only header. Use jxl/jxl_resizable_parallel_runner.h from C" \ + "sources." +#endif + +/// Struct to call JxlResizableParallelRunnerDestroy from the +/// JxlResizableParallelRunnerPtr unique_ptr. 
+struct JxlResizableParallelRunnerDestroyStruct { + /// Calls @ref JxlResizableParallelRunnerDestroy() on the passed runner. + void operator()(void* runner) { JxlResizableParallelRunnerDestroy(runner); } +}; + +/// std::unique_ptr<> type that calls JxlResizableParallelRunnerDestroy() when +/// releasing the runner. +/// +/// Use this helper type from C++ sources to ensure the runner is destroyed and +/// their internal resources released. +typedef std::unique_ptr<void, JxlResizableParallelRunnerDestroyStruct> + JxlResizableParallelRunnerPtr; + +/// Creates an instance of JxlResizableParallelRunner into a +/// JxlResizableParallelRunnerPtr and initializes it. +/// +/// This function returns a unique_ptr that will call +/// JxlResizableParallelRunnerDestroy() when releasing the pointer. See @ref +/// JxlResizableParallelRunnerCreate for details on the instance creation. +/// +/// @param memory_manager custom allocator function. It may be NULL. The memory +/// manager will be copied internally. +/// @note Use @ref JxlResizableParallelRunnerSetThreads to add worker threads. +/// @return a @c NULL JxlResizableParallelRunnerPtr if the instance can not be +/// allocated or initialized +/// @return initialized JxlResizableParallelRunnerPtr instance otherwise. +static inline JxlResizableParallelRunnerPtr JxlResizableParallelRunnerMake( + const JxlMemoryManager* memory_manager) { + return JxlResizableParallelRunnerPtr( + JxlResizableParallelRunnerCreate(memory_manager)); +} + +#endif // JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/thread_parallel_runner.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/thread_parallel_runner.h new file mode 100644 index 0000000000..c3d8308e0c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/thread_parallel_runner.h @@ -0,0 +1,69 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. 
+ */ + +/** @file thread_parallel_runner.h + * @brief implementation using std::thread of a ::JxlParallelRunner. + */ + +/** Implementation of JxlParallelRunner that can be used to enable + * multithreading when using the JPEG XL library. This uses std::thread + * internally and related synchronization functions. The number of threads + * created is fixed at construction time and the threads are re-used for every + * ThreadParallelRunner::Runner call. Only one concurrent + * JxlThreadParallelRunner call per instance is allowed at a time. + * + * This is a scalable, lower-overhead thread pool runner, especially suitable + * for data-parallel computations in the fork-join model, where clients need to + * know when all tasks have completed. + * + * This thread pool can efficiently load-balance millions of tasks using an + * atomic counter, thus avoiding per-task virtual or system calls. With 48 + * hyperthreads and 1M tasks that add to an atomic counter, overall runtime is + * 10-20x higher when using std::async, and ~200x for a queue-based thread pool. + */ + +#ifndef JXL_THREAD_PARALLEL_RUNNER_H_ +#define JXL_THREAD_PARALLEL_RUNNER_H_ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "jxl/jxl_threads_export.h" +#include "jxl/memory_manager.h" +#include "jxl/parallel_runner.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** Parallel runner internally using std::thread. Use as JxlParallelRunner. + */ +JXL_THREADS_EXPORT JxlParallelRetCode JxlThreadParallelRunner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range); + +/** Creates the runner for JxlThreadParallelRunner. Use as the opaque + * runner. + */ +JXL_THREADS_EXPORT void* JxlThreadParallelRunnerCreate( + const JxlMemoryManager* memory_manager, size_t num_worker_threads); + +/** Destroys the runner created by JxlThreadParallelRunnerCreate. 
+ */ +JXL_THREADS_EXPORT void JxlThreadParallelRunnerDestroy(void* runner_opaque); + +/** Returns a default num_worker_threads value for + * JxlThreadParallelRunnerCreate. + */ +JXL_THREADS_EXPORT size_t JxlThreadParallelRunnerDefaultNumWorkerThreads(); + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_THREAD_PARALLEL_RUNNER_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/thread_parallel_runner_cxx.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/thread_parallel_runner_cxx.h new file mode 100644 index 0000000000..121c556130 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/thread_parallel_runner_cxx.h @@ -0,0 +1,59 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/// @file thread_parallel_runner_cxx.h +/// @brief C++ header-only helper for @ref thread_parallel_runner.h. +/// +/// There's no binary library associated with the header since this is a header +/// only library. + +#ifndef JXL_THREAD_PARALLEL_RUNNER_CXX_H_ +#define JXL_THREAD_PARALLEL_RUNNER_CXX_H_ + +#include + +#include "jxl/thread_parallel_runner.h" + +#if !(defined(__cplusplus) || defined(c_plusplus)) +#error \ + "This a C++ only header. Use jxl/jxl_thread_parallel_runner.h from C" \ + "sources." +#endif + +/// Struct to call JxlThreadParallelRunnerDestroy from the +/// JxlThreadParallelRunnerPtr unique_ptr. +struct JxlThreadParallelRunnerDestroyStruct { + /// Calls @ref JxlThreadParallelRunnerDestroy() on the passed runner. + void operator()(void* runner) { JxlThreadParallelRunnerDestroy(runner); } +}; + +/// std::unique_ptr<> type that calls JxlThreadParallelRunnerDestroy() when +/// releasing the runner. +/// +/// Use this helper type from C++ sources to ensure the runner is destroyed and +/// their internal resources released. 
+typedef std::unique_ptr<void, JxlThreadParallelRunnerDestroyStruct> + JxlThreadParallelRunnerPtr; + +/// Creates an instance of JxlThreadParallelRunner into a +/// JxlThreadParallelRunnerPtr and initializes it. +/// +/// This function returns a unique_ptr that will call +/// JxlThreadParallelRunnerDestroy() when releasing the pointer. See @ref +/// JxlThreadParallelRunnerCreate for details on the instance creation. +/// +/// @param memory_manager custom allocator function. It may be NULL. The memory +/// manager will be copied internally. +/// @param num_worker_threads the number of worker threads to create. +/// @return a @c NULL JxlThreadParallelRunnerPtr if the instance can not be +/// allocated or initialized +/// @return initialized JxlThreadParallelRunnerPtr instance otherwise. +static inline JxlThreadParallelRunnerPtr JxlThreadParallelRunnerMake( + const JxlMemoryManager* memory_manager, size_t num_worker_threads) { + return JxlThreadParallelRunnerPtr( + JxlThreadParallelRunnerCreate(memory_manager, num_worker_threads)); +} + +#endif // JXL_THREAD_PARALLEL_RUNNER_CXX_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/types.h b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/types.h new file mode 100644 index 0000000000..58ade64347 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/include/jxl/types.h @@ -0,0 +1,116 @@ +/* Copyright (c) the JPEG XL Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ + +/** @file types.h + * @brief Data types for the JPEG XL API, for both encoding and decoding. + */ + +#ifndef JXL_TYPES_H_ +#define JXL_TYPES_H_ + +#include <stddef.h> +#include <stdint.h> + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +/** + * A portable @c bool replacement. + * + * ::JXL_BOOL is a "documentation" type: actually it is @c int, but in API it + * denotes a type, whose only values are ::JXL_TRUE and ::JXL_FALSE. 
+ */ +#define JXL_BOOL int +/** Portable @c true replacement. */ +#define JXL_TRUE 1 +/** Portable @c false replacement. */ +#define JXL_FALSE 0 + +/** Data type for the sample values per channel per pixel. + */ +typedef enum { + /** Use 32-bit single-precision floating point values, with range 0.0-1.0 + * (within gamut, may go outside this range for wide color gamut). Floating + * point output, either JXL_TYPE_FLOAT or JXL_TYPE_FLOAT16, is recommended + * for HDR and wide gamut images when color profile conversion is required. */ + JXL_TYPE_FLOAT = 0, + + /** Use 1-bit packed in uint8_t, first pixel in LSB, padded to uint8_t per + * row. + * TODO(lode): support first in MSB, other padding. + */ + JXL_TYPE_BOOLEAN, + + /** Use type uint8_t. May clip wide color gamut data. + */ + JXL_TYPE_UINT8, + + /** Use type uint16_t. May clip wide color gamut data. + */ + JXL_TYPE_UINT16, + + /** Use type uint32_t. May clip wide color gamut data. + */ + JXL_TYPE_UINT32, + + /** Use 16-bit IEEE 754 half-precision floating point values */ + JXL_TYPE_FLOAT16, +} JxlDataType; + +/** Ordering of multi-byte data. + */ +typedef enum { + /** Use the endianness of the system, either little endian or big endian, + * without forcing either specific endianness. Do not use if pixel data + * should be exported to a well defined format. + */ + JXL_NATIVE_ENDIAN = 0, + /** Force little endian */ + JXL_LITTLE_ENDIAN = 1, + /** Force big endian */ + JXL_BIG_ENDIAN = 2, +} JxlEndianness; + +/** Data type for the sample values per channel per pixel for the output buffer + * for pixels. This is not necessarily the same as the data type encoded in the + * codestream. The channels are interleaved per pixel. The pixels are + * organized row by row, left to right, top to bottom. + * TODO(lode): implement padding / alignment (row stride) + * TODO(lode): support different channel orders if needed (RGB, BGR, ...) + */ +typedef struct { + /** Amount of channels available in a pixel buffer. 
+ * 1: single-channel data, e.g. grayscale + * 2: single-channel + alpha + * 3: trichromatic, e.g. RGB + * 4: trichromatic + alpha + * TODO(lode): this needs finetuning. It is not yet defined how the user + * chooses output color space. CMYK+alpha needs 5 channels. + */ + uint32_t num_channels; + + /** Data type of each channel. + */ + JxlDataType data_type; + + /** Whether multi-byte data types are represented in big endian or little + * endian format. This applies to JXL_TYPE_UINT16, JXL_TYPE_UINT32 + * and JXL_TYPE_FLOAT. + */ + JxlEndianness endianness; + + /** Align scanlines to a multiple of align bytes, or 0 to require no + * alignment at all (which has the same effect as value 1) + */ + size_t align; +} JxlPixelFormat; + +#if defined(__cplusplus) || defined(c_plusplus) +} +#endif + +#endif /* JXL_TYPES_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_context.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_context.h new file mode 100644 index 0000000000..94e5bb7c03 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_context.h @@ -0,0 +1,151 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_AC_CONTEXT_H_ +#define LIB_JXL_AC_CONTEXT_H_ + +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" + +namespace jxl { + +// Block context used for scanning order, number of non-zeros, AC coefficients. +// Equal to the channel. +constexpr uint32_t kDCTOrderContextStart = 0; + +// The number of predicted nonzeros goes from 0 to 1008. We use +// ceil(log2(predicted+1)) as a context for the number of nonzeros, so from 0 to +// 10, inclusive. 
+constexpr uint32_t kNonZeroBuckets = 37;
+
+static const uint16_t kCoeffFreqContext[64] = {
+    0xBAD, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+    15,    15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+    23,    23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26,
+    27,    27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30,
+};
+
+static const uint16_t kCoeffNumNonzeroContext[64] = {
+    0xBAD, 0,   31,  62,  62,  93,  93,  93,  93,  123, 123, 123, 123,
+    152,   152, 152, 152, 152, 152, 152, 152, 180, 180, 180, 180, 180,
+    180,   180, 180, 180, 180, 180, 180, 206, 206, 206, 206, 206, 206,
+    206,   206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
+    206,   206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
+};
+
+static const uint8_t kDefaultCtxMap[39] = {
+    // Default ctx map clusters all the large transforms together.
+    0, 1, 2, 2, 3,  3,  4,  5,  6,  6,  6,  6,  6,   //
+    7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14,  //
+    7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14,  //
+};
+
+// Supremum of ZeroDensityContext(x, y) + 1, when x + y < 64.
+constexpr int kZeroDensityContextCount = 458;
+// Supremum of ZeroDensityContext(x, y) + 1.
+constexpr int kZeroDensityContextLimit = 474;
+
+/* This function is used for entropy-sources pre-clustering.
+ *
+ * Ideally, each combination of |nonzeros_left| and |k| should go to its own
+ * bucket, but that implies (64 * 63 / 2) == 2016 buckets. If there is another
+ * dimension (e.g. block context), the number of primary clusters becomes too
+ * big.
+ *
+ * To solve this problem, |nonzeros_left| and |k| values are clustered. It is
+ * known that their sum is at most 64; consequently, the total number of
+ * buckets is at most A(64) * B(64).
+ */
+// TODO(user): investigate why disabling pre-clustering makes the entropy code
+// less dense. Perhaps we would need to add an HQ clustering algorithm that
+// would be able to squeeze better by spending more CPU cycles.
+static JXL_INLINE size_t ZeroDensityContext(size_t nonzeros_left, size_t k,
+                                            size_t covered_blocks,
+                                            size_t log2_covered_blocks,
+                                            size_t prev) {
+  JXL_DASSERT((1u << log2_covered_blocks) == covered_blocks);
+  nonzeros_left = (nonzeros_left + covered_blocks - 1) >> log2_covered_blocks;
+  k >>= log2_covered_blocks;
+  JXL_DASSERT(k > 0);
+  JXL_DASSERT(k < 64);
+  JXL_DASSERT(nonzeros_left > 0);
+  // Asserting nonzeros_left + k < 65 here causes crashes in debug mode with
+  // invalid input, since the (hot) decoding loop does not check this condition.
+  // As no out-of-bound memory reads are issued even if that condition is
+  // broken, we check this simpler condition which holds anyway. The decoder
+  // will still mark a file in which that condition happens as not valid at the
+  // end of the decoding loop, as `nzeros` will not be `0`.
+  JXL_DASSERT(nonzeros_left < 64);
+  return (kCoeffNumNonzeroContext[nonzeros_left] + kCoeffFreqContext[k]) * 2 +
+         prev;
+}
+
+struct BlockCtxMap {
+  std::vector dc_thresholds[3];
+  std::vector qf_thresholds;
+  std::vector ctx_map;
+  size_t num_ctxs, num_dc_ctxs;
+
+  static_assert(3 * kNumOrders ==
+                    sizeof(kDefaultCtxMap) / sizeof *kDefaultCtxMap,
+                "Update default context map");
+
+  size_t Context(int dc_idx, uint32_t qf, size_t ord, size_t c) const {
+    size_t qf_idx = 0;
+    for (uint32_t t : qf_thresholds) {
+      if (qf > t) qf_idx++;
+    }
+    size_t idx = c < 2 ? c ^ 1 : 2;
+    idx = idx * kNumOrders + ord;
+    idx = idx * (qf_thresholds.size() + 1) + qf_idx;
+    idx = idx * num_dc_ctxs + dc_idx;
+    return ctx_map[idx];
+  }
+  // Offset of the zero-density contexts of `block_ctx`; they are placed after
+  // the num_ctxs * kNonZeroBuckets non-zero contexts (see NumACContexts()).
+  uint32_t ZeroDensityContextsOffset(uint32_t block_ctx) const {
+    return num_ctxs * kNonZeroBuckets + kZeroDensityContextCount * block_ctx;
+  }
+
+  // Context map for AC coefficients consists of 2 blocks:
+  //  |num_ctxs x                : context for number of non-zeros in the block
+  //   kNonZeroBuckets|            computed from block context and predicted
+  //                               value (based on top and left values)
+  //  |num_ctxs x                : context for AC coefficient symbols,
+  //   kZeroDensityContextCount|   computed from block context,
+  //                               number of non-zeros left and
+  //                               index in scan order
+  uint32_t NumACContexts() const {
+    return num_ctxs * (kNonZeroBuckets + kZeroDensityContextCount);
+  }
+
+  // Non-zero context is based on number of non-zeros and block context.
+  // For better clustering, contexts with same number of non-zeros are grouped.
+  inline uint32_t NonZeroContext(uint32_t non_zeros, uint32_t block_ctx) const {
+    uint32_t ctx;
+    if (non_zeros >= 64) non_zeros = 64;
+    if (non_zeros < 8) {
+      ctx = non_zeros;
+    } else {
+      ctx = 4 + non_zeros / 2;
+    }
+    return ctx * num_ctxs + block_ctx;
+  }
+
+  BlockCtxMap() {
+    ctx_map.assign(std::begin(jxl::kDefaultCtxMap),
+                   std::end(jxl::kDefaultCtxMap));
+    num_ctxs = *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
+    num_dc_ctxs = 1;
+  }
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AC_CONTEXT_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc
new file mode 100644
index 0000000000..f262f33319
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.cc
@@ -0,0 +1,110 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +#include "lib/jxl/ac_strategy.h" + +#include + +#include +#include // iota +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +// Tries to generalize zig-zag order to non-square blocks. Surprisingly, in +// square block frequency along the (i + j == const) diagonals is roughly the +// same. For historical reasons, consecutive diagonals are traversed +// in alternating directions - so called "zig-zag" (or "snake") order. +AcStrategy::CoeffOrderAndLut::CoeffOrderAndLut() { + for (size_t s = 0; s < AcStrategy::kNumValidStrategies; s++) { + const AcStrategy acs = AcStrategy::FromRawStrategy(s); + size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + CoefficientLayout(&cy, &cx); + JXL_ASSERT((AcStrategy::CoeffOrderAndLut::kOffset[s + 1] - + AcStrategy::CoeffOrderAndLut::kOffset[s]) == cx * cy); + coeff_order_t* JXL_RESTRICT order_start = + order + AcStrategy::CoeffOrderAndLut::kOffset[s] * kDCTBlockSize; + coeff_order_t* JXL_RESTRICT lut_start = + lut + AcStrategy::CoeffOrderAndLut::kOffset[s] * kDCTBlockSize; + + // CoefficientLayout ensures cx >= cy. + // We compute the zigzag order for a cx x cx block, then discard all the + // lines that are not multiple of the ratio between cx and cy. 
+ size_t xs = cx / cy; + size_t xsm = xs - 1; + size_t xss = CeilLog2Nonzero(xs); + // First half of the block + size_t cur = cx * cy; + for (size_t i = 0; i < cx * kBlockDim; i++) { + for (size_t j = 0; j <= i; j++) { + size_t x = j; + size_t y = i - j; + if (i % 2) std::swap(x, y); + if ((y & xsm) != 0) continue; + y >>= xss; + size_t val = 0; + if (x < cx && y < cy) { + val = y * cx + x; + } else { + val = cur++; + } + lut_start[y * cx * kBlockDim + x] = val; + order_start[val] = y * cx * kBlockDim + x; + } + } + // Second half + for (size_t ip = cx * kBlockDim - 1; ip > 0; ip--) { + size_t i = ip - 1; + for (size_t j = 0; j <= i; j++) { + size_t x = cx * kBlockDim - 1 - (i - j); + size_t y = cx * kBlockDim - 1 - j; + if (i % 2) std::swap(x, y); + if ((y & xsm) != 0) continue; + y >>= xss; + size_t val = cur++; + lut_start[y * cx * kBlockDim + x] = val; + order_start[val] = y * cx * kBlockDim + x; + } + } + } +} + +const AcStrategy::CoeffOrderAndLut* AcStrategy::CoeffOrder() { + static AcStrategy::CoeffOrderAndLut* order = + new AcStrategy::CoeffOrderAndLut(); + return order; +} + +// These definitions are needed before C++17. 
+constexpr size_t AcStrategy::kMaxCoeffBlocks;
+constexpr size_t AcStrategy::kMaxBlockDim;
+constexpr size_t AcStrategy::kMaxCoeffArea;
+constexpr size_t AcStrategy::CoeffOrderAndLut::kOffset[];
+
+AcStrategyImage::AcStrategyImage(size_t xsize, size_t ysize)
+    : layers_(xsize, ysize) {
+  row_ = layers_.Row(0);
+  stride_ = layers_.PixelsPerRow();
+}
+
+size_t AcStrategyImage::CountBlocks(AcStrategy::Type type) const {
+  size_t ret = 0;
+  for (size_t y = 0; y < layers_.ysize(); y++) {
+    const uint8_t* JXL_RESTRICT row = layers_.ConstRow(y);
+    for (size_t x = 0; x < layers_.xsize(); x++) {
+      if (row[x] == ((static_cast(type) << 1) | 1)) ret++;
+    }
+  }
+  return ret;
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.h
new file mode 100644
index 0000000000..b51564594c
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy.h
@@ -0,0 +1,287 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_AC_STRATEGY_H_
+#define LIB_JXL_AC_STRATEGY_H_
+
+#include
+#include
+
+#include  // kMaxVectorSize
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+
+// Defines the different kinds of transforms, and heuristics to choose between
+// them.
+// `AcStrategy` represents what transform should be used, and which sub-block of
+// that transform we are currently in. Note that DCT4x4 is applied on all four
+// 4x4 sub-blocks of an 8x8 block.
+// `AcStrategyImage` defines which strategy should be used for each 8x8 block
+// of the image. Each byte stores the raw strategy shifted left by one bit; the
+// lowest bit is set for the first (top-left) block covered by that strategy.
+
+namespace jxl {
+
+class AcStrategy {
+ public:
+  // Extremal values for the number of blocks/coefficients of a single strategy.
+  static constexpr size_t kMaxCoeffBlocks = 32;
+  static constexpr size_t kMaxBlockDim = kBlockDim * kMaxCoeffBlocks;
+  // Maximum number of coefficients in a block. Guaranteed to be a multiple of
+  // the vector size.
+  static constexpr size_t kMaxCoeffArea = kMaxBlockDim * kMaxBlockDim;
+  static_assert((kMaxCoeffArea * sizeof(float)) % hwy::kMaxVectorSize == 0,
+                "Coefficient area is not a multiple of vector size");
+
+  // Raw strategy types.
+  enum Type : uint32_t {
+    // Regular block size DCT
+    DCT = 0,
+    // Encode pixels without transforming
+    IDENTITY = 1,
+    // Use 2-by-2 DCT
+    DCT2X2 = 2,
+    // Use 4-by-4 DCT
+    DCT4X4 = 3,
+    // Use 16-by-16 DCT
+    DCT16X16 = 4,
+    // Use 32-by-32 DCT
+    DCT32X32 = 5,
+    // Use 16-by-8 DCT
+    DCT16X8 = 6,
+    // Use 8-by-16 DCT
+    DCT8X16 = 7,
+    // Use 32-by-8 DCT
+    DCT32X8 = 8,
+    // Use 8-by-32 DCT
+    DCT8X32 = 9,
+    // Use 32-by-16 DCT
+    DCT32X16 = 10,
+    // Use 16-by-32 DCT
+    DCT16X32 = 11,
+    // 4x8 and 8x4 DCT
+    DCT4X8 = 12,
+    DCT8X4 = 13,
+    // Corner-DCT.
+    AFV0 = 14,
+    AFV1 = 15,
+    AFV2 = 16,
+    AFV3 = 17,
+    // Larger DCTs
+    DCT64X64 = 18,
+    DCT64X32 = 19,
+    DCT32X64 = 20,
+    DCT128X128 = 21,
+    DCT128X64 = 22,
+    DCT64X128 = 23,
+    DCT256X256 = 24,
+    DCT256X128 = 25,
+    DCT128X256 = 26,
+    // Marker for num of valid strategies.
+    kNumValidStrategies
+  };
+
+  static constexpr uint32_t TypeBit(const Type type) {
+    return 1u << static_cast(type);
+  }
+
+  // Returns true if this block is the first 8x8 block (i.e. top-left) of a
+  // possibly multi-block strategy.
+  JXL_INLINE bool IsFirstBlock() const { return is_first_; }
+
+  JXL_INLINE bool IsMultiblock() const {
+    constexpr uint32_t bits =
+        TypeBit(Type::DCT16X16) | TypeBit(Type::DCT32X32) |
+        TypeBit(Type::DCT16X8) | TypeBit(Type::DCT8X16) |
+        TypeBit(Type::DCT32X8) | TypeBit(Type::DCT8X32) |
+        TypeBit(Type::DCT16X32) | TypeBit(Type::DCT32X16) |
+        TypeBit(Type::DCT32X64) | TypeBit(Type::DCT64X32) |
+        TypeBit(Type::DCT64X64) | TypeBit(DCT64X128) | TypeBit(DCT128X64) |
+        TypeBit(DCT128X128) | TypeBit(DCT128X256) | TypeBit(DCT256X128) |
+        TypeBit(DCT256X256);
+    JXL_DASSERT(Strategy() < kNumValidStrategies);
+    return ((1u << static_cast(Strategy())) & bits) != 0;
+  }
+
+  // Returns the raw strategy value. Should only be used for tokenization.
+  JXL_INLINE uint8_t RawStrategy() const {
+    return static_cast(strategy_);
+  }
+
+  JXL_INLINE Type Strategy() const { return strategy_; }
+
+  // Returns true if `raw_strategy` is in [0, kNumValidStrategies).
+  static JXL_INLINE constexpr bool IsRawStrategyValid(int raw_strategy) {
+    return raw_strategy < static_cast(kNumValidStrategies) &&
+           raw_strategy >= 0;
+  }
+  static JXL_INLINE AcStrategy FromRawStrategy(uint8_t raw_strategy) {
+    return FromRawStrategy(static_cast(raw_strategy));
+  }
+  static JXL_INLINE AcStrategy FromRawStrategy(Type raw_strategy) {
+    JXL_DASSERT(IsRawStrategyValid(static_cast(raw_strategy)));
+    return AcStrategy(raw_strategy, /*is_first=*/true);
+  }
+
+  // "Natural order" means the order of increasing of "anisotropic" frequency of
+  // continuous version of DCT basis.
+  // Round-trip, for any given strategy:
+  //  X = NaturalCoeffOrder()[NaturalCoeffOrderLut()[X]]
+  //  X = NaturalCoeffOrderLut()[NaturalCoeffOrder()[X]]
+  JXL_INLINE const coeff_order_t* NaturalCoeffOrder() const {
+    return CoeffOrder()->order +
+           CoeffOrderAndLut::kOffset[RawStrategy()] * kDCTBlockSize;
+  }
+
+  JXL_INLINE const coeff_order_t* NaturalCoeffOrderLut() const {
+    return CoeffOrder()->lut +
+           CoeffOrderAndLut::kOffset[RawStrategy()] * kDCTBlockSize;
+  }
+
+  // Number of 8x8 blocks that this strategy covers horizontally. Looked up
+  // from the strategy type only, so the value is never 0.
+  JXL_INLINE size_t covered_blocks_x() const {
+    static constexpr uint8_t kLut[] = {1, 1, 1, 1,  2, 4,  1,  2,  1,
+                                       4, 2, 4, 1,  1, 1,  1,  1,  1,
+                                       8, 4, 8, 16, 8, 16, 32, 16, 32};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+
+  JXL_INLINE size_t covered_blocks_y() const {
+    static constexpr uint8_t kLut[] = {1, 1, 1, 1,  2,  4, 2,  1,  4,
+                                       1, 4, 2, 1,  1,  1, 1,  1,  1,
+                                       8, 8, 4, 16, 16, 8, 32, 32, 16};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+
+  JXL_INLINE size_t log2_covered_blocks() const {
+    static constexpr uint8_t kLut[] = {0, 0, 0, 0, 2, 4, 1,  1, 2,
+                                       2, 3, 3, 0, 0, 0, 0,  0, 0,
+                                       6, 5, 5, 8, 7, 7, 10, 9, 9};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+
+  struct CoeffOrderAndLut {
+    // Those offsets get multiplied by kDCTBlockSize.
+    // TODO(veluca): reduce this array by merging together the same order type.
+    static constexpr size_t kOffset[kNumValidStrategies + 1] = {
+        0,  1,  2,  3,  4,  8,   24,  26,  28,  32,  36,  44,   52,   53,
+        54, 55, 56, 57, 58, 122, 154, 186, 442, 570, 698, 1722, 2234, 2746,
+    };
+    static constexpr size_t kTotalTableSize =
+        kOffset[kNumValidStrategies] * kDCTBlockSize;
+    coeff_order_t order[kTotalTableSize];
+    coeff_order_t lut[kTotalTableSize];
+
+   private:
+    CoeffOrderAndLut();
+    friend class AcStrategy;
+  };
+
+ private:
+  friend class AcStrategyRow;
+  JXL_INLINE AcStrategy(Type strategy, bool is_first)
+      : strategy_(strategy), is_first_(is_first) {
+    JXL_DASSERT(IsMultiblock() || is_first == true);
+  }
+
+  Type strategy_;
+  bool is_first_;
+
+  static const CoeffOrderAndLut* CoeffOrder();
+};
+
+// Class to use a certain row of the AC strategy.
+class AcStrategyRow {
+ public:
+  explicit AcStrategyRow(const uint8_t* row) : row_(row) {}
+  AcStrategy operator[](size_t x) const {
+    return AcStrategy(static_cast(row_[x] >> 1), row_[x] & 1);
+  }
+
+ private:
+  const uint8_t* JXL_RESTRICT row_;
+};
+
+class AcStrategyImage {
+ public:
+  AcStrategyImage() = default;
+  AcStrategyImage(size_t xsize, size_t ysize);
+  AcStrategyImage(AcStrategyImage&&) = default;
+  AcStrategyImage& operator=(AcStrategyImage&&) = default;
+
+  void FillDCT8(const Rect& rect) {
+    FillPlane((static_cast(AcStrategy::Type::DCT) << 1) | 1,
+              &layers_, rect);
+  }
+  void FillDCT8() { FillDCT8(Rect(layers_)); }
+
+  void FillInvalid() { FillImage(INVALID, &layers_); }
+
+  void Set(size_t x, size_t y, AcStrategy::Type type) {
+#if JXL_ENABLE_ASSERT
+    AcStrategy acs = AcStrategy::FromRawStrategy(type);
+#endif  // JXL_ENABLE_ASSERT
+    JXL_ASSERT(y + acs.covered_blocks_y() <= layers_.ysize());
+    JXL_ASSERT(x + acs.covered_blocks_x() <= layers_.xsize());
+    JXL_CHECK(SetNoBoundsCheck(x, y, type, /*check=*/false));
+  }
+
+  Status SetNoBoundsCheck(size_t x, size_t y, AcStrategy::Type type,
+                          bool check = true) {
+    AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+        size_t pos = (y + iy) * stride_ + x + ix;
+        if (check && row_[pos] != INVALID) {
+          return JXL_FAILURE("Invalid AC strategy: block overlap");
+        }
+        row_[pos] =
+            (static_cast(type) << 1) | ((iy | ix) == 0 ? 1 : 0);
+      }
+    }
+    return true;
+  }
+
+  bool IsValid(size_t x, size_t y) { return row_[y * stride_ + x] != INVALID; }
+
+  AcStrategyRow ConstRow(size_t y, size_t x_prefix = 0) const {
+    return AcStrategyRow(layers_.ConstRow(y) + x_prefix);
+  }
+
+  AcStrategyRow ConstRow(const Rect& rect, size_t y) const {
+    return ConstRow(rect.y0() + y, rect.x0());
+  }
+
+  size_t PixelsPerRow() const { return layers_.PixelsPerRow(); }
+
+  size_t xsize() const { return layers_.xsize(); }
+  size_t ysize() const { return layers_.ysize(); }
+
+  // Count the number of blocks of a given type.
+  size_t CountBlocks(AcStrategy::Type type) const;
+
+ private:
+  ImageB layers_;
+  uint8_t* JXL_RESTRICT row_;
+  size_t stride_;
+
+  // A value that does not represent a valid combined AC strategy
+  // value. Used as a sentinel.
+  static constexpr uint8_t INVALID = 0xFF;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AC_STRATEGY_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy_test.cc
new file mode 100644
index 0000000000..e4ceb88b43
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ac_strategy_test.cc
@@ -0,0 +1,225 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +#include "lib/jxl/ac_strategy.h" + +#include + +#include +#include +#include // HWY_ALIGN_MAX +#include +#include + +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dec_transforms_testonly.h" +#include "lib/jxl/enc_transforms.h" + +namespace jxl { +namespace { + +// Test that DCT -> IDCT is a noop. +class AcStrategyRoundtrip : public ::hwy::TestWithParamTargetAndT { + protected: + void Run() { + const AcStrategy::Type type = static_cast(GetParam()); + const AcStrategy acs = AcStrategy::FromRawStrategy(type); + + auto mem = hwy::AllocateAligned(4 * AcStrategy::kMaxCoeffArea); + float* scratch_space = mem.get(); + float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea; + float* idct = coeffs + AcStrategy::kMaxCoeffArea; + + for (size_t i = 0; i < std::min(1024u, 64u << acs.log2_covered_blocks()); + i++) { + float* input = idct + AcStrategy::kMaxCoeffArea; + std::fill_n(input, AcStrategy::kMaxCoeffArea, 0); + input[i] = 0.2f; + TransformFromPixels(type, input, acs.covered_blocks_x() * 8, coeffs, + scratch_space); + ASSERT_NEAR(coeffs[0], 0.2 / (64 << acs.log2_covered_blocks()), 1e-6) + << " i = " << i; + TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8, + scratch_space); + for (size_t j = 0; j < 64u << acs.log2_covered_blocks(); j++) { + ASSERT_NEAR(idct[j], j == i ? 0.2f : 0, 2e-6) + << "j = " << j << " i = " << i << " acs " << type; + } + } + // Test DC. 
+ std::fill_n(idct, AcStrategy::kMaxCoeffArea, 0); + for (size_t y = 0; y < acs.covered_blocks_y(); y++) { + for (size_t x = 0; x < acs.covered_blocks_x(); x++) { + float* dc = idct + AcStrategy::kMaxCoeffArea; + std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0); + dc[y * acs.covered_blocks_x() * 8 + x] = 0.2; + LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs); + DCFromLowestFrequencies(type, coeffs, idct, acs.covered_blocks_x() * 8); + std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0); + dc[y * acs.covered_blocks_x() * 8 + x] = 0.2; + for (size_t j = 0; j < 64u << acs.log2_covered_blocks(); j++) { + ASSERT_NEAR(idct[j], dc[j], 1e-6) + << "j = " << j << " x = " << x << " y = " << y << " acs " << type; + } + } + } + } +}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T( + AcStrategyRoundtrip, + ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies))); + +TEST_P(AcStrategyRoundtrip, Test) { Run(); } + +// Test that DC(2x2) -> DCT coefficients -> IDCT -> downsampled IDCT is a noop. 
+class AcStrategyRoundtripDownsample + : public ::hwy::TestWithParamTargetAndT { + protected: + void Run() { + const AcStrategy::Type type = static_cast(GetParam()); + const AcStrategy acs = AcStrategy::FromRawStrategy(type); + + auto mem = hwy::AllocateAligned(4 * AcStrategy::kMaxCoeffArea); + float* scratch_space = mem.get(); + float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea; + std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f); + float* idct = coeffs + AcStrategy::kMaxCoeffArea; + + for (size_t y = 0; y < acs.covered_blocks_y(); y++) { + for (size_t x = 0; x < acs.covered_blocks_x(); x++) { + float* dc = idct + AcStrategy::kMaxCoeffArea; + std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0); + dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f; + LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs); + TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8, + scratch_space); + std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f); + std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0); + dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f; + // Downsample + for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) { + for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) { + float sum = 0; + for (size_t iy = 0; iy < 8; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + sum += idct[(dy * 8 + iy) * 8 * acs.covered_blocks_x() + + dx * 8 + ix]; + } + } + sum /= 64.0f; + ASSERT_NEAR(sum, dc[dy * 8 * acs.covered_blocks_x() + dx], 1e-6) + << "acs " << type; + } + } + } + } + } +}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T( + AcStrategyRoundtripDownsample, + ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies))); + +TEST_P(AcStrategyRoundtripDownsample, Test) { Run(); } + +// Test that IDCT(block with zeros in the non-topleft corner) -> downsampled +// IDCT is the same as IDCT -> DC(2x2) of the same block. 
+class AcStrategyDownsample : public ::hwy::TestWithParamTargetAndT { + protected: + void Run() { + const AcStrategy::Type type = static_cast(GetParam()); + const AcStrategy acs = AcStrategy::FromRawStrategy(type); + size_t cx = acs.covered_blocks_y(); + size_t cy = acs.covered_blocks_x(); + CoefficientLayout(&cy, &cx); + + auto mem = hwy::AllocateAligned(4 * AcStrategy::kMaxCoeffArea); + float* scratch_space = mem.get(); + float* idct = scratch_space + AcStrategy::kMaxCoeffArea; + float* idct_acs_downsampled = idct + AcStrategy::kMaxCoeffArea; + + for (size_t y = 0; y < cy; y++) { + for (size_t x = 0; x < cx; x++) { + float* coeffs = idct + AcStrategy::kMaxCoeffArea; + std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0); + coeffs[y * cx * 8 + x] = 0.2f; + TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8, + scratch_space); + std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0); + coeffs[y * cx * 8 + x] = 0.2f; + DCFromLowestFrequencies(type, coeffs, idct_acs_downsampled, + acs.covered_blocks_x() * 8); + // Downsample + for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) { + for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) { + float sum = 0; + for (size_t iy = 0; iy < 8; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + sum += idct[(dy * 8 + iy) * 8 * acs.covered_blocks_x() + + dx * 8 + ix]; + } + } + sum /= 64; + ASSERT_NEAR( + sum, idct_acs_downsampled[dy * 8 * acs.covered_blocks_x() + dx], + 1e-6) + << " acs " << type; + } + } + } + } + } +}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T( + AcStrategyDownsample, + ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies))); + +TEST_P(AcStrategyDownsample, Test) { Run(); } + +class AcStrategyTargetTest : public ::hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(AcStrategyTargetTest); + +TEST_P(AcStrategyTargetTest, RoundtripAFVDCT) { + HWY_ALIGN_MAX float idct[16]; + for (size_t i = 0; i < 16; i++) { + HWY_ALIGN_MAX float pixels[16] = {}; + pixels[i] = 1; + HWY_ALIGN_MAX float 
coeffs[16] = {}; + + AFVDCT4x4(pixels, coeffs); + AFVIDCT4x4(coeffs, idct); + for (size_t j = 0; j < 16; j++) { + EXPECT_NEAR(idct[j], pixels[j], 1e-6); + } + } +} + +TEST_P(AcStrategyTargetTest, BenchmarkAFV) { + const AcStrategy::Type type = AcStrategy::Type::AFV0; + HWY_ALIGN_MAX float pixels[64] = {1}; + HWY_ALIGN_MAX float coeffs[64] = {}; + HWY_ALIGN_MAX float scratch_space[64] = {}; + for (size_t i = 0; i < 1 << 14; i++) { + TransformToPixels(type, coeffs, pixels, 8, scratch_space); + TransformFromPixels(type, pixels, 8, coeffs, scratch_space); + } + EXPECT_NEAR(pixels[0], 0.0, 1E-6); +} + +TEST_P(AcStrategyTargetTest, BenchmarkAFVDCT) { + HWY_ALIGN_MAX float pixels[64] = {1}; + HWY_ALIGN_MAX float coeffs[64] = {}; + for (size_t i = 0; i < 1 << 14; i++) { + AFVDCT4x4(pixels, coeffs); + AFVIDCT4x4(coeffs, pixels); + } + EXPECT_NEAR(pixels[0], 1.0, 1E-6); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/adaptive_reconstruction_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/adaptive_reconstruction_test.cc new file mode 100644 index 0000000000..788bb7cc7f --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/adaptive_reconstruction_test.cc @@ -0,0 +1,184 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include + +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_reconstruct.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/test_utils.h" + +namespace jxl { +namespace { + +const size_t xsize = 16; +const size_t ysize = 8; + +void GenerateFlat(const float background, const float foreground, + std::vector* images) { + for (size_t c = 0; c < Image3F::kNumPlanes; ++c) { + Image3F in(xsize, ysize); + // Plane c = foreground, all others = background. + for (size_t y = 0; y < ysize; ++y) { + float* rows[3] = {in.PlaneRow(0, y), in.PlaneRow(1, y), + in.PlaneRow(2, y)}; + for (size_t x = 0; x < xsize; ++x) { + rows[0][x] = rows[1][x] = rows[2][x] = background; + rows[c][x] = foreground; + } + } + images->push_back(std::move(in)); + } +} + +// Single foreground point at any position in any channel +void GeneratePoints(const float background, const float foreground, + std::vector* images) { + for (size_t c = 0; c < Image3F::kNumPlanes; ++c) { + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + Image3F in(xsize, ysize); + FillImage(background, &in); + in.PlaneRow(c, y)[x] = foreground; + images->push_back(std::move(in)); + } + } + } +} + +void GenerateHorzEdges(const float background, const float foreground, + std::vector* images) { + for (size_t c = 0; c < Image3F::kNumPlanes; ++c) { + // Begin of foreground rows + for (size_t y = 1; y < ysize; ++y) { + Image3F in(xsize, ysize); + FillImage(background, &in); + for (size_t iy = y; iy < ysize; ++iy) { + std::fill(in.PlaneRow(c, iy), in.PlaneRow(c, iy) + xsize, foreground); + } + images->push_back(std::move(in)); + } + } 
+} + +void GenerateVertEdges(const float background, const float foreground, + std::vector* images) { + for (size_t c = 0; c < Image3F::kNumPlanes; ++c) { + // Begin of foreground columns + for (size_t x = 1; x < xsize; ++x) { + Image3F in(xsize, ysize); + FillImage(background, &in); + for (size_t iy = 0; iy < ysize; ++iy) { + float* JXL_RESTRICT row = in.PlaneRow(c, iy); + for (size_t ix = x; ix < xsize; ++ix) { + row[ix] = foreground; + } + } + images->push_back(std::move(in)); + } + } +} + +void DumpTestImage(const char* name, const Image3F& img) { + fprintf(stderr, "Image %s:\n", name); + for (size_t y = 0; y < img.ysize(); ++y) { + const float* row_x = img.ConstPlaneRow(0, y); + const float* row_y = img.ConstPlaneRow(1, y); + const float* row_b = img.ConstPlaneRow(2, y); + for (size_t x = 0; x < img.xsize(); ++x) { + fprintf(stderr, "%5.1f|%5.1f|%5.1f ", row_x[x], row_y[x], row_b[x]); + } + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +// Ensures input remains unchanged by filter - verifies the edge-preserving +// nature of the filter because inputs are piecewise constant. 
+void EnsureUnchanged(const float background, const float foreground,
+                     uint32_t epf_iters) {
+  std::vector<Image3F> images;
+  GenerateFlat(background, foreground, &images);
+  GeneratePoints(background, foreground, &images);
+  GenerateHorzEdges(background, foreground, &images);
+  GenerateVertEdges(background, foreground, &images);
+
+  CodecMetadata metadata;
+  JXL_CHECK(metadata.size.Set(xsize, ysize));
+  metadata.m.xyb_encoded = false;
+  FrameHeader frame_header(&metadata);
+  // Ensure no CT is applied
+  frame_header.color_transform = ColorTransform::kNone;
+  LoopFilter& lf = frame_header.loop_filter;
+  lf.gab = false;
+  lf.epf_iters = epf_iters;
+  FrameDimensions frame_dim = frame_header.ToFrameDimensions();
+
+  jxl::PassesDecoderState state;
+  JXL_CHECK(
+      jxl::InitializePassesSharedState(frame_header, &state.shared_storage));
+  JXL_CHECK(state.Init());
+  state.InitForAC(/*pool=*/nullptr);
+
+  JXL_CHECK(state.filter_weights.Init(lf, frame_dim));
+  FillImage(-0.5f, &state.filter_weights.sigma);
+
+  for (size_t idx_image = 0; idx_image < images.size(); ++idx_image) {
+    const Image3F& in = images[idx_image];
+    state.decoded = CopyImage(in);
+
+    ImageBundle out(&metadata.m);
+    out.SetFromImage(CopyImage(in), ColorEncoding::LinearSRGB());
+    FillImage(-99.f, out.color());  // Initialized with garbage.
+    Image3F padded = PadImageMirror(in, 2 * kBlockDim, 0);
+    // Call with `force_fir` set to true to force to apply filters to all of the
+    // input image.
+    JXL_CHECK(FinalizeFrameDecoding(&out, &state, /*pool=*/nullptr,
+                                    /*force_fir=*/true,
+                                    /*skip_blending=*/true));
+
+#if JXL_HIGH_PRECISION
+    VerifyRelativeError(in, *out.color(), 1E-3, 1E-4);
+#else
+    VerifyRelativeError(in, *out.color(), 1E-2, 1E-2);
+#endif
+    if (testing::Test::HasFatalFailure()) {
+      DumpTestImage("in", in);
+      DumpTestImage("out", *out.color());
+    }
+  }
+}
+
+}  // namespace
+
+class AdaptiveReconstructionTest : public testing::TestWithParam<uint32_t> {};
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(EPFItersGroup, AdaptiveReconstructionTest,
+                                   testing::Values(1, 2, 3),
+                                   testing::PrintToStringParamName());
+
+TEST_P(AdaptiveReconstructionTest, TestBright) {
+  EnsureUnchanged(1.0f, 128.0f, GetParam());
+}
+TEST_P(AdaptiveReconstructionTest, TestDark) {
+  EnsureUnchanged(128.0f, 1.0f, GetParam());
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc
new file mode 100644
index 0000000000..77ac9021d7
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.cc
@@ -0,0 +1,117 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/alpha.h"
+
+#include <string.h>
+
+#include <algorithm>
+
+namespace jxl {
+
+static float Clamp(float x) { return std::max(std::min(1.0f, x), 0.0f); }
+
+void PerformAlphaBlending(const AlphaBlendingInputLayer& bg,
+                          const AlphaBlendingInputLayer& fg,
+                          const AlphaBlendingOutput& out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp) {
+  if (alpha_is_premultiplied) {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fga = clamp ? Clamp(fg.a[x]) : fg.a[x];
+      out.r[x] = (fg.r[x] + bg.r[x] * (1.f - fga));
+      out.g[x] = (fg.g[x] + bg.g[x] * (1.f - fga));
+      out.b[x] = (fg.b[x] + bg.b[x] * (1.f - fga));
+      out.a[x] = (1.f - (1.f - fga) * (1.f - bg.a[x]));
+    }
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fga = clamp ? Clamp(fg.a[x]) : fg.a[x];
+      const float new_a = 1.f - (1.f - fga) * (1.f - bg.a[x]);
+      const float rnew_a = (new_a > 0 ? 1.f / new_a : 0.f);
+      out.r[x] = (fg.r[x] * fga + bg.r[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.g[x] = (fg.g[x] * fga + bg.g[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.b[x] = (fg.b[x] * fga + bg.b[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.a[x] = new_a;
+    }
+  }
+}
+// Single-plane variant of the above. If both layers alias their own alpha
+// plane (bg == bga && fg == fga), `out` receives the blended alpha; otherwise
+// it receives one blended color plane. `clamp` restricts the foreground alpha
+// to [0, 1], exactly as in the multi-plane overload.
+void PerformAlphaBlending(const float* bg, const float* bga, const float* fg,
+                          const float* fga, float* out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp) {
+  if (bg == bga && fg == fga) {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fa = clamp ? Clamp(fga[x]) : fga[x];
+      out[x] = (1.f - (1.f - fa) * (1.f - bga[x]));
+    }
+  } else {
+    if (alpha_is_premultiplied) {
+      for (size_t x = 0; x < num_pixels; ++x) {
+        float fa = clamp ? Clamp(fga[x]) : fga[x];
+        out[x] = (fg[x] + bg[x] * (1.f - fa));
+      }
+    } else {
+      for (size_t x = 0; x < num_pixels; ++x) {
+        float fa = clamp ? Clamp(fga[x]) : fga[x];
+        const float new_a = 1.f - (1.f - fa) * (1.f - bga[x]);
+        const float rnew_a = (new_a > 0 ? 1.f / new_a : 0.f);
+        out[x] = (fg[x] * fa + bg[x] * bga[x] * (1.f - fa)) * rnew_a;
+      }
+    }
+  }
+}
+
+// out = bg + fg * alpha; the alpha plane itself (fg == fga) is copied from bg.
+// The foreground alpha is clamped to [0, 1] only when `clamp` is set.
+void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga,
+                             float* out, size_t num_pixels, bool clamp) {
+  if (fg == fga) {
+    memcpy(out, bg, num_pixels * sizeof(*out));
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      out[x] = bg[x] + fg[x] * (clamp ? Clamp(fga[x]) : fga[x]);
+    }
+  }
+}
+
+void PerformMulBlending(const float* bg, const float* fg, float* out,
+                        size_t num_pixels, bool clamp) {
+  if (clamp) {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      out[x] = bg[x] * Clamp(fg[x]);
+    }
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      out[x] = bg[x] * fg[x];
+    }
+  }
+}
+
+void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                      float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                      size_t num_pixels) {
+  for (size_t x = 0; x < num_pixels; ++x) {
+    const float multiplier = std::max(kSmallAlpha, a[x]);
+    r[x] *= multiplier;
+    g[x] *= multiplier;
+    b[x] *= multiplier;
+  }
+}
+
+void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                        float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                        size_t num_pixels) {
+  for (size_t x = 0; x < num_pixels; ++x) {
+    const float multiplier = 1.f / std::max(kSmallAlpha, a[x]);
+    r[x] *= multiplier;
+    g[x] *= multiplier;
+    b[x] *= multiplier;
+  }
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.h
new file mode 100644
index 0000000000..efb76c800f
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha.h
@@ -0,0 +1,66 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ALPHA_H_
+#define LIB_JXL_ALPHA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// A very small value to avoid divisions by zero when converting to
+// unpremultiplied alpha. Page 21 of the technical introduction to OpenEXR
+// (https://www.openexr.com/documentation/TechnicalIntroduction.pdf) recommends
+// "a power of two" that is "less than half of the smallest positive 16-bit
+// floating-point value". That smallest value happens to be the denormal number
+// 2^-24, so 2^-26 should be a good choice.
+static constexpr float kSmallAlpha = 1.f / (1u << 26u);
+
+struct AlphaBlendingInputLayer {
+  const float* r;
+  const float* g;
+  const float* b;
+  const float* a;
+};
+
+struct AlphaBlendingOutput {
+  float* r;
+  float* g;
+  float* b;
+  float* a;
+};
+
+// Note: The pointers in `out` are allowed to alias those in `bg` or `fg`.
+// No pointer shall be null.
+void PerformAlphaBlending(const AlphaBlendingInputLayer& bg,
+                          const AlphaBlendingInputLayer& fg,
+                          const AlphaBlendingOutput& out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp);
+// Single plane alpha blending
+void PerformAlphaBlending(const float* bg, const float* bga, const float* fg,
+                          const float* fga, float* out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp);
+
+void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga,
+                             float* out, size_t num_pixels, bool clamp);
+
+void PerformMulBlending(const float* bg, const float* fg, float* out,
+                        size_t num_pixels, bool clamp);
+
+void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                      float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                      size_t num_pixels);
+void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                        float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                        size_t num_pixels);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ALPHA_H_
diff --git
a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha_test.cc
new file mode 100644
index 0000000000..d90bbd37d9
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/alpha_test.cc
@@ -0,0 +1,134 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/alpha.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace jxl {
+namespace {
+
+using ::testing::_;
+using ::testing::ElementsAre;
+using ::testing::FloatNear;
+
+TEST(AlphaTest, BlendingWithNonPremultiplied) {
+  const float bg_rgb[3] = {100, 110, 120};
+  const float bg_a = 180.f / 255;
+  const float fg_rgb[3] = {25, 21, 23};
+  const float fg_a = 15420.f / 65535;
+  const float fg_a2 = 2.0f;  // Outside [0, 1]; used with clamp=true below.
+  float out_rgb[3];
+  float out_a;
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/false, /*clamp=*/false);
+  EXPECT_THAT(out_rgb,
+              ElementsAre(FloatNear(77.2f, .05f), FloatNear(83.0f, .05f),
+                          FloatNear(90.6f, .05f)));
+  EXPECT_NEAR(out_a, 3174.f / 4095, 1e-5);
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a2},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/false, /*clamp=*/true);
+  EXPECT_THAT(out_rgb, ElementsAre(FloatNear(fg_rgb[0], .05f),
+                                   FloatNear(fg_rgb[1], .05f),
+                                   FloatNear(fg_rgb[2], .05f)));
+  EXPECT_NEAR(out_a, 1.0f, 1e-5);
+}
+
+TEST(AlphaTest, BlendingWithPremultiplied) {
+  const float bg_rgb[3] = {100, 110, 120};
+  const float bg_a = 180.f / 255;
+  const float fg_rgb[3] = {25, 21, 23};
+  const float fg_a = 15420.f / 65535;
+  const float fg_a2 = 2.0f;  // Outside [0, 1]; used with clamp=true below.
+  float out_rgb[3];
+  float out_a;
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/true, /*clamp=*/false);
+  EXPECT_THAT(out_rgb,
+              ElementsAre(FloatNear(101.5f, .05f), FloatNear(105.1f, .05f),
+                          FloatNear(114.8f, .05f)));
+  EXPECT_NEAR(out_a, 3174.f / 4095, 1e-5);
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a2},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/true, /*clamp=*/true);
+  EXPECT_THAT(out_rgb, ElementsAre(FloatNear(fg_rgb[0], .05f),
+                                   FloatNear(fg_rgb[1], .05f),
+                                   FloatNear(fg_rgb[2], .05f)));
+  EXPECT_NEAR(out_a, 1.0f, 1e-5);
+}
+
+TEST(AlphaTest, Mul) {
+  const float bg = 100;
+  const float fg = 25;
+  float out;
+  PerformMulBlending(&bg, &fg, &out, 1, /*clamp=*/false);
+  EXPECT_THAT(out, FloatNear(fg * bg, .05f));
+  PerformMulBlending(&bg, &fg, &out, 1, /*clamp=*/true);
+  EXPECT_THAT(out, FloatNear(bg, .05f));
+}
+
+TEST(AlphaTest, PremultiplyAndUnpremultiply) {
+  const float alpha[] = {0.f, 63.f / 255, 127.f / 255, 1.f};
+  float r[] = {120, 130, 140, 150};
+  float g[] = {124, 134, 144, 154};
+  float b[] = {127, 137, 147, 157};
+
+  PremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(
+      r, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(130 * 63.f / 255, 1e-5f),
+                     FloatNear(140 * 127.f / 255, 1e-5f), 150));
+  EXPECT_THAT(
+      g, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(134 * 63.f / 255, 1e-5f),
+                     FloatNear(144 * 127.f / 255, 1e-5f), 154));
+  EXPECT_THAT(
+      b, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(137 * 63.f / 255, 1e-5f),
+                     FloatNear(147 * 127.f / 255, 1e-5f), 157));
+
+  UnpremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(FloatNear(120, 1e-4f), FloatNear(130, 1e-4f),
+                             FloatNear(140, 1e-4f), FloatNear(150, 1e-4f)));
+  EXPECT_THAT(g, ElementsAre(FloatNear(124, 1e-4f), FloatNear(134, 1e-4f),
+                             FloatNear(144, 1e-4f), FloatNear(154, 1e-4f)));
+  EXPECT_THAT(b, ElementsAre(FloatNear(127, 1e-4f), FloatNear(137, 1e-4f),
+                             FloatNear(147, 1e-4f), FloatNear(157, 1e-4f)));
+}
+
+TEST(AlphaTest, UnpremultiplyAndPremultiply) {
+  const float alpha[] = {0.f, 63.f / 255, 127.f / 255, 1.f};
+  float r[] = {50, 60, 70, 80};
+  float g[] = {54, 64, 74, 84};
+  float b[] = {57, 67, 77, 87};
+
+  UnpremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(_, FloatNear(60 * 255.f / 63, 1e-4f),
+                             FloatNear(70 * 255.f / 127, 1e-4f), 80));
+  EXPECT_THAT(g, ElementsAre(_, FloatNear(64 * 255.f / 63, 1e-4f),
+                             FloatNear(74 * 255.f / 127, 1e-4f), 84));
+  EXPECT_THAT(b, ElementsAre(_, FloatNear(67 * 255.f / 63, 1e-4f),
+                             FloatNear(77 * 255.f / 127, 1e-4f), 87));
+
+  PremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(FloatNear(50, 1e-4f), FloatNear(60, 1e-4f),
+                             FloatNear(70, 1e-4f), FloatNear(80, 1e-4f)));
+  EXPECT_THAT(g, ElementsAre(FloatNear(54, 1e-4f), FloatNear(64, 1e-4f),
+                             FloatNear(74, 1e-4f), FloatNear(84, 1e-4f)));
+  EXPECT_THAT(b, ElementsAre(FloatNear(57, 1e-4f), FloatNear(67, 1e-4f),
+                             FloatNear(77, 1e-4f), FloatNear(87, 1e-4f)));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc
new file mode 100644
index 0000000000..cc0d58b446
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.cc
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ans_common.h"
+
+#include <numeric>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+std::vector<int32_t> CreateFlatHistogram(int length, int total_count) {
+  JXL_ASSERT(length > 0);
+  JXL_ASSERT(length <= total_count);
+  const int count = total_count / length;
+  std::vector<int32_t> result(length, count);
+  const int rem_counts = total_count % length;
+  for (int i = 0; i < rem_counts; ++i) {
+    ++result[i];
+  }
+  return result;
+}
+
+// First, all trailing non-occurring symbols are removed from the distribution;
+// if this leaves the distribution empty, a dummy symbol with max weight is
+// added. This ensures that the resulting distribution sums to total table size.
+// Then, `entry_size` is chosen to be the largest power of two so that
+// `table_size` = ANS_TAB_SIZE/`entry_size` is at least as big as the
+// distribution size.
+// Note that each entry will only ever contain two different symbols, and
+// consecutive ranges of offsets, which allows us to use a compact
+// representation.
+// Each entry is initialized with only the (symbol=i, offset) pairs; then
+// positions for which the entry overflows (i.e. distribution[i] > entry_size)
+// or is not full are computed, and put into a stack in increasing order.
+// Missing symbols in the distribution are padded with 0 (because `table_size`
+// >= number of symbols). The `cutoff` value for each entry is initialized to
+// the number of occupied slots in that entry (i.e. `distributions[i]`). While
+// the overflowing-symbol stack is not empty (which implies that the
+// underflowing-symbol stack also is not), the top overfull and underfull
+// positions are popped from the stack; the empty slots in the underfull entry
+// are then filled with as many slots as needed from the overfull entry; such
+// slots are placed after the slots in the overfull entry, and `offsets[1]` is
+// computed accordingly.
The formerly underfull entry is thus now neither
+// underfull nor overfull, and represents exactly two symbols. The overfull
+// entry might be either overfull or underfull, and is pushed into the
+// corresponding stack.
+void InitAliasTable(std::vector<int32_t> distribution, uint32_t range,
+                    size_t log_alpha_size, AliasTable::Entry* JXL_RESTRICT a) {
+  while (!distribution.empty() && distribution.back() == 0) {
+    distribution.pop_back();
+  }
+  // Ensure that a valid table is always returned, even for an empty
+  // alphabet. Otherwise, a specially-crafted stream might crash the
+  // decoder.
+  if (distribution.empty()) {
+    distribution.emplace_back(range);
+  }
+  const size_t table_size = 1 << log_alpha_size;
+#if JXL_ENABLE_ASSERT
+  int sum = std::accumulate(distribution.begin(), distribution.end(), 0);
+#endif  // JXL_ENABLE_ASSERT
+  JXL_ASSERT(static_cast<uint32_t>(sum) == range);
+  // range must be a power of two
+  JXL_ASSERT((range & (range - 1)) == 0);
+  JXL_ASSERT(distribution.size() <= table_size);
+  JXL_ASSERT(table_size <= range);
+  const uint32_t entry_size = range >> log_alpha_size;  // this is exact
+  // Special case for single-symbol distributions, that ensures that the state
+  // does not change when decoding from such a distribution. Note that, since we
+  // hardcode offset0 == 0, it is not straightforward (if at all possible) to
+  // fix the general case to produce this result.
+  for (size_t sym = 0; sym < distribution.size(); sym++) {
+    if (distribution[sym] == ANS_TAB_SIZE) {
+      for (size_t i = 0; i < table_size; i++) {
+        a[i].right_value = sym;
+        a[i].cutoff = 0;
+        a[i].offsets1 = entry_size * i;
+        a[i].freq0 = 0;
+        a[i].freq1_xor_freq0 = ANS_TAB_SIZE;
+      }
+      return;
+    }
+  }
+  std::vector<uint32_t> underfull_posn;
+  std::vector<uint32_t> overfull_posn;
+  std::vector<uint32_t> cutoffs(1 << log_alpha_size);
+  // Initialize entries.
+  for (size_t i = 0; i < distribution.size(); i++) {
+    cutoffs[i] = distribution[i];
+    if (cutoffs[i] > entry_size) {
+      overfull_posn.push_back(i);
+    } else if (cutoffs[i] < entry_size) {
+      underfull_posn.push_back(i);
+    }
+  }
+  for (uint32_t i = distribution.size(); i < table_size; i++) {
+    cutoffs[i] = 0;
+    underfull_posn.push_back(i);
+  }
+  // Reassign overflow/underflow values.
+  while (!overfull_posn.empty()) {
+    uint32_t overfull_i = overfull_posn.back();
+    overfull_posn.pop_back();
+    JXL_ASSERT(!underfull_posn.empty());
+    uint32_t underfull_i = underfull_posn.back();
+    underfull_posn.pop_back();
+    uint32_t underfull_by = entry_size - cutoffs[underfull_i];
+    cutoffs[overfull_i] -= underfull_by;
+    // overfull positions have their original symbols
+    a[underfull_i].right_value = overfull_i;
+    a[underfull_i].offsets1 = cutoffs[overfull_i];
+    // Slots in the right part of entry underfull_i were taken from the end
+    // of the symbols in entry overfull_i.
+    if (cutoffs[overfull_i] < entry_size) {
+      underfull_posn.push_back(overfull_i);
+    } else if (cutoffs[overfull_i] > entry_size) {
+      overfull_posn.push_back(overfull_i);
+    }
+  }
+  for (uint32_t i = 0; i < table_size; i++) {
+    // cutoffs[i] is properly initialized but the clang-analyzer doesn't infer
+    // it since it is partially initialized across two for-loops.
+    // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
+    if (cutoffs[i] == entry_size) {
+      a[i].right_value = i;
+      a[i].offsets1 = 0;
+      a[i].cutoff = 0;
+    } else {
+      // Note that, if cutoff is not equal to entry_size,
+      // a[i].offsets1 was initialized with (overfull cutoff) -
+      // (entry_size - a[i].cutoff). Thus, subtracting
+      // a[i].cutoff cannot make it negative.
+      a[i].offsets1 -= cutoffs[i];
+      a[i].cutoff = cutoffs[i];
+    }
+    const size_t freq0 = i < distribution.size() ? distribution[i] : 0;
+    const size_t i1 = a[i].right_value;
+    const size_t freq1 = i1 < distribution.size() ? distribution[i1] : 0;
+    a[i].freq0 = static_cast<uint16_t>(freq0);
+    a[i].freq1_xor_freq0 = static_cast<uint16_t>(freq1 ^ freq0);
+  }
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.h
new file mode 100644
index 0000000000..12ce1eff36
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common.h
@@ -0,0 +1,143 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ANS_COMMON_H_
+#define LIB_JXL_ANS_COMMON_H_
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <hwy/cache_control.h>  // Prefetch
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Returns the precision (number of bits) that should be used to store
+// a histogram count such that Log2Floor(count) == logcount.
+static JXL_INLINE uint32_t GetPopulationCountPrecision(uint32_t logcount,
+                                                       uint32_t shift) {
+  int32_t r = std::min<int>(
+      logcount, int(shift) - int((ANS_LOG_TAB_SIZE - logcount) >> 1));
+  if (r < 0) return 0;
+  return r;
+}
+
+// Returns a histogram where the counts are positive, differ by at most 1,
+// and add up to total_count. The bigger counts (if any) are at the beginning
+// of the histogram.
+std::vector<int32_t> CreateFlatHistogram(int length, int total_count);
+
+// An alias table implements a mapping from the [0, ANS_TAB_SIZE) range into
+// the [0, ANS_MAX_ALPHABET_SIZE) range, satisfying the following conditions:
+// - each symbol occurs as many times as specified by any valid distribution
+// of frequencies of the symbols. A valid distribution here is an array of
+// ANS_MAX_ALPHABET_SIZE that contains numbers in the range [0, ANS_TAB_SIZE],
+// and whose sum is ANS_TAB_SIZE.
+// - lookups can be done in constant time, and also return how many smaller
+// input values map into the same symbol, according to some well-defined order
+// of input values.
+// - the space used by the alias table is given by a small constant times the
+// index of the largest symbol with nonzero probability in the distribution.
+// Each of the entries in the table covers a range of `entry_size` values in the
+// [0, ANS_TAB_SIZE) range; consecutive entries represent consecutive
+// sub-ranges. In the range covered by entry `i`, the first `cutoff` values map
+// to symbol `i`, while the others map to symbol `right_value`.
+//
+// TODO(veluca): consider making the order used for computing offsets easier to
+// define - it is currently defined by the algorithm to compute the alias table.
+// Beware of breaking the implicit assumption that symbols that come after the
+// cutoff value should have an offset at least as big as the cutoff.
+
+struct AliasTable {
+  struct Symbol {
+    size_t value;
+    size_t offset;
+    size_t freq;
+  };
+
+// Working set size matters here (~64 tables x 256 entries).
+// offsets0 is always zero (beginning of [0] side among the same symbol).
+// offsets1 is an offset of (pos >= cutoff) side decremented by cutoff.
+#pragma pack(push, 1)
+  struct Entry {
+    uint8_t cutoff;       // < kEntrySizeMinus1 when used by ANS.
+    uint8_t right_value;  // < alphabet size.
+    uint16_t freq0;
+
+    // Only used if `greater` (see Lookup)
+    uint16_t offsets1;         // <= ANS_TAB_SIZE
+    uint16_t freq1_xor_freq0;  // for branchless ternary in Lookup
+  };
+#pragma pack(pop)
+
+  // Dividing `value` by `entry_size` determines `i`, the entry which is
+  // responsible for the input. If the remainder is below `cutoff`, then the
+  // mapped symbol is `i`; since `offsets[0]` stores the number of occurrences
+  // of `i` "before" the start of this entry, the offset of the input will be
+  // `offsets[0] + remainder`. If the remainder is above cutoff, the mapped
+  // symbol is `right_value`; since `offsets[1]` stores the number of
+  // occurrences of `right_value` "before" this entry, minus the `cutoff` value,
+  // the input offset is then `remainder + offsets[1]`.
+  static JXL_INLINE Symbol Lookup(const Entry* JXL_RESTRICT table, size_t value,
+                                  size_t log_entry_size,
+                                  size_t entry_size_minus_1) {
+    const size_t i = value >> log_entry_size;
+    const size_t pos = value & entry_size_minus_1;
+
+#if JXL_BYTE_ORDER_LITTLE
+    uint64_t entry;
+    memcpy(&entry, &table[i].cutoff, sizeof(entry));
+    const size_t cutoff = entry & 0xFF;              // = MOVZX
+    const size_t right_value = (entry >> 8) & 0xFF;  // = MOVZX
+    const size_t freq0 = (entry >> 16) & 0xFFFF;
+#else
+    // Generates multiple loads with complex addressing.
+    const size_t cutoff = table[i].cutoff;
+    const size_t right_value = table[i].right_value;
+    const size_t freq0 = table[i].freq0;
+#endif
+
+    const bool greater = pos >= cutoff;
+
+#if JXL_BYTE_ORDER_LITTLE
+    const uint64_t conditional = greater ? entry : 0;  // = CMOV
+    const size_t offsets1_or_0 = (conditional >> 32) & 0xFFFF;
+    const size_t freq1_xor_freq0_or_0 = conditional >> 48;
+#else
+    const size_t offsets1_or_0 = greater ? table[i].offsets1 : 0;
+    const size_t freq1_xor_freq0_or_0 = greater ? table[i].freq1_xor_freq0 : 0;
+#endif
+
+    // WARNING: moving this code may interfere with CMOV heuristics.
+    Symbol s;
+    s.value = greater ? right_value : i;
+    s.offset = offsets1_or_0 + pos;
+    s.freq = freq0 ^ freq1_xor_freq0_or_0;  // = greater ? freq1 : freq0
+    // XOR avoids implementation-defined conversion from unsigned to signed.
+    // Alternatives considered: BEXTR is 2 cycles on HSW, SET+shift causes
+    // spills, simple ternary has a long dependency chain.
+
+    return s;
+  }
+
+  static HWY_INLINE void Prefetch(const Entry* JXL_RESTRICT table, size_t value,
+                                  size_t log_entry_size) {
+    const size_t i = value >> log_entry_size;
+    hwy::Prefetch(table + i);
+  }
+};
+
+// Computes an alias table for a given distribution.
+void InitAliasTable(std::vector<int32_t> distribution, uint32_t range,
+                    size_t log_alpha_size, AliasTable::Entry* JXL_RESTRICT a);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ANS_COMMON_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common_test.cc
new file mode 100644
index 0000000000..1960c795ad
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_common_test.cc
@@ -0,0 +1,43 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ans_common.h"
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "lib/jxl/ans_params.h"
+
+namespace jxl {
+namespace {
+
+void VerifyAliasDistribution(const std::vector<int>& distribution,
+                             uint32_t range) {
+  constexpr size_t log_alpha_size = 8;
+  AliasTable::Entry table[1 << log_alpha_size];
+  InitAliasTable(distribution, range, log_alpha_size, table);
+  std::vector<std::vector<int>> offsets(distribution.size());
+  for (uint32_t i = 0; i < range; i++) {
+    AliasTable::Symbol s = AliasTable::Lookup(
+        table, i, ANS_LOG_TAB_SIZE - 8, (1 << (ANS_LOG_TAB_SIZE - 8)) - 1);
+    offsets[s.value].push_back(s.offset);
+  }
+  for (uint32_t i = 0; i < distribution.size(); i++) {
+    ASSERT_EQ(distribution[i], offsets[i].size());
+    std::sort(offsets[i].begin(), offsets[i].end());
+    for (uint32_t j = 0; j < offsets[i].size(); j++) {
+      ASSERT_EQ(offsets[i][j], j);
+    }
+  }
+}
+
+TEST(ANSCommonTest, AliasDistributionSmoke) {
+  VerifyAliasDistribution({ANS_TAB_SIZE / 2, ANS_TAB_SIZE / 2}, ANS_TAB_SIZE);
+  VerifyAliasDistribution({ANS_TAB_SIZE}, ANS_TAB_SIZE);
+  VerifyAliasDistribution({0, 0, 0, ANS_TAB_SIZE, 0}, ANS_TAB_SIZE);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_params.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_params.h
new file mode 100644
index 0000000000..4bbc284c0b
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_params.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ANS_PARAMS_H_
+#define LIB_JXL_ANS_PARAMS_H_
+
+// Common parameters that are needed for both the ANS entropy encoding and
+// decoding methods.
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace jxl {
+
+// TODO(veluca): decide if 12 is the best constant here (valid range is up to
+// 16). This requires recomputing the Huffman tables in {enc,dec}_ans.cc
+// 14 gives a 0.2% improvement at d1 and makes d8 slightly worse. This is
+// likely not worth the increase in encoder complexity.
+#define ANS_LOG_TAB_SIZE 12u
+#define ANS_TAB_SIZE (1 << ANS_LOG_TAB_SIZE)
+#define ANS_TAB_MASK (ANS_TAB_SIZE - 1)
+
+// Largest possible symbol to be encoded by either ANS or prefix coding.
+#define PREFIX_MAX_ALPHABET_SIZE 4096
+#define ANS_MAX_ALPHABET_SIZE 256
+
+// Max number of bits for prefix coding.
+#define PREFIX_MAX_BITS 15
+
+#define ANS_SIGNATURE 0x13  // Initial state, used as CRC.
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ANS_PARAMS_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_test.cc
new file mode 100644
index 0000000000..808c5f3aaa
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/ans_test.cc
@@ -0,0 +1,280 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <random>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+namespace {
+
+void RoundtripTestcase(int n_histograms, int alphabet_size,
+                       const std::vector<Token>& input_values) {
+  constexpr uint16_t kMagic1 = 0x9e33;
+  constexpr uint16_t kMagic2 = 0x8b04;
+
+  BitWriter writer;
+  // Space for magic bytes.
+  BitWriter::Allotment allotment_magic1(&writer, 16);
+  writer.Write(16, kMagic1);
+  ReclaimAndCharge(&writer, &allotment_magic1, 0, nullptr);
+
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  std::vector<std::vector<Token>> input_values_vec;
+  input_values_vec.push_back(input_values);
+
+  BuildAndEncodeHistograms(HistogramParams(), n_histograms, input_values_vec,
+                           &codes, &context_map, &writer, 0, nullptr);
+  WriteTokens(input_values_vec[0], codes, context_map, &writer, 0, nullptr);
+
+  // Magic bytes + padding
+  BitWriter::Allotment allotment_magic2(&writer, 24);
+  writer.Write(16, kMagic2);
+  writer.ZeroPadToByte();
+  ReclaimAndCharge(&writer, &allotment_magic2, 0, nullptr);
+
+  // We do not truncate the output. Reading past the end reads out zeroes
+  // anyway.
+  BitReader br(writer.GetSpan());
+
+  ASSERT_EQ(br.ReadBits(16), kMagic1);
+
+  std::vector<uint8_t> dec_context_map;
+  ANSCode decoded_codes;
+  ASSERT_TRUE(
+      DecodeHistograms(&br, n_histograms, &decoded_codes, &dec_context_map));
+  ASSERT_EQ(dec_context_map, context_map);
+  ANSSymbolReader reader(&decoded_codes, &br);
+
+  for (const Token& symbol : input_values) {
+    uint32_t read_symbol =
+        reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+    ASSERT_EQ(read_symbol, symbol.value);
+  }
+  ASSERT_TRUE(reader.CheckANSFinalState());
+
+  ASSERT_EQ(br.ReadBits(16), kMagic2);
+  EXPECT_TRUE(br.Close());
+}
+
+TEST(ANSTest, EmptyRoundtrip) {
+  RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, std::vector<Token>());
+}
+
+TEST(ANSTest, SingleSymbolRoundtrip) {
+  for (uint32_t i = 0; i < ANS_MAX_ALPHABET_SIZE; i++) {
+    RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, {{0, i}});
+  }
+  for (uint32_t i = 0; i < ANS_MAX_ALPHABET_SIZE; i++) {
+    RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE,
+                      std::vector<Token>(1024, {0, i}));
+  }
+}
+
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+constexpr size_t kReps = 10;
+#else
+constexpr size_t kReps = 100;
+#endif
+
+void RoundtripRandomStream(int alphabet_size, size_t reps = kReps,
+                           size_t num = 1 << 18) {
+  constexpr int kNumHistograms = 3;
+  std::mt19937_64 rng;
+  for (size_t i = 0; i < reps; i++) {
+    std::vector<Token> symbols;
+    for (size_t j = 0; j < num; j++) {
+      int context = std::uniform_int_distribution<>(0, kNumHistograms - 1)(rng);
+      int value = std::uniform_int_distribution<>(0, alphabet_size - 1)(rng);
+      symbols.emplace_back(context, value);
+    }
+    RoundtripTestcase(kNumHistograms, alphabet_size, symbols);
+  }
+}
+
+void RoundtripRandomUnbalancedStream(int alphabet_size) {
+  constexpr int kNumHistograms = 3;
+  constexpr int kPrecision = 1 << 10;
+  std::mt19937_64 rng;
+  for (int i = 0; i < 100; i++) {
+    std::vector<int> distributions[kNumHistograms];
+    for (int j = 0; j < kNumHistograms; j++) {
+      distributions[j].resize(kPrecision);
+      int symbol = 0;
+      int remaining = 1;
+      for (int k = 0; k < kPrecision; k++) {
+        if (remaining == 0) {
+          if (symbol < alphabet_size - 1) symbol++;
+          // There is no meaning behind this distribution: it's anything that
+          // will create a nonuniform distribution and won't have too few
+          // symbols usually. Also we want different distributions we get to be
+          // sufficiently dissimilar.
+          remaining =
+              std::uniform_int_distribution<>(0, (kPrecision - k) / 1)(rng);
+        }
+        distributions[j][k] = symbol;
+        remaining--;
+      }
+    }
+    std::vector<Token> symbols;
+    for (int j = 0; j < 1 << 18; j++) {
+      int context = std::uniform_int_distribution<>(0, kNumHistograms - 1)(rng);
+      int value = distributions[context][std::uniform_int_distribution<>(
+          0, kPrecision - 1)(rng)];
+      symbols.emplace_back(context, value);
+    }
+    RoundtripTestcase(kNumHistograms + 1, alphabet_size, symbols);
+  }
+}
+
+TEST(ANSTest, RandomStreamRoundtrip3Small) { RoundtripRandomStream(3, 1, 16); }
+
+TEST(ANSTest, RandomStreamRoundtrip3) { RoundtripRandomStream(3); }
+
+TEST(ANSTest, RandomStreamRoundtripBig) {
+  RoundtripRandomStream(ANS_MAX_ALPHABET_SIZE);
+}
+
+TEST(ANSTest, RandomUnbalancedStreamRoundtrip3) {
+  RoundtripRandomUnbalancedStream(3);
+}
+
+TEST(ANSTest, RandomUnbalancedStreamRoundtripBig) {
+  RoundtripRandomUnbalancedStream(ANS_MAX_ALPHABET_SIZE);
+}
+
+TEST(ANSTest, UintConfigRoundtrip) {
+  for (size_t log_alpha_size = 5; log_alpha_size <= 8; log_alpha_size++) {
+    std::vector<HybridUintConfig> uint_config, uint_config_dec;
+    for (size_t i = 0; i < log_alpha_size; i++) {
+      for (size_t j = 0; j <= i; j++) {
+        for (size_t k = 0; k <= i - j; k++) {
+          uint_config.emplace_back(i, j, k);
+        }
+      }
+    }
+    uint_config.emplace_back(log_alpha_size, 0, 0);
+    uint_config_dec.resize(uint_config.size());
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, 10 * uint_config.size());
+    EncodeUintConfigs(uint_config, &writer, log_alpha_size);
+    ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+    writer.ZeroPadToByte();
+    BitReader br(writer.GetSpan());
+    EXPECT_TRUE(DecodeUintConfigs(log_alpha_size, &uint_config_dec, &br));
+    EXPECT_TRUE(br.Close());
+    for (size_t i = 0; i < uint_config.size(); i++) {
+      EXPECT_EQ(uint_config[i].split_token, uint_config_dec[i].split_token);
+      EXPECT_EQ(uint_config[i].msb_in_token, uint_config_dec[i].msb_in_token);
+      EXPECT_EQ(uint_config[i].lsb_in_token, uint_config_dec[i].lsb_in_token);
+    }
+  }
+}
+
+void TestCheckpointing(bool ans, bool lz77) {
+  std::vector<std::vector<Token>> input_values(1);
+  for (size_t i = 0; i < 1024; i++) {
+    input_values[0].push_back(Token(0, i % 4));
+  }
+  // up to lz77 window size.
+  for (size_t i = 0; i < (1 << 20) - 1022; i++) {
+    input_values[0].push_back(Token(0, (i % 5) + 4));
+  }
+  // Ensure that when the window wraps around, new values are different.
+  input_values[0].push_back(Token(0, 0));
+  for (size_t i = 0; i < 1024; i++) {
+    input_values[0].push_back(Token(0, i % 4));
+  }
+
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  HistogramParams params;
+  params.lz77_method = lz77 ? HistogramParams::LZ77Method::kLZ77
+                            : HistogramParams::LZ77Method::kNone;
+  params.force_huffman = !ans;
+
+  BitWriter writer;
+  {
+    auto input_values_copy = input_values;
+    BuildAndEncodeHistograms(params, 1, input_values_copy, &codes, &context_map,
+                             &writer, 0, nullptr);
+    WriteTokens(input_values_copy[0], codes, context_map, &writer, 0, nullptr);
+    writer.ZeroPadToByte();
+  }
+
+  // We do not truncate the output. Reading past the end reads out zeroes
+  // anyway.
+  BitReader br(writer.GetSpan());
+  Status status = true;
+  {
+    BitReaderScopedCloser bc(&br, &status);
+
+    std::vector<uint8_t> dec_context_map;
+    ANSCode decoded_codes;
+    ASSERT_TRUE(DecodeHistograms(&br, 1, &decoded_codes, &dec_context_map));
+    ASSERT_EQ(dec_context_map, context_map);
+    ANSSymbolReader reader(&decoded_codes, &br);
+
+    ANSSymbolReader::Checkpoint checkpoint;
+    size_t br_pos = 0;  // Set at i == 0, before the first restore reads it.
+    constexpr size_t kInterval = ANSSymbolReader::kMaxCheckpointInterval - 2;
+    for (size_t i = 0; i < input_values[0].size(); i++) {
+      if (i % kInterval == 0 && i > 0) {
+        reader.Restore(checkpoint);
+        ASSERT_TRUE(br.Close());
+        br = BitReader(writer.GetSpan());
+        br.SkipBits(br_pos);
+        for (size_t j = i - kInterval; j < i; j++) {
+          Token symbol = input_values[0][j];
+          uint32_t read_symbol =
+              reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+          ASSERT_EQ(read_symbol, symbol.value) << "j = " << j;
+        }
+      }
+      if (i % kInterval == 0) {
+        reader.Save(&checkpoint);
+        br_pos = br.TotalBitsConsumed();
+      }
+      Token symbol = input_values[0][i];
+      uint32_t read_symbol =
+          reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+      ASSERT_EQ(read_symbol, symbol.value) << "i = " << i;
+    }
+    ASSERT_TRUE(reader.CheckANSFinalState());
+  }
+  EXPECT_TRUE(status);
+}
+
+TEST(ANSTest, TestCheckpointingANS) {
+  TestCheckpointing(/*ans=*/true, /*lz77=*/false);
+}
+
+TEST(ANSTest, TestCheckpointingPrefix) {
+  TestCheckpointing(/*ans=*/false, /*lz77=*/false);
+}
+
+TEST(ANSTest, TestCheckpointingANSLZ77) {
+  TestCheckpointing(/*ans=*/true, /*lz77=*/true);
+}
+
+TEST(ANSTest, TestCheckpointingPrefixLZ77) {
+  TestCheckpointing(/*ans=*/false, /*lz77=*/true);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc
new file mode 100644
index 0000000000..e83140d50b
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.cc
@@ -0,0 +1,96 @@
+// Copyright (c) the JPEG XL Project
Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/aux_out.h" + +#include + +#include // accumulate + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +void AuxOut::Print(size_t num_inputs) const { + if (num_inputs == 0) return; + + LayerTotals all_layers; + for (size_t i = 0; i < layers.size(); ++i) { + all_layers.Assimilate(layers[i]); + } + + printf("Average butteraugli iters: %10.2f\n", + num_butteraugli_iters * 1.0 / num_inputs); + + for (size_t i = 0; i < layers.size(); ++i) { + if (layers[i].total_bits != 0) { + printf("Total layer bits %-10s\t", LayerName(i)); + printf("%10f%%", 100.0 * layers[i].total_bits / all_layers.total_bits); + layers[i].Print(num_inputs); + } + } + printf("Total image size "); + all_layers.Print(num_inputs); + + const uint32_t dc_pred_total = + std::accumulate(dc_pred_usage.begin(), dc_pred_usage.end(), 0u); + const uint32_t dc_pred_total_xb = + std::accumulate(dc_pred_usage_xb.begin(), dc_pred_usage_xb.end(), 0u); + if (dc_pred_total + dc_pred_total_xb != 0) { + printf("\nDC pred Y XB:\n"); + for (size_t i = 0; i < dc_pred_usage.size(); ++i) { + printf(" %6u (%5.2f%%) %6u (%5.2f%%)\n", dc_pred_usage[i], + 100.0 * dc_pred_usage[i] / dc_pred_total, dc_pred_usage_xb[i], + 100.0 * dc_pred_usage_xb[i] / dc_pred_total_xb); + } + } + + size_t total_blocks = 0; + size_t total_positions = 0; + if (total_blocks != 0 && total_positions != 0) { + printf("\n\t\t Blocks\t\tPositions\t\t\tBlocks/Position\n"); + printf(" Total:\t\t %7zu\t\t %7zu \t\t\t%10f%%\n\n", total_blocks, + total_positions, 100.0 * total_blocks / total_positions); + } +} + +void AuxOut::DumpCoeffImage(const char* label, + const Image3S& coeff_image) const { + JXL_ASSERT(coeff_image.xsize() % 64 == 0); + Image3S reshuffled(coeff_image.xsize() / 8, coeff_image.ysize() * 8); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 
0; y < coeff_image.ysize(); y++) { + for (size_t x = 0; x < coeff_image.xsize(); x += 64) { + for (size_t i = 0; i < 64; i++) { + reshuffled.PlaneRow(c, 8 * y + i / 8)[x / 8 + i % 8] = + coeff_image.PlaneRow(c, y)[x + i]; + } + } + } + } + DumpImage(label, reshuffled); +} + +void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer, + BitWriter::Allotment* JXL_RESTRICT allotment, + size_t layer, AuxOut* JXL_RESTRICT aux_out) { + size_t used_bits, unused_bits; + allotment->PrivateReclaim(writer, &used_bits, &unused_bits); + +#if 0 + printf("Layer %s bits: max %zu used %zu unused %zu\n", LayerName(layer), + allotment->MaxBits(), used_bits, unused_bits); +#endif + + // This may be a nested call with aux_out == null. Whenever we know that + // aux_out is null, we can call ReclaimUnused directly. + if (aux_out != nullptr) { + aux_out->layers[layer].total_bits += used_bits; + aux_out->layers[layer].histogram_bits += allotment->HistogramBits(); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.h new file mode 100644 index 0000000000..5baf5bbc28 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out.h @@ -0,0 +1,311 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_AUX_OUT_H_ +#define LIB_JXL_AUX_OUT_H_ + +// Optional output information for debugging and analyzing size usage. 
+ +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/jxl_inspection.h" + +namespace jxl { + +// For LayerName and AuxOut::layers[] index. Order does not matter. +enum { + kLayerHeader = 0, + kLayerTOC, + kLayerNoise, + kLayerQuant, + kLayerDequantTables, + kLayerOrder, + kLayerDC, + kLayerControlFields, + kLayerAC, + kLayerACTokens, + kLayerDictionary, + kLayerDots, + kLayerSplines, + kLayerLossless, + kLayerModularGlobal, + kLayerModularDcGroup, + kLayerModularAcGroup, + kLayerModularTree, + kLayerAlpha, + kLayerDepth, + kLayerExtraChannels, + kNumImageLayers +}; + +static inline const char* LayerName(size_t layer) { + switch (layer) { + case kLayerHeader: + return "headers"; + case kLayerTOC: + return "TOC"; + case kLayerNoise: + return "noise"; + case kLayerQuant: + return "quantizer"; + case kLayerDequantTables: + return "quant tables"; + case kLayerOrder: + return "order"; + case kLayerDC: + return "DC"; + case kLayerControlFields: + return "ControlFields"; + case kLayerAC: + return "AC"; + case kLayerACTokens: + return "ACTokens"; + case kLayerDictionary: + return "dictionary"; + case kLayerDots: + return "dots"; + case kLayerSplines: + return "splines"; + case kLayerLossless: + return "lossless"; + case kLayerModularGlobal: + return "modularGlobal"; + case kLayerModularDcGroup: + return "modularDcGroup"; + case kLayerModularAcGroup: + return "modularAcGroup"; + case kLayerModularTree: + return "modularTree"; + case kLayerAlpha: + return "alpha"; + case kLayerDepth: + return "depth"; + case kLayerExtraChannels: + return "extra channels"; + default: + JXL_ABORT("Invalid 
layer %zu\n", layer); + } +} + +// Statistics gathered during compression or decompression. +struct AuxOut { + private: + struct LayerTotals { + void Assimilate(const LayerTotals& victim) { + num_clustered_histograms += victim.num_clustered_histograms; + histogram_bits += victim.histogram_bits; + extra_bits += victim.extra_bits; + total_bits += victim.total_bits; + clustered_entropy += victim.clustered_entropy; + } + void Print(size_t num_inputs) const { + printf("%10zd", total_bits); + if (histogram_bits != 0) { + printf(" [c/i:%6.2f | hst:%8zd | ex:%8zd | h+c+e:%12.3f", + num_clustered_histograms * 1.0 / num_inputs, histogram_bits >> 3, + extra_bits >> 3, + (histogram_bits + clustered_entropy + extra_bits) / 8.0); + printf("]"); + } + printf("\n"); + } + size_t num_clustered_histograms = 0; + size_t extra_bits = 0; + + // Set via BitsWritten below + size_t histogram_bits = 0; + size_t total_bits = 0; + + double clustered_entropy = 0.0; + }; + + public: + AuxOut() = default; + AuxOut(const AuxOut&) = default; + + void Assimilate(const AuxOut& victim) { + for (size_t i = 0; i < layers.size(); ++i) { + layers[i].Assimilate(victim.layers[i]); + } + num_blocks += victim.num_blocks; + num_dct2_blocks += victim.num_dct2_blocks; + num_dct4_blocks += victim.num_dct4_blocks; + num_dct4x8_blocks += victim.num_dct4x8_blocks; + num_afv_blocks += victim.num_afv_blocks; + num_dct8_blocks += victim.num_dct8_blocks; + num_dct8x16_blocks += victim.num_dct8x16_blocks; + num_dct8x32_blocks += victim.num_dct8x32_blocks; + num_dct16_blocks += victim.num_dct16_blocks; + num_dct16x32_blocks += victim.num_dct16x32_blocks; + num_dct32_blocks += victim.num_dct32_blocks; + num_butteraugli_iters += victim.num_butteraugli_iters; + for (size_t i = 0; i < dc_pred_usage.size(); ++i) { + dc_pred_usage[i] += victim.dc_pred_usage[i]; + dc_pred_usage_xb[i] += victim.dc_pred_usage_xb[i]; + } + } + + void Print(size_t num_inputs) const; + + template + void DumpImage(const char* label, const Image3& 
image) const { + if (!dump_image) return; + if (debug_prefix.empty()) return; + std::ostringstream pathname; + pathname << debug_prefix << label << ".png"; + CodecInOut io; + // Always save to 16-bit png. + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = ColorEncoding::SRGB(); + io.SetFromImage(ConvertToFloat(image), io.metadata.m.color_encoding); + (void)dump_image(io, pathname.str()); + } + template + void DumpImage(const char* label, const Plane& image) { + DumpImage(label, + Image3(CopyImage(image), CopyImage(image), CopyImage(image))); + } + + template + void DumpXybImage(const char* label, const Image3& image) const { + if (!dump_image) return; + if (debug_prefix.empty()) return; + std::ostringstream pathname; + pathname << debug_prefix << label << ".png"; + + Image3F linear(image.xsize(), image.ysize()); + OpsinParams opsin_params; + opsin_params.Init(kDefaultIntensityTarget); + OpsinToLinear(image, Rect(linear), nullptr, &linear, opsin_params); + + CodecInOut io; + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = ColorEncoding::LinearSRGB(); + io.SetFromImage(std::move(linear), io.metadata.m.color_encoding); + + (void)dump_image(io, pathname.str()); + } + + // Normalizes all the channels to range 0-1, creating a false-color image + // which allows seeing the information from non-RGB channels in an RGB debug + // image. + template + void DumpImageNormalized(const char* label, const Image3& image) const { + std::array min; + std::array max; + Image3MinMax(image, &min, &max); + Image3B normalized(image.xsize(), image.ysize()); + for (size_t c = 0; c < 3; ++c) { + float mul = min[c] == max[c] ? 
0 : (255.0f / (max[c] - min[c])); // map [min, max] onto the full 0..255 uint8_t range + for (size_t y = 0; y < image.ysize(); ++y) { + const T* JXL_RESTRICT row_in = image.ConstPlaneRow(c, y); + uint8_t* JXL_RESTRICT row_out = normalized.PlaneRow(c, y); + for (size_t x = 0; x < image.xsize(); ++x) { + row_out[x] = static_cast((row_in[x] - min[c]) * mul); + } + } + } + DumpImage(label, normalized); + } + + template + void DumpPlaneNormalized(const char* label, const Plane& image) const { + T min; + T max; + ImageMinMax(image, &min, &max); + Image3B normalized(image.xsize(), image.ysize()); + for (size_t c = 0; c < 3; ++c) { + float mul = min == max ? 0 : (255.0f / (max - min)); + for (size_t y = 0; y < image.ysize(); ++y) { + const T* JXL_RESTRICT row_in = image.ConstRow(y); + uint8_t* JXL_RESTRICT row_out = normalized.PlaneRow(c, y); + for (size_t x = 0; x < image.xsize(); ++x) { + row_out[x] = static_cast((row_in[x] - min) * mul); + } + } + } + DumpImage(label, normalized); + } + + // This dumps coefficients as a 16-bit PNG with coefficients of a block placed + // in the area that would contain that block in a normal image. To view the + // resulting image manually, rescale intensities by using: + // $ convert -auto-level IMAGE.PNG - | display - + void DumpCoeffImage(const char* label, const Image3S& coeff_image) const; + + void SetInspectorImage3F(const jxl::InspectorImage3F& inspector) { + inspector_image3f_ = inspector; + } + + // Allows hooking intermediate data inspection into various places of the + // processing pipeline. Returns true iff processing should proceed. + bool InspectImage3F(const char* label, const Image3F& image) { + if (inspector_image3f_ != nullptr) { + return inspector_image3f_(label, image); + } + return true; + } + + std::array layers; + size_t num_blocks = 0; + + // Number of blocks that use larger DCT (set by ac_strategy). 
+ size_t num_dct2_blocks = 0; + size_t num_dct4_blocks = 0; + size_t num_dct4x8_blocks = 0; + size_t num_afv_blocks = 0; + size_t num_dct8_blocks = 0; + size_t num_dct8x16_blocks = 0; + size_t num_dct8x32_blocks = 0; + size_t num_dct16_blocks = 0; + size_t num_dct16x32_blocks = 0; + size_t num_dct32_blocks = 0; + + std::array dc_pred_usage = {0}; + std::array dc_pred_usage_xb = {0}; + + int num_butteraugli_iters = 0; + + // If not empty, additional debugging information (e.g. debug images) is + // saved in files with this prefix. + std::string debug_prefix; + + // By how much the decoded image was downsampled relative to the encoded + // image. + size_t downsampling = 1; + + jxl::InspectorImage3F inspector_image3f_; + + std::function dump_image = + nullptr; +}; + +// Used to skip image creation if they won't be written to debug directory. +static inline bool WantDebugOutput(const AuxOut* aux_out) { + // Need valid pointer and filename. + return aux_out != nullptr && !aux_out->debug_prefix.empty(); +} + +} // namespace jxl + +#endif // LIB_JXL_AUX_OUT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out_fwd.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out_fwd.h new file mode 100644 index 0000000000..29b31ad87a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/aux_out_fwd.h @@ -0,0 +1,28 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_AUX_OUT_FWD_H_ +#define LIB_JXL_AUX_OUT_FWD_H_ + +#include + +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +struct AuxOut; + +// Helper function that ensures the `bits_written` are charged to `layer` in +// `aux_out`. 
Example usage: +// BitWriter::Allotment allotment(&writer, max_bits); +// writer.Write(..); writer.Write(..); +// ReclaimAndCharge(&writer, &allotment, layer, aux_out); +void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer, + BitWriter::Allotment* JXL_RESTRICT allotment, + size_t layer, AuxOut* JXL_RESTRICT aux_out); + +} // namespace jxl + +#endif // LIB_JXL_AUX_OUT_FWD_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/arch_macros.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/arch_macros.h new file mode 100644 index 0000000000..a98301915e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/arch_macros.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_ARCH_MACROS_H_ +#define LIB_JXL_BASE_ARCH_MACROS_H_ + +// Defines the JXL_ARCH_* macros. + +namespace jxl { + +#if defined(__x86_64__) || defined(_M_X64) +#define JXL_ARCH_X64 1 +#else +#define JXL_ARCH_X64 0 +#endif + +#if defined(__powerpc64__) || defined(_M_PPC) +#define JXL_ARCH_PPC 1 +#else +#define JXL_ARCH_PPC 0 +#endif + +#if defined(__aarch64__) || defined(__arm__) +#define JXL_ARCH_ARM 1 +#else +#define JXL_ARCH_ARM 0 +#endif + +} // namespace jxl + +#endif // LIB_JXL_BASE_ARCH_MACROS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/bits.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/bits.h new file mode 100644 index 0000000000..9f86118e72 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/bits.h @@ -0,0 +1,147 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_BITS_H_ +#define LIB_JXL_BASE_BITS_H_ + +// Specialized instructions for processing register-sized bit arrays. 
+ +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +#if JXL_COMPILER_MSVC +#include +#endif + +#include +#include + +namespace jxl { + +// Empty struct used as a size tag type. +template +struct SizeTag {}; + +template +constexpr bool IsSigned() { + return T(0) > T(-1); +} + +// Undefined results for x == 0. +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC + unsigned long index; + _BitScanReverse(&index, x); + return 31 - index; +#else + return static_cast(__builtin_clz(x)); +#endif +} +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC +#if JXL_ARCH_X64 + unsigned long index; + _BitScanReverse64(&index, x); + return 63 - index; +#else // JXL_ARCH_X64 + // _BitScanReverse64 not available + uint32_t msb = static_cast(x >> 32u); + unsigned long index; + if (msb == 0) { + uint32_t lsb = static_cast(x & 0xFFFFFFFF); + _BitScanReverse(&index, lsb); + return 63 - index; + } else { + _BitScanReverse(&index, msb); + return 31 - index; + } +#endif // JXL_ARCH_X64 +#else + return static_cast(__builtin_clzll(x)); +#endif +} +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(const T x) { + static_assert(!IsSigned(), "Num0BitsAboveMS1Bit_Nonzero: use unsigned"); + return Num0BitsAboveMS1Bit_Nonzero(SizeTag(), x); +} + +// Undefined results for x == 0. 
+static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsBelowLS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC + unsigned long index; + _BitScanForward(&index, x); + return index; +#else + return static_cast(__builtin_ctz(x)); +#endif +} +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsBelowLS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC +#if JXL_ARCH_X64 + unsigned long index; + _BitScanForward64(&index, x); + return index; +#else // JXL_ARCH_64 + // _BitScanForward64 not available + uint32_t lsb = static_cast(x & 0xFFFFFFFF); + unsigned long index; + if (lsb == 0) { + uint32_t msb = static_cast(x >> 32u); + _BitScanForward(&index, msb); + return 32 + index; + } else { + _BitScanForward(&index, lsb); + return index; + } +#endif // JXL_ARCH_X64 +#else + return static_cast(__builtin_ctzll(x)); +#endif +} +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit_Nonzero(T x) { + static_assert(!IsSigned(), "Num0BitsBelowLS1Bit_Nonzero: use unsigned"); + return Num0BitsBelowLS1Bit_Nonzero(SizeTag(), x); +} + +// Returns bit width for x == 0. +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsAboveMS1Bit(const T x) { + return (x == 0) ? sizeof(T) * 8 : Num0BitsAboveMS1Bit_Nonzero(x); +} + +// Returns bit width for x == 0. +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit(const T x) { + return (x == 0) ? sizeof(T) * 8 : Num0BitsBelowLS1Bit_Nonzero(x); +} + +// Returns base-2 logarithm, rounded down. +template +static JXL_INLINE JXL_MAYBE_UNUSED size_t FloorLog2Nonzero(const T x) { + return (sizeof(T) * 8 - 1) ^ Num0BitsAboveMS1Bit_Nonzero(x); +} + +// Returns base-2 logarithm, rounded up. 
+template +static JXL_INLINE JXL_MAYBE_UNUSED size_t CeilLog2Nonzero(const T x) { + const size_t floor_log2 = FloorLog2Nonzero(x); + if ((x & (x - 1)) == 0) return floor_log2; // power of two + return floor_log2 + 1; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_BITS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/byte_order.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/byte_order.h new file mode 100644 index 0000000000..f27017d661 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/byte_order.h @@ -0,0 +1,283 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_BYTE_ORDER_H_ +#define LIB_JXL_BASE_BYTE_ORDER_H_ + +#include +#include // memcpy + +#include "lib/jxl/base/compiler_specific.h" + +#if JXL_COMPILER_MSVC +#include // _byteswap_* +#endif + +#if (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +#define JXL_BYTE_ORDER_LITTLE 1 +#else +// This means that we don't know that the byte order is little endian, in +// this case we use endian-neutral code that works for both little- and +// big-endian. +#define JXL_BYTE_ORDER_LITTLE 0 +#endif + +// Returns whether the system is little-endian (least-significant byte first). 
+#if JXL_BYTE_ORDER_LITTLE +static constexpr bool IsLittleEndian() { return true; } +#else +static inline bool IsLittleEndian() { + const uint32_t multibyte = 1; + uint8_t byte; + memcpy(&byte, &multibyte, 1); + return byte == 1; +} +#endif + +#if JXL_COMPILER_MSVC +#define JXL_BSWAP32(x) _byteswap_ulong(x) +#define JXL_BSWAP64(x) _byteswap_uint64(x) +#else +#define JXL_BSWAP32(x) __builtin_bswap32(x) +#define JXL_BSWAP64(x) __builtin_bswap64(x) +#endif + +static JXL_INLINE uint32_t LoadBE16(const uint8_t* p) { + const uint32_t byte1 = p[0]; + const uint32_t byte0 = p[1]; + return (byte1 << 8) | byte0; +} + +static JXL_INLINE uint32_t LoadLE16(const uint8_t* p) { + const uint32_t byte0 = p[0]; + const uint32_t byte1 = p[1]; + return (byte1 << 8) | byte0; +} + +static JXL_INLINE uint32_t LoadBE24(const uint8_t* p) { + const uint32_t byte2 = p[0]; + const uint32_t byte1 = p[1]; + const uint32_t byte0 = p[2]; + return (byte2 << 16) | (byte1 << 8) | byte0; +} + +static JXL_INLINE uint32_t LoadLE24(const uint8_t* p) { + const uint32_t byte0 = p[0]; + const uint32_t byte1 = p[1]; + const uint32_t byte2 = p[2]; + return (byte2 << 16) | (byte1 << 8) | byte0; +} + +static JXL_INLINE uint32_t LoadBE32(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint32_t big; + memcpy(&big, p, 4); + return JXL_BSWAP32(big); +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint32_t byte3 = p[0]; + const uint32_t byte2 = p[1]; + const uint32_t byte1 = p[2]; + const uint32_t byte0 = p[3]; + return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +static JXL_INLINE uint64_t LoadBE64(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint64_t big; + memcpy(&big, p, 8); + return JXL_BSWAP64(big); +#else + // Byte-order-independent - can't assume this machine is big endian. 
+ const uint64_t byte7 = p[0]; + const uint64_t byte6 = p[1]; + const uint64_t byte5 = p[2]; + const uint64_t byte4 = p[3]; + const uint64_t byte3 = p[4]; + const uint64_t byte2 = p[5]; + const uint64_t byte1 = p[6]; + const uint64_t byte0 = p[7]; + return (byte7 << 56ull) | (byte6 << 48ull) | (byte5 << 40ull) | + (byte4 << 32ull) | (byte3 << 24ull) | (byte2 << 16ull) | + (byte1 << 8ull) | byte0; +#endif +} + +static JXL_INLINE uint32_t LoadLE32(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint32_t little; + memcpy(&little, p, 4); + return little; +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint32_t byte0 = p[0]; + const uint32_t byte1 = p[1]; + const uint32_t byte2 = p[2]; + const uint32_t byte3 = p[3]; + return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +static JXL_INLINE uint64_t LoadLE64(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint64_t little; + memcpy(&little, p, 8); + return little; +#else + // Byte-order-independent - can't assume this machine is big endian. 
+ const uint64_t byte0 = p[0]; + const uint64_t byte1 = p[1]; + const uint64_t byte2 = p[2]; + const uint64_t byte3 = p[3]; + const uint64_t byte4 = p[4]; + const uint64_t byte5 = p[5]; + const uint64_t byte6 = p[6]; + const uint64_t byte7 = p[7]; + return (byte7 << 56) | (byte6 << 48) | (byte5 << 40) | (byte4 << 32) | + (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +static JXL_INLINE void StoreBE16(const uint32_t native, uint8_t* p) { + p[0] = (native >> 8) & 0xFF; + p[1] = native & 0xFF; +} + +static JXL_INLINE void StoreLE16(const uint32_t native, uint8_t* p) { + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +} + +static JXL_INLINE void StoreBE24(const uint32_t native, uint8_t* p) { + p[0] = (native >> 16) & 0xFF; + p[1] = (native >> 8) & 0xFF; + p[2] = native & 0xFF; +} + +static JXL_INLINE void StoreLE24(const uint32_t native, uint8_t* p) { + p[2] = (native >> 16) & 0xFF; // bits 16..23: most significant byte of the 24-bit value + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +} + +static JXL_INLINE void StoreBE32(const uint32_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint32_t big = JXL_BSWAP32(native); + memcpy(p, &big, 4); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[0] = native >> 24; + p[1] = (native >> 16) & 0xFF; + p[2] = (native >> 8) & 0xFF; + p[3] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreBE64(const uint64_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint64_t big = JXL_BSWAP64(native); + memcpy(p, &big, 8); +#else + // Byte-order-independent - can't assume this machine is big endian. 
+ p[0] = native >> 56ull; + p[1] = (native >> 48ull) & 0xFF; + p[2] = (native >> 40ull) & 0xFF; + p[3] = (native >> 32ull) & 0xFF; + p[4] = (native >> 24ull) & 0xFF; + p[5] = (native >> 16ull) & 0xFF; + p[6] = (native >> 8ull) & 0xFF; + p[7] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreLE32(const uint32_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint32_t little = native; + memcpy(p, &little, 4); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[3] = native >> 24; + p[2] = (native >> 16) & 0xFF; + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreLE64(const uint64_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint64_t little = native; + memcpy(p, &little, 8); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[7] = native >> 56; + p[6] = (native >> 48) & 0xFF; + p[5] = (native >> 40) & 0xFF; + p[4] = (native >> 32) & 0xFF; + p[3] = (native >> 24) & 0xFF; + p[2] = (native >> 16) & 0xFF; + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +#endif +} + +// Big/Little Endian order. +struct OrderBE {}; +struct OrderLE {}; + +// Wrappers for calling from generic code. 
+static JXL_INLINE void Store16(OrderBE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreBE16(native, p); +} + +static JXL_INLINE void Store16(OrderLE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreLE16(native, p); +} + +static JXL_INLINE void Store24(OrderBE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreBE24(native, p); +} + +static JXL_INLINE void Store24(OrderLE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreLE24(native, p); +} +static JXL_INLINE void Store32(OrderBE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreBE32(native, p); +} + +static JXL_INLINE void Store32(OrderLE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreLE32(native, p); +} + +static JXL_INLINE uint32_t Load16(OrderBE /*tag*/, const uint8_t* p) { + return LoadBE16(p); +} + +static JXL_INLINE uint32_t Load16(OrderLE /*tag*/, const uint8_t* p) { + return LoadLE16(p); +} + +static JXL_INLINE uint32_t Load24(OrderBE /*tag*/, const uint8_t* p) { + return LoadBE24(p); +} + +static JXL_INLINE uint32_t Load24(OrderLE /*tag*/, const uint8_t* p) { + return LoadLE24(p); +} +static JXL_INLINE uint32_t Load32(OrderBE /*tag*/, const uint8_t* p) { + return LoadBE32(p); +} + +static JXL_INLINE uint32_t Load32(OrderLE /*tag*/, const uint8_t* p) { + return LoadLE32(p); +} + +#endif // LIB_JXL_BASE_BYTE_ORDER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc new file mode 100644 index 0000000000..35ee2aee06 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.cc @@ -0,0 +1,154 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/cache_aligned.h" + +#include +#include + +// Disabled: slower than malloc + alignment. 
+#define JXL_USE_MMAP 0
+
+#if JXL_USE_MMAP
+#include <sys/mman.h>
+#endif
+
+#include <algorithm>  // std::max
+#include <atomic>
+#include <hwy/base.h>  // kMaxVectorSize
+#include <limits>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace {
+
+#pragma pack(push, 1)
+struct AllocationHeader {
+  void* allocated;
+  size_t allocated_size;
+  uint8_t left_padding[hwy::kMaxVectorSize];
+};
+#pragma pack(pop)
+
+std::atomic<uint64_t> num_allocations{0};
+std::atomic<uint64_t> bytes_in_use{0};
+std::atomic<uint64_t> max_bytes_in_use{0};
+
+}  // namespace
+
+// Avoids linker errors in pre-C++17 builds.
+constexpr size_t CacheAligned::kPointerSize;
+constexpr size_t CacheAligned::kCacheLineSize;
+constexpr size_t CacheAligned::kAlignment;
+constexpr size_t CacheAligned::kAlias;
+
+void CacheAligned::PrintStats() {
+  printf("Allocations: %zu (max bytes in use: %E)\n",
+         size_t(num_allocations.load(std::memory_order_relaxed)),
+         double(max_bytes_in_use.load(std::memory_order_relaxed)));
+}
+
+size_t CacheAligned::NextOffset() {
+  static std::atomic<uint32_t> next{0};
+  constexpr uint32_t kGroups = CacheAligned::kAlias / CacheAligned::kAlignment;
+  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
+  return CacheAligned::kAlignment * group;
+}
+
+void* CacheAligned::Allocate(const size_t payload_size, size_t offset) {
+  JXL_ASSERT(payload_size <= std::numeric_limits<size_t>::max() / 2);
+  JXL_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
+
+  // What: | misalign | unused | AllocationHeader |payload
+  // Size: |<= kAlias | offset |                  |payload_size
+  //       ^allocated.^aligned.^header............^payload
+  // The header must immediately precede payload, which must remain aligned.
+  // To avoid wasting space, the header resides at the end of `unused`,
+  // which therefore cannot be empty (offset == 0).
+  if (offset == 0) {
+    offset = kAlignment;  // = round_up(sizeof(AllocationHeader), kAlignment)
+    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
+  }
+
+#if JXL_USE_MMAP
+  const size_t allocated_size = offset + payload_size;
+  const int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE;
+  void* allocated =
+      mmap(nullptr, allocated_size, PROT_READ | PROT_WRITE, flags, -1, 0);
+  if (allocated == MAP_FAILED) return nullptr;
+  const uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated);
+#else
+  const size_t allocated_size = kAlias + offset + payload_size;
+  void* allocated = malloc(allocated_size);
+  if (allocated == nullptr) return nullptr;
+  // Always round up even if already aligned - we already asked for kAlias
+  // extra bytes and there's no way to give them back.
+  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
+  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
+  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
+  aligned &= ~(kAlias - 1);
+#endif
+
+#if 0
+  // No effect.
+  uintptr_t page_aligned = reinterpret_cast<uintptr_t>(allocated);
+  page_aligned &= ~(4096 - 1);
+  if (madvise(reinterpret_cast<void*>(page_aligned), allocated_size,
+              MADV_WILLNEED) != 0) {
+    JXL_NOTIFY_ERROR("madvise failed");
+  }
+#elif 0
+  // INCREASES both first and subsequent decode times.
+  if (mlock(allocated, allocated_size) != 0) {
+    JXL_NOTIFY_ERROR("mlock failed");
+  }
+#endif
+
+  // Update statistics (#allocations and max bytes in use)
+  num_allocations.fetch_add(1, std::memory_order_relaxed);
+  const uint64_t prev_bytes =
+      bytes_in_use.fetch_add(allocated_size, std::memory_order_acq_rel);
+  uint64_t expected_max = max_bytes_in_use.load(std::memory_order_acquire);
+  for (;;) {
+    const uint64_t desired =
+        std::max(expected_max, prev_bytes + allocated_size);
+    if (max_bytes_in_use.compare_exchange_strong(expected_max, desired,
+                                                 std::memory_order_acq_rel)) {
+      break;
+    }
+  }
+
+  const uintptr_t payload = aligned + offset;  // still aligned
+
+  // Stash `allocated` and `allocated_size` in the header for use by Free().
+  AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1;
+  header->allocated = allocated;
+  header->allocated_size = allocated_size;
+
+  return JXL_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), 64);
+}
+
+void CacheAligned::Free(const void* aligned_pointer) {
+  if (aligned_pointer == nullptr) {
+    return;
+  }
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  JXL_ASSERT(payload % kAlignment == 0);
+  const AllocationHeader* header =
+      reinterpret_cast<const AllocationHeader*>(payload) - 1;
+
+  // Subtract (2's complement negation).
+  bytes_in_use.fetch_add(~header->allocated_size + 1,
+                         std::memory_order_acq_rel);
+
+#if JXL_USE_MMAP
+  munmap(header->allocated, header->allocated_size);
+#else
+  free(header->allocated);
+#endif
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.h
new file mode 100644
index 0000000000..e57df14837
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/cache_aligned.h
@@ -0,0 +1,74 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_CACHE_ALIGNED_H_
+#define LIB_JXL_BASE_CACHE_ALIGNED_H_
+
+// Memory allocator with support for alignment + misalignment.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <memory>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// Functions that depend on the cache line size.
+class CacheAligned {
+ public:
+  static void PrintStats();
+
+  static constexpr size_t kPointerSize = sizeof(void*);
+  static constexpr size_t kCacheLineSize = 64;
+  // To avoid RFOs, match L2 fill size (pairs of lines).
+  static constexpr size_t kAlignment = 2 * kCacheLineSize;
+  // Minimum multiple for which cache set conflicts and/or loads blocked by
+  // preceding stores can occur.
+  static constexpr size_t kAlias = 2048;
+
+  // Returns a 'random' (cyclical) offset suitable for Allocate.
+  static size_t NextOffset();
+
+  // Returns null or memory whose address is congruent to `offset` (mod kAlias);
+  // `offset` == 0 is treated as kAlignment. This reduces cache conflicts and
+  // load/store stalls for large allocations that would otherwise have similar
+  // alignments. At least `payload_size` (can be zero) bytes are accessible.
+  static void* Allocate(size_t payload_size, size_t offset);
+
+  static void* Allocate(const size_t payload_size) {
+    return Allocate(payload_size, NextOffset());
+  }
+
+  static void Free(const void* aligned_pointer);
+};
+
+// Avoids the need for a function pointer (deleter) in CacheAlignedUniquePtr.
+struct CacheAlignedDeleter {
+  void operator()(uint8_t* aligned_pointer) const {
+    CacheAligned::Free(aligned_pointer);
+  }
+};
+
+using CacheAlignedUniquePtr = std::unique_ptr<uint8_t[], CacheAlignedDeleter>;
+
+// Does not invoke constructors.
+static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes) {
+  return CacheAlignedUniquePtr(
+      static_cast<uint8_t*>(CacheAligned::Allocate(bytes)),
+      CacheAlignedDeleter());
+}
+
+static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes,
+                                                  const size_t offset) {
+  return CacheAlignedUniquePtr(
+      static_cast<uint8_t*>(CacheAligned::Allocate(bytes, offset)),
+      CacheAlignedDeleter());
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_CACHE_ALIGNED_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/compiler_specific.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/compiler_specific.h
new file mode 100644
index 0000000000..b279fa0c82
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/compiler_specific.h
@@ -0,0 +1,155 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_COMPILER_SPECIFIC_H_
+#define LIB_JXL_BASE_COMPILER_SPECIFIC_H_
+
+// Macros for compiler version + nonstandard keywords, e.g. __builtin_expect.
+
+#include <stdint.h>
+
+// #if is shorter and safer than #ifdef. JXL_COMPILER_* are zero if not
+// detected, otherwise 100 * major + minor version. Other packages check for
+// #ifdef COMPILER_MSVC, so we cannot use that same name.
+
+#ifdef _MSC_VER
+#define JXL_COMPILER_MSVC _MSC_VER
+#else
+#define JXL_COMPILER_MSVC 0
+#endif
+
+#ifdef __GNUC__
+#define JXL_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define JXL_COMPILER_GCC 0
+#endif
+
+#ifdef __clang__
+#define JXL_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
+// Clang pretends to be GCC for compatibility.
+#undef JXL_COMPILER_GCC
+#define JXL_COMPILER_GCC 0
+#else
+#define JXL_COMPILER_CLANG 0
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_RESTRICT __restrict
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_RESTRICT __restrict__
+#else
+#define JXL_RESTRICT
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_INLINE __forceinline
+#define JXL_NOINLINE __declspec(noinline)
+#else
+#define JXL_INLINE inline __attribute__((always_inline))
+#define JXL_NOINLINE __attribute__((noinline))
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_NORETURN __declspec(noreturn)
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_NORETURN __attribute__((noreturn))
+#else
+#define JXL_NORETURN
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_UNREACHABLE __assume(false)
+#elif JXL_COMPILER_CLANG || JXL_COMPILER_GCC >= 405
+#define JXL_UNREACHABLE __builtin_unreachable()
+#else
+#define JXL_UNREACHABLE
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_MAYBE_UNUSED
+#else
+// Encountered "attribute list cannot appear here" when using the C++17
+// [[maybe_unused]], so only use the old style attribute for now.
+#define JXL_MAYBE_UNUSED __attribute__((unused))
+#endif
+
+#if JXL_COMPILER_MSVC
+// Unsupported, __assume is not the same. Parenthesized to stay one operand.
+#define JXL_LIKELY(expr) (expr)
+#define JXL_UNLIKELY(expr) (expr)
+#else
+#define JXL_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define JXL_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+#if JXL_COMPILER_MSVC
+#include <intrin.h>
+
+#pragma intrinsic(_ReadWriteBarrier)
+#define JXL_COMPILER_FENCE _ReadWriteBarrier()
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_COMPILER_FENCE asm volatile("" : : : "memory")
+#else
+#define JXL_COMPILER_FENCE
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if JXL_COMPILER_CLANG
+// Early Clang versions lacked __builtin_assume_aligned, so probe for it.
+#define JXL_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned)
+#elif JXL_COMPILER_GCC
+#define JXL_HAS_ASSUME_ALIGNED 1
+#else
+#define JXL_HAS_ASSUME_ALIGNED 0
+#endif
+
+#if JXL_HAS_ASSUME_ALIGNED
+#define JXL_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+#else
+#define JXL_ASSUME_ALIGNED(ptr, align) (ptr) /* no-op: not supported */
+#endif
+
+#ifdef __has_attribute
+#define JXL_HAVE_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define JXL_HAVE_ATTRIBUTE(x) 0  // No probe available: report "unsupported".
+#endif
+
+// Raises warnings if the function return value is unused. Should appear as the
+// first part of a function definition/declaration. Expands to nothing if the
+#if JXL_HAVE_ATTRIBUTE(nodiscard)
+#define JXL_MUST_USE_RESULT [[nodiscard]]
+#elif JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(warn_unused_result)
+#define JXL_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define JXL_MUST_USE_RESULT  // neither attribute is available
+#endif
+
+// Disable certain -fsanitize flags for functions that are expected to include
+// things like unsigned integer overflow. For example use in the function
+// declaration JXL_NO_SANITIZE("unsigned-integer-overflow") to silence unsigned
+// integer overflow ubsan messages.
+#if JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(no_sanitize)
+#define JXL_NO_SANITIZE(X) __attribute__((no_sanitize(X)))
+#else
+#define JXL_NO_SANITIZE(X)
+#endif
+
+#if JXL_HAVE_ATTRIBUTE(__format__)
+#define JXL_FORMAT(idx_fmt, idx_arg) \
+  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
+#else
+#define JXL_FORMAT(idx_fmt, idx_arg)
+#endif
+
+#if JXL_COMPILER_MSVC
+using ssize_t = intptr_t;
+#endif
+
+#endif  // LIB_JXL_BASE_COMPILER_SPECIFIC_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc
new file mode 100644
index 0000000000..20a911255c
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+
+namespace jxl {
+
+// static
+// Sequential fallback runner: initializes for one thread, then invokes `func`
+// for each task in [start_range, end_range) in order with thread id 0.
+JxlParallelRetCode ThreadPool::SequentialRunnerStatic(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  const JxlParallelRetCode status = init(jpegxl_opaque, 1);
+  if (status != 0) {
+    return status;  // Propagate the init failure code unchanged.
+  }
+
+  uint32_t task = start_range;
+  while (task < end_range) {
+    func(jpegxl_opaque, task, 0);
+    ++task;
+  }
+  return 0;
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.h
new file mode 100644
index 0000000000..8982974009
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/data_parallel.h
@@ -0,0 +1,155 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_DATA_PARALLEL_H_
+#define LIB_JXL_BASE_DATA_PARALLEL_H_
+
+// Portable, low-overhead C++11 ThreadPool alternative to OpenMP for
+// data-parallel computations.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "jxl/parallel_runner.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+class ThreadPool {
+ public:
+  // Use this type as an InitFunc to skip the initialization step in Run().
+  // When this is used the return value of Run() is always true and does not
+  // need to be checked.
+  struct SkipInit {};
+
+  ThreadPool(JxlParallelRunner runner, void* runner_opaque)
+      : runner_(runner ? runner : &ThreadPool::SequentialRunnerStatic),
+        runner_opaque_(runner ? runner_opaque : static_cast<void*>(this)) {}
+
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+  // Runs init_func(num_threads) followed by data_func(task, thread) on worker
+  // thread(s) for every task in [begin, end). init_func() must return a Status
+  // indicating whether the initialization succeeded.
+  // "thread" is an integer smaller than num_threads.
+  // Not thread-safe - no two calls to Run may overlap.
+  // Subsequent calls will reuse the same threads.
+  //
+  // Precondition: begin <= end.
+  template <class InitFunc, class DataFunc>
+  Status Run(uint32_t begin, uint32_t end, const InitFunc& init_func,
+             const DataFunc& data_func, const char* caller = "") {
+    JXL_ASSERT(begin <= end);
+    if (begin == end) return true;
+    RunCallState<InitFunc, DataFunc> call_state(init_func, data_func);
+    // The runner_ uses the C convention and returns 0 on success (nonzero on
+    // error), so we convert it to a Status.
+    return (*runner_)(runner_opaque_, static_cast<void*>(&call_state),
+                      &call_state.CallInitFunc, &call_state.CallDataFunc, begin,
+                      end) == 0;
+  }
+
+  // Overload that returns bool when SkipInit is used.
+  template <class DataFunc>
+  bool Run(uint32_t begin, uint32_t end, const SkipInit /* tag */,
+           const DataFunc& data_func, const char* caller = "") {
+    return Run(begin, end, ReturnTrueInit, data_func, caller);
+  }
+
+ private:
+  static Status ReturnTrueInit(size_t num_threads) { return true; }
+
+  // class holding the state of a Run() call to pass to the runner_ as an
+  // opaque_jpegxl pointer.
+  template <class InitFunc, class DataFunc>
+  class RunCallState final {
+   public:
+    RunCallState(const InitFunc& init_func, const DataFunc& data_func)
+        : init_func_(init_func), data_func_(data_func) {}
+
+    // JxlParallelRunInit interface.
+    static int CallInitFunc(void* jpegxl_opaque, size_t num_threads) {
+      const auto* self =
+          static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+      // Returns -1 when the internal init function returns false Status to
+      // indicate an error.
+      return self->init_func_(num_threads) ? 0 : -1;
+    }
+
+    // JxlParallelRunFunction interface.
+    static void CallDataFunc(void* jpegxl_opaque, uint32_t value,
+                             size_t thread_id) {
+      const auto* self =
+          static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+      return self->data_func_(value, thread_id);
+    }
+
+   private:
+    const InitFunc& init_func_;
+    const DataFunc& data_func_;
+  };
+
+  // Default JxlParallelRunner used when no runner is provided by the
+  // caller. This runner doesn't use any threading and thread_id is always 0.
+  static JxlParallelRetCode SequentialRunnerStatic(
+      void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+      JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
+
+  // The caller supplied runner function and its opaque void*.
+  const JxlParallelRunner runner_;
+  void* const runner_opaque_;
+};
+
+// TODO(deymo): Convert the return value to a Status when not using SkipInit.
+template <class InitFunc, class DataFunc>
+bool RunOnPool(ThreadPool* pool, const uint32_t begin, const uint32_t end,
+               const InitFunc& init_func, const DataFunc& data_func,
+               const char* caller) {
+  Status ret = true;
+  if (pool == nullptr) {
+    ThreadPool default_pool(nullptr, nullptr);
+    ret = default_pool.Run(begin, end, init_func, data_func, caller);
+  } else {
+    ret = pool->Run(begin, end, init_func, data_func, caller);
+  }
+  return ret;
+}
+
+// Accelerates multiple unsigned 32-bit divisions with the same divisor by
+// precomputing a multiplier. This is useful for splitting a contiguous range of
+// indices (the task index) into 2D indices. Exhaustively tested on dividends
+// up to 4M with non-power of two divisors up to 2K.
+class Divider {
+ public:
+  // "d" is the divisor (what to divide by).
+  explicit Divider(const uint32_t d) : shift_(FloorLog2Nonzero(d)) {
+    // Power of two divisors (including 1) are not supported because it is more
+    // efficient to special-case them at a higher level.
+    JXL_ASSERT((d & (d - 1)) != 0);
+
+    // ceil_log2 = floor_log2 + 1 because we ruled out powers of two above.
+    const uint64_t next_pow2 = 1ULL << (shift_ + 1);
+
+    mul_ = ((next_pow2 - d) << 32) / d + 1;
+  }
+
+  // "n" is the numerator (what is being divided).
+  inline uint32_t operator()(const uint32_t n) const {
+    // Algorithm from "Division by Invariant Integers using Multiplication".
+    // Its "sh1" is hardcoded to 1 because we don't need to handle d=1.
+    const uint32_t hi = (uint64_t(mul_) * n) >> 32;
+    return (hi + ((n - hi) >> 1)) >> shift_;
+  }
+
+ private:
+  uint32_t mul_;
+  const int shift_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_DATA_PARALLEL_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/descriptive_statistics.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/descriptive_statistics.cc
new file mode 100644
index 0000000000..9303f2c776
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/descriptive_statistics.cc
@@ -0,0 +1,102 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/descriptive_statistics.h"
+
+#include <math.h>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+void Stats::Assimilate(const Stats& other) {
+  const int64_t total_n = n_ + other.n_;
+  if (total_n == 0) return;  // Nothing to do; prevents div by zero.
+
+  min_ = std::min(min_, other.min_);
+  max_ = std::max(max_, other.max_);
+
+  product_ *= other.product_;
+
+  const double product_n = static_cast<double>(n_) * other.n_;
+  const double n2 = static_cast<double>(n_) * n_;
+  const double other_n2 = static_cast<double>(other.n_) * other.n_;
+  // All products are formed in double: multiplying int64 counts can overflow.
+  const double total_n2 = static_cast<double>(total_n) * total_n;
+  const double total_n3 = total_n2 * total_n;
+  // Precompute reciprocal for speed - used at least twice.
+  const double inv_total_n = 1.0 / total_n;
+  const double inv_total_n2 = 1.0 / total_n2;
+
+  const double delta = other.m1_ - m1_;
+  const double delta2 = delta * delta;
+  const double delta3 = delta * delta2;
+  const double delta4 = delta2 * delta2;
+
+  m1_ = (n_ * m1_ + other.n_ * other.m1_) * inv_total_n;
+
+  const double new_m2 = m2_ + other.m2_ + delta2 * product_n * inv_total_n;
+
+  const double new_m3 =
+      m3_ + other.m3_ + delta3 * product_n * (n_ - other.n_) * inv_total_n2 +
+      3.0 * delta * (n_ * other.m2_ - other.n_ * m2_) * inv_total_n;
+
+  m4_ += other.m4_ +
+         delta4 * product_n * (n2 - product_n + other_n2) / total_n3 +
+         6.0 * delta2 * (n2 * other.m2_ + other_n2 * m2_) * inv_total_n2 +
+         4.0 * delta * (n_ * other.m3_ - other.n_ * m3_) * inv_total_n;
+
+  m2_ = new_m2;
+  m3_ = new_m3;
+  n_ = total_n;
+}
+
+std::string Stats::ToString(int exclude) const {
+  if (Count() == 0) return std::string("(none)");
+
+  char buf[300];
+  size_t pos = 0;
+  int ret;  // snprintf: untruncated length (may exceed space), <0 on error.
+
+  if ((exclude & kNoCount) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "Count=%6zu ",
+                   static_cast<size_t>(Count()));
+    JXL_ASSERT(ret > 0);
+    pos = std::min(pos + static_cast<size_t>(ret), sizeof(buf) - 1);
+  }
+
+  if ((exclude & kNoMeanSD) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "Mean=%9.6f SD=%8.5f ", Mean(),
+                   StandardDeviation());
+    JXL_ASSERT(ret > 0);
+    pos = std::min(pos + static_cast<size_t>(ret), sizeof(buf) - 1);
+  }
+
+  if ((exclude & kNoMinMax) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "Min=%8.5f Max=%8.5f ", Min(),
+                   Max());
+    JXL_ASSERT(ret > 0);
+    pos = std::min(pos + static_cast<size_t>(ret), sizeof(buf) - 1);
+  }
+
+  if ((exclude & kNoSkewKurt) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "Skew=%5.2f Kurt=%7.2f ",
+                   Skewness(), Kurtosis());
+    JXL_ASSERT(ret > 0);
+    pos = std::min(pos + static_cast<size_t>(ret), sizeof(buf) - 1);
+  }
+
+  if ((exclude & kNoGeomean) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "GeoMean=%9.6f ",
+                   GeometricMean());
+    JXL_ASSERT(ret > 0);
+    pos = std::min(pos + static_cast<size_t>(ret), sizeof(buf) - 1);
+  }
+
+  JXL_ASSERT(pos < sizeof(buf));
+  return buf;
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/descriptive_statistics.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/descriptive_statistics.h
new file mode 100644
index 0000000000..0d1e4850e1
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/descriptive_statistics.h
@@ -0,0 +1,126 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_DESCRIPTIVE_STATISTICS_H_
+#define LIB_JXL_BASE_DESCRIPTIVE_STATISTICS_H_
+
+// For analyzing the range/distribution of scalars.
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+
+namespace jxl {
+
+// Descriptive statistics of a variable (4 moments).
+class Stats {
+ public:
+  void Notify(const float x) {
+    ++n_;
+
+    min_ = std::min(min_, x);
+    max_ = std::max(max_, x);
+
+    product_ *= x;
+
+    // Online moments. Reference: https://goo.gl/9ha694
+    const double d = x - m1_;
+    const double d_div_n = d / n_;
+    const double d2n1_div_n = d * (n_ - 1) * d_div_n;
+    const int64_t n_poly = n_ * n_ - 3 * n_ + 3;
+    m1_ += d_div_n;
+    m4_ += d_div_n * (d_div_n * (d2n1_div_n * n_poly + 6.0 * m2_) - 4.0 * m3_);
+    m3_ += d_div_n * (d2n1_div_n * (n_ - 2) - 3.0 * m2_);
+    m2_ += d2n1_div_n;
+  }
+
+  void Assimilate(const Stats& other);
+
+  int64_t Count() const { return n_; }
+
+  float Min() const { return min_; }
+  float Max() const { return max_; }
+
+  double GeometricMean() const {
+    return n_ == 0 ? 0.0 : pow(product_, 1.0 / n_);
+  }
+
+  double Mean() const { return m1_; }
+  // Same as Mu2. Assumes n_ is large.
+  double SampleVariance() const {
+    return n_ == 0 ? 0.0 : m2_ / static_cast<double>(n_);
+  }
+  // Unbiased estimator for population variance even for smaller n_.
+  double Variance() const {
+    if (n_ == 0) return 0.0;
+    if (n_ == 1) return m2_;
+    return m2_ / static_cast<double>(n_ - 1);
+  }
+  double StandardDeviation() const { return std::sqrt(Variance()); }
+  // Near zero for normal distributions; if positive on a unimodal distribution,
+  // the right tail is fatter. Assumes n_ is large.
+  double SampleSkewness() const {
+    if (std::abs(m2_) < 1E-7) return 0.0;
+    return m3_ * std::sqrt(static_cast<double>(n_)) / std::pow(m2_, 1.5);
+  }
+  // Corrected for bias (same as Wikipedia and Minitab but not Excel).
+  double Skewness() const {
+    if (n_ == 0) return 0.0;
+    const double biased = SampleSkewness();
+    const double r = (n_ - 1.0) / n_;
+    return biased * std::pow(r, 1.5);
+  }
+  // Near 3 for normal distributions (not excess kurtosis); smaller values mean
+  // fewer/smaller outliers, larger values more/larger. Assumes n_ is large.
+  double SampleKurtosis() const {
+    if (std::abs(m2_) < 1E-7) return 0.0;
+    return m4_ * n_ / (m2_ * m2_);
+  }
+  // Corrected for bias (same as Wikipedia and Minitab but not Excel).
+  double Kurtosis() const {
+    if (n_ == 0) return 0.0;
+    const double biased = SampleKurtosis();
+    const double r = (n_ - 1.0) / n_;
+    return biased * r * r;
+  }
+
+  // Central moments, useful for "method of moments"-based parameter estimation
+  // of a mixture of two Gaussians. Assumes Count() != 0.
+  double Mu1() const { return m1_; }
+  double Mu2() const { return m2_ / static_cast<double>(n_); }
+  double Mu3() const { return m3_ / static_cast<double>(n_); }
+  double Mu4() const { return m4_ / static_cast<double>(n_); }
+
+  // Which statistics to EXCLUDE in ToString
+  enum {
+    kNoCount = 1,
+    kNoMeanSD = 2,
+    kNoMinMax = 4,
+    kNoSkewKurt = 8,
+    kNoGeomean = 16
+  };
+
+  std::string ToString(int exclude = 0) const;
+
+ private:
+  int64_t n_ = 0;  // signed for faster conversion + safe subtraction
+
+  float min_ = 1E30f;
+  float max_ = -1E30f;
+
+  double product_ = 1.0;
+
+  // Moments
+  double m1_ = 0.0;
+  double m2_ = 0.0;
+  double m3_ = 0.0;
+  double m4_ = 0.0;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_DESCRIPTIVE_STATISTICS_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/file_io.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/file_io.h
new file mode 100644
index 0000000000..2ecf854e1b
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/file_io.h
@@ -0,0 +1,112 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FILE_IO_H_
+#define LIB_JXL_BASE_FILE_IO_H_
+
+// Helper functions for reading/writing files.
+
+#include <stdio.h>
+#include <sys/stat.h>
+
+#include <string>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Returns extension including the dot, or empty string if none. Assumes
+// filename is not a hidden file (e.g. ".bashrc").
+// Pathnames are fine if the filename has a dot or no other component does.
+static inline std::string Extension(const std::string& filename) {
+  const size_t pos = filename.rfind('.');
+  if (pos == std::string::npos) return std::string();
+  return filename.substr(pos);
+}
+
+// RAII, ensures files are closed even when returning early.
+class FileWrapper {
+ public:
+  FileWrapper(const FileWrapper& other) = delete;
+  FileWrapper& operator=(const FileWrapper& other) = delete;
+
+  explicit FileWrapper(const std::string& pathname, const char* mode)
+      : file_(fopen(pathname.c_str(), mode)) {}
+
+  ~FileWrapper() {
+    if (file_ != nullptr) {
+      const int err = fclose(file_);
+      JXL_CHECK(err == 0);
+    }
+  }
+
+  // We intend to use FileWrapper as a replacement of FILE.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  operator FILE*() const { return file_; }
+
+ private:
+  FILE* const file_;
+};
+
+template <typename ContainerType>
+static inline Status ReadFile(const std::string& pathname,
+                              ContainerType* JXL_RESTRICT bytes) {
+  FileWrapper f(pathname, "rb");
+  if (f == nullptr) return JXL_FAILURE("Failed to open file for reading");
+
+  // Ensure it is a regular file
+#ifdef _WIN32
+  struct __stat64 s = {};
+  const int err = _stat64(pathname.c_str(), &s);
+  const bool is_file = (s.st_mode & S_IFREG) != 0;
+#else
+  struct stat s = {};
+  const int err = stat(pathname.c_str(), &s);
+  const bool is_file = S_ISREG(s.st_mode);
+#endif
+  if (err != 0) return JXL_FAILURE("Failed to obtain file status");
+  if (!is_file) return JXL_FAILURE("Not a file");
+
+  // Get size of file in bytes
+  const int64_t size = s.st_size;
+  if (size <= 0) return JXL_FAILURE("Empty or invalid file size");
+  bytes->resize(static_cast<size_t>(size));
+
+  size_t pos = 0;
+  while (pos < bytes->size()) {
+    // Needed in case ContainerType is std::string, whose data() is const.
+    char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
+    const size_t bytes_read =
+        fread(bytes_writable + pos, 1, bytes->size() - pos, f);
+    if (bytes_read == 0) return JXL_FAILURE("Failed to read");
+    pos += bytes_read;
+  }
+  JXL_ASSERT(pos == bytes->size());
+  return true;
+}
+
+template <typename ContainerType>
+static inline Status WriteFile(const ContainerType& bytes,
+                               const std::string& pathname) {
+  FileWrapper f(pathname, "wb");
+  if (f == nullptr) return JXL_FAILURE("Failed to open file for writing");
+
+  size_t pos = 0;
+  while (pos < bytes.size()) {
+    const size_t bytes_written =
+        fwrite(bytes.data() + pos, 1, bytes.size() - pos, f);
+    if (bytes_written == 0) return JXL_FAILURE("Failed to write");
+    pos += bytes_written;
+  }
+  JXL_ASSERT(pos == bytes.size());
+
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_FILE_IO_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/iaca.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/iaca.h
new file mode 100644
index 0000000000..e5732dae5c
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/iaca.h
@@ -0,0 +1,65 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_IACA_H_
+#define LIB_JXL_BASE_IACA_H_
+
+#include "lib/jxl/base/compiler_specific.h"
+
+// IACA (Intel's Code Analyzer) analyzes instruction latencies, but only for
+// code between special markers. These functions embed such markers in an
+// executable, but only for reading via IACA - they deliberately trigger a
+// crash if executed to ensure they are removed in normal builds.
+
+#ifndef JXL_IACA_ENABLED
+#define JXL_IACA_ENABLED 0
+#endif
+
+namespace jxl {
+
+// Call before the region of interest.
+static JXL_INLINE void BeginIACA() {
+#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG)
+  asm volatile(
+      // UD2 "instruction" raises an invalid opcode exception.
+      ".byte 0x0F, 0x0B\n\t"
+      // Magic sequence recognized by IACA (MOV + addr32 fs:NOP). This actually
+      // clobbers EBX, but we don't care because the code won't be run, and we
+      // want IACA to observe the same code the compiler would have generated
+      // without this marker.
+      "movl $111, %%ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t"
+      :
+      :
+      // (Allegedly) clobbering memory may prevent reordering.
+      : "memory");
+#endif
+}
+
+// Call after the region of interest.
+static JXL_INLINE void EndIACA() {
+#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG)
+  asm volatile(
+      // See above.
+      "movl $222, %%ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t"
+      // UD2
+      ".byte 0x0F, 0x0B\n\t"
+      :
+      :
+      // (Allegedly) clobbering memory may prevent reordering.
+      : "memory");
+#endif
+}
+
+// Add to a scope to mark a region.
+struct ScopeIACA {
+  JXL_INLINE ScopeIACA() { BeginIACA(); }
+  JXL_INLINE ~ScopeIACA() { EndIACA(); }
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_IACA_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/os_macros.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/os_macros.h
new file mode 100644
index 0000000000..b230f26758
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/os_macros.h
@@ -0,0 +1,50 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_OS_MACROS_H_
+#define LIB_JXL_BASE_OS_MACROS_H_
+
+// Defines the JXL_OS_* macros.
+
+#if defined(_WIN32) || defined(_WIN64)
+#define JXL_OS_WIN 1
+#else
+#define JXL_OS_WIN 0
+#endif
+
+#ifdef __linux__
+#define JXL_OS_LINUX 1
+#else
+#define JXL_OS_LINUX 0
+#endif
+
+#ifdef __MACH__
+#define JXL_OS_MAC 1
+#else
+#define JXL_OS_MAC 0
+#endif
+
+#define JXL_OS_IOS 0
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#undef JXL_OS_IOS
+#define JXL_OS_IOS 1
+#endif
+#endif
+
+#ifdef __FreeBSD__
+#define JXL_OS_FREEBSD 1
+#else
+#define JXL_OS_FREEBSD 0
+#endif
+
+#ifdef __HAIKU__
+#define JXL_OS_HAIKU 1
+#else
+#define JXL_OS_HAIKU 0
+#endif
+
+#endif  // LIB_JXL_BASE_OS_MACROS_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/override.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/override.h
new file mode 100644
index 0000000000..1f8b657974
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/override.h
@@ -0,0 +1,29 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_OVERRIDE_H_
+#define LIB_JXL_BASE_OVERRIDE_H_
+
+// 'Trool' for command line arguments: force enable/disable, or use default.
+
+namespace jxl {
+
+// No effect if kDefault, otherwise forces a feature (typically a FrameHeader
+// flag) on or off.
+enum class Override : int { kOn = 1, kOff = 0, kDefault = -1 };
+
+static inline Override OverrideFromBool(bool flag) {
+  return flag ? Override::kOn : Override::kOff;
+}
+
+static inline bool ApplyOverride(Override o, bool default_condition) {
+  if (o == Override::kOn) return true;
+  if (o == Override::kOff) return false;
+  return default_condition;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_OVERRIDE_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc
new file mode 100644
index 0000000000..11e4bff6fe
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.cc
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/padded_bytes.h"
+
+namespace jxl {
+
+void PaddedBytes::IncreaseCapacityTo(size_t capacity) {
+  JXL_ASSERT(capacity > capacity_);
+
+  size_t new_capacity = std::max(capacity, 3 * capacity_ / 2);
+  new_capacity = std::max<size_t>(64, new_capacity);
+
+  // BitWriter writes up to 7 bytes past the end.
+  CacheAlignedUniquePtr new_data = AllocateArray(new_capacity + 8);
+  if (new_data == nullptr) {
+    // Allocation failed, discard all data to ensure this is noticed.
+    size_ = capacity_ = 0;
+    return;
+  }
+
+  if (data_ == nullptr) {
+    // First allocation: ensure first byte is initialized (won't be copied).
+    new_data[0] = 0;
+  } else {
+    // Subsequent resize: copy existing data to new location.
+    memcpy(new_data.get(), data_.get(), size_);
+    // Ensure that the first new byte is initialized, to allow write_bits to
+    // safely append to the newly-resized PaddedBytes.
+    new_data[size_] = 0;
+  }
+
+  capacity_ = new_capacity;
+  std::swap(new_data, data_);
+}
+
+void PaddedBytes::assign(const uint8_t* new_begin, const uint8_t* new_end) {
+  JXL_DASSERT(new_begin <= new_end);
+  const size_t new_size = static_cast<size_t>(new_end - new_begin);
+
+  // memcpy requires non-overlapping ranges, and resizing might invalidate the
+  // new range. Neither happens if the new range is completely to the left or
+  // right of the _allocated_ range (irrespective of size_).
+  const uint8_t* allocated_end = begin() + capacity_;
+  const bool outside = new_end <= begin() || new_begin >= allocated_end;
+  if (outside) {
+    resize(new_size);  // grow or shrink
+    memcpy(data(), new_begin, new_size);
+    return;
+  }
+
+  // There is overlap. The new size cannot be larger because we own the memory
+  // and the new range cannot include anything outside the allocated range.
+  JXL_ASSERT(new_size <= capacity_);
+
+  // memmove allows overlap and capacity_ is sufficient.
+  memmove(data(), new_begin, new_size);
+  size_ = new_size;  // shrink
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.h
new file mode 100644
index 0000000000..1840a6c936
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/padded_bytes.h
@@ -0,0 +1,195 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +#ifndef LIB_JXL_BASE_PADDED_BYTES_H_ +#define LIB_JXL_BASE_PADDED_BYTES_H_ + +// std::vector replacement with padding to reduce bounds checks in WriteBits + +#include +#include +#include // memcpy + +#include // max +#include +#include // swap + +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Provides a subset of the std::vector interface with some differences: +// - allows BitWriter to write 64 bits at a time without bounds checking; +// - ONLY zero-initializes the first byte (required by BitWriter); +// - ensures cache-line alignment. +class PaddedBytes { + public: + // Required for output params. + PaddedBytes() : size_(0), capacity_(0) {} + + explicit PaddedBytes(size_t size) : size_(size), capacity_(0) { + if (size != 0) IncreaseCapacityTo(size); + } + + PaddedBytes(size_t size, uint8_t value) : size_(size), capacity_(0) { + if (size != 0) { + IncreaseCapacityTo(size); + } + if (size_ != 0) { + memset(data(), value, size); + } + } + + PaddedBytes(const PaddedBytes& other) : size_(other.size_), capacity_(0) { + if (size_ != 0) IncreaseCapacityTo(size_); + if (data() != nullptr) memcpy(data(), other.data(), size_); + } + PaddedBytes& operator=(const PaddedBytes& other) { + // Self-assignment is safe. + resize(other.size()); + if (data() != nullptr) memmove(data(), other.data(), size_); + return *this; + } + + // default is not OK - need to set other.size_ to 0! 
+ PaddedBytes(PaddedBytes&& other) noexcept + : size_(other.size_), + capacity_(other.capacity_), + data_(std::move(other.data_)) { + other.size_ = other.capacity_ = 0; + } + PaddedBytes& operator=(PaddedBytes&& other) noexcept { + size_ = other.size_; + capacity_ = other.capacity_; + data_ = std::move(other.data_); + + if (&other != this) { + other.size_ = other.capacity_ = 0; + } + return *this; + } + + void swap(PaddedBytes& other) { + std::swap(size_, other.size_); + std::swap(capacity_, other.capacity_); + std::swap(data_, other.data_); + } + + void reserve(size_t capacity) { + if (capacity > capacity_) IncreaseCapacityTo(capacity); + } + + // NOTE: unlike vector, this does not initialize the new data! + // However, we guarantee that write_bits can safely append after + // the resize, as we zero-initialize the first new byte of data. + // If size < capacity(), does not invalidate the memory. + void resize(size_t size) { + if (size > capacity_) IncreaseCapacityTo(size); + size_ = (data() == nullptr) ? 0 : size; + } + + // resize(size) plus explicit initialization of the new data with `value`. + void resize(size_t size, uint8_t value) { + size_t old_size = size_; + resize(size); + if (size_ > old_size) { + memset(data() + old_size, value, size_ - old_size); + } + } + + // Amortized constant complexity due to exponential growth. + void push_back(uint8_t x) { + if (size_ == capacity_) { + IncreaseCapacityTo(capacity_ + 1); + if (data() == nullptr) return; + } + + data_[size_++] = x; + } + + size_t size() const { return size_; } + size_t capacity() const { return capacity_; } + + uint8_t* data() { return data_.get(); } + const uint8_t* data() const { return data_.get(); } + + // std::vector operations implemented in terms of the public interface above. 
+ + void clear() { resize(0); } + bool empty() const { return size() == 0; } + + void assign(std::initializer_list<uint8_t> il) { + resize(il.size()); // data() stays nullptr if empty or allocation failed + if (data() != nullptr) memcpy(data(), il.begin(), il.size()); + } + + // Replaces data() with [new_begin, new_end); potentially reallocates. + void assign(const uint8_t* new_begin, const uint8_t* new_end); + + uint8_t* begin() { return data(); } + const uint8_t* begin() const { return data(); } + uint8_t* end() { return begin() + size(); } + const uint8_t* end() const { return begin() + size(); } + + uint8_t& operator[](const size_t i) { + BoundsCheck(i); + return data()[i]; + } + const uint8_t& operator[](const size_t i) const { + BoundsCheck(i); + return data()[i]; + } + + uint8_t& back() { + JXL_ASSERT(size() != 0); + return data()[size() - 1]; + } + const uint8_t& back() const { + JXL_ASSERT(size() != 0); + return data()[size() - 1]; + } + + template <typename T> + void append(const T& other) { + append(reinterpret_cast<const uint8_t*>(other.data()), + reinterpret_cast<const uint8_t*>(other.data()) + other.size()); + } + + void append(const uint8_t* begin, const uint8_t* end) { + size_t old_size = size(); + resize(size() + (end - begin)); // data() may be nullptr afterwards + if (data() != nullptr) memcpy(data() + old_size, begin, end - begin); + } + + private: + void BoundsCheck(size_t i) const { + // <= is safe due to padding and required by BitWriter. + JXL_ASSERT(i <= size()); + } + + // Copies existing data to newly allocated "data_". If allocation fails, + // data() == nullptr and size_ = capacity_ = 0. + // The new capacity will be at least 1.5 times the old capacity. This ensures + // that we avoid quadratic behaviour. 
+ void IncreaseCapacityTo(size_t capacity); + + size_t size_; + size_t capacity_; + CacheAlignedUniquePtr data_; +}; + +template +static inline void Append(const T& s, PaddedBytes* out, + size_t* JXL_RESTRICT byte_pos) { + memcpy(out->data() + *byte_pos, s.data(), s.size()); + *byte_pos += s.size(); + JXL_CHECK(*byte_pos <= out->size()); +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_PADDED_BYTES_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/profiler.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/profiler.h new file mode 100644 index 0000000000..13f95d2b7a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/profiler.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_PROFILER_H_ +#define LIB_JXL_BASE_PROFILER_H_ + +// High precision, low overhead time measurements. Returns exact call counts and +// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). +// +// To use the profiler you must set the JPEGXL_ENABLE_PROFILER CMake flag, which +// defines PROFILER_ENABLED and links against the libjxl_profiler library. + +// If zero, this file has no effect and no measurements will be recorded. 
+#ifndef PROFILER_ENABLED +#define PROFILER_ENABLED 0 +#endif // PROFILER_ENABLED + +#if PROFILER_ENABLED + +#include "lib/profiler/profiler.h" + +#else // !PROFILER_ENABLED + +#define PROFILER_ZONE(name) +#define PROFILER_FUNC +#define PROFILER_PRINT_RESULTS() + +#endif // PROFILER_ENABLED + +#endif // LIB_JXL_BASE_PROFILER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/robust_statistics.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/robust_statistics.h new file mode 100644 index 0000000000..4e6445b7f9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/robust_statistics.h @@ -0,0 +1,357 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_ROBUST_STATISTICS_H_ +#define LIB_JXL_BASE_ROBUST_STATISTICS_H_ + +// Robust statistics: Mode, Median, MedianAbsoluteDeviation. + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +namespace jxl { + +template +T Geomean(const T* items, size_t count) { + double product = 1.0; + for (size_t i = 0; i < count; ++i) { + product *= items[i]; + } + return static_cast(std::pow(product, 1.0 / count)); +} + +// Round up for integers +template ::is_integer>::type* = nullptr> +inline T Half(T x) { + return (x + 1) / 2; +} + +// Mul is faster than div. +template ::is_integer>::type* = nullptr> +inline T Half(T x) { + return x * T(0.5); +} + +// Returns the median value. Side effect: values <= median will appear before, +// values >= median after the middle index. +// Guarantees average speed O(num_values). 
+template +T Median(T* samples, const size_t num_samples) { + HWY_ASSERT(num_samples != 0); + std::nth_element(samples, samples + num_samples / 2, samples + num_samples); + T result = samples[num_samples / 2]; + // If even size, find largest element in the partially sorted vector to + // use as second element to average with + if ((num_samples & 1) == 0) { + T biggest = *std::max_element(samples, samples + num_samples / 2); + result = Half(result + biggest); + } + return result; +} + +template +T Median(std::vector* samples) { + return Median(samples->data(), samples->size()); +} + +template +static inline T Median3(const T a, const T b, const T c) { + return std::max(std::min(a, b), std::min(c, std::max(a, b))); +} + +template +static inline T Median5(const T a, const T b, const T c, const T d, const T e) { + return Median3(e, std::max(std::min(a, b), std::min(c, d)), + std::min(std::max(a, b), std::max(c, d))); +} + +// Returns a robust measure of variability. +template +T MedianAbsoluteDeviation(const T* samples, const size_t num_samples, + const T median) { + HWY_ASSERT(num_samples != 0); + std::vector abs_deviations; + abs_deviations.reserve(num_samples); + for (size_t i = 0; i < num_samples; ++i) { + abs_deviations.push_back(std::abs(samples[i] - median)); + } + return Median(&abs_deviations); +} + +template +T MedianAbsoluteDeviation(const std::vector& samples, const T median) { + return MedianAbsoluteDeviation(samples.data(), samples.size(), median); +} + +// Half{Range/Sample}Mode are implementations of "Robust estimators of the mode +// and skewness of continuous data". The mode is less affected by outliers in +// highly-skewed distributions than the median. + +// Robust estimator of the mode for data given as sorted values. +// O(N*logN), N=num_values. +class HalfSampleMode { + public: + // Returns mode. "sorted" must be in ascending order. 
+ template + T operator()(const T* const HWY_RESTRICT sorted, + const size_t num_values) const { + int64_t center = num_values / 2; + int64_t width = num_values; + + // Zoom in on modal intervals of decreasing width. Stop before we reach + // width=1, i.e. single values, for which there is no "slope". + while (width > 2) { + // Round up so we can still reach the outer edges of odd widths. + width = Half(width); + + center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width); + } + + return sorted[center]; // mode := middle value in modal interval. + } + + private: + // Returns center of the densest region [c-radius, c+radius]. + template + static HWY_INLINE int64_t CenterOfIntervalWithMinSlope( + const T* HWY_RESTRICT sorted, const int64_t total_values, + const int64_t center, const int64_t width) { + const int64_t radius = Half(width); + + auto compute_slope = [radius, total_values, sorted]( + int64_t c, int64_t* actual_center = nullptr) { + // For symmetry, check 2*radius+1 values, i.e. [min, max]. + const int64_t min = std::max(c - radius, int64_t(0)); + const int64_t max = std::min(c + radius, total_values - 1); + HWY_ASSERT(min < max); + HWY_ASSERT(sorted[min] <= + sorted[max] + std::numeric_limits::epsilon()); + const float dx = max - min + 1; + const float slope = (sorted[max] - sorted[min]) / dx; + + if (actual_center != nullptr) { + // c may be out of bounds, so return center of the clamped bounds. + *actual_center = Half(min + max); + } + return slope; + }; + + // First find min_slope for all centers. + float min_slope = std::numeric_limits::max(); + for (int64_t c = center - radius; c <= center + radius; ++c) { + min_slope = std::min(min_slope, compute_slope(c)); + } + + // Candidates := centers with slope ~= min_slope. 
+ std::vector candidates; + for (int64_t c = center - radius; c <= center + radius; ++c) { + int64_t actual_center; + const float slope = compute_slope(c, &actual_center); + if (slope <= min_slope * 1.001f) { + candidates.push_back(actual_center); + } + } + + // Keep the median. + HWY_ASSERT(!candidates.empty()); + if (candidates.size() == 1) return candidates[0]; + return Median(&candidates); + } +}; + +// Robust estimator of the mode for data given as a CDF. +// O(N*logN), N=num_bins. +class HalfRangeMode { + public: + // Returns mode expressed as a histogram bin index. "cdf" must be weakly + // monotonically increasing, e.g. from std::partial_sum. + int operator()(const uint32_t* HWY_RESTRICT cdf, + const size_t num_bins) const { + int center = num_bins / 2; + int width = num_bins; + + // Zoom in on modal intervals of decreasing width. Stop before we reach + // width=1, i.e. original bins, because those are noisy. + while (width > 2) { + // Round up so we can still reach the outer edges of odd widths. + width = Half(width); + + center = CenterOfIntervalWithMaxDensity(cdf, num_bins, center, width); + } + + return center; // mode := midpoint of modal interval. + } + + private: + // Returns center of the densest interval [c-radius, c+radius]. + static HWY_INLINE int CenterOfIntervalWithMaxDensity( + const uint32_t* HWY_RESTRICT cdf, const int total_bins, const int center, + const int width) { + const int radius = Half(width); + + auto compute_density = [radius, total_bins, cdf]( + int c, int* actual_center = nullptr) { + // For symmetry, check 2*radius+1 bins, i.e. [min, max]. + const int min = std::max(c - radius, 1); // for -1 below + const int max = std::min(c + radius, total_bins - 1); + HWY_ASSERT(min < max); + HWY_ASSERT(cdf[min] <= cdf[max - 1]); + const int num_bins = max - min + 1; + // Sum over [min, max] == CDF(max) - CDF(min-1). 
+ const float density = float(cdf[max] - cdf[min - 1]) / num_bins; + + if (actual_center != nullptr) { + // c may be out of bounds, so take center of the clamped bounds. + *actual_center = Half(min + max); + } + return density; + }; + + // First find max_density for all centers. + float max_density = 0.0f; + for (int c = center - radius; c <= center + radius; ++c) { + max_density = std::max(max_density, compute_density(c)); + } + + // Candidates := centers with density ~= max_density. + std::vector candidates; + for (int c = center - radius; c <= center + radius; ++c) { + int actual_center; + const float density = compute_density(c, &actual_center); + if (density >= max_density * 0.999f) { + candidates.push_back(actual_center); + } + } + + // Keep the median. + HWY_ASSERT(!candidates.empty()); + if (candidates.size() == 1) return candidates[0]; + return Median(&candidates); + } +}; + +// Sorts integral values in ascending order. About 3x faster than std::sort for +// input distributions with very few unique values. +template +void CountingSort(T* begin, T* end) { + // Unique values and their frequency (similar to flat_map). + using Unique = std::pair; + std::vector unique; + for (const T* p = begin; p != end; ++p) { + const T value = *p; + const auto pos = + std::find_if(unique.begin(), unique.end(), + [value](const Unique& u) { return u.first == value; }); + if (pos == unique.end()) { + unique.push_back(std::make_pair(*p, 1)); + } else { + ++pos->second; + } + } + + // Sort in ascending order of value (pair.first). + std::sort(unique.begin(), unique.end()); + + // Write that many copies of each unique value to the array. 
+ T* HWY_RESTRICT p = begin; + for (const auto& value_count : unique) { + std::fill(p, p + value_count.second, value_count.first); + p += value_count.second; + } + HWY_ASSERT(p == end); +} + +struct Bivariate { + Bivariate(float x, float y) : x(x), y(y) {} + float x; + float y; +}; + +class Line { + public: + constexpr Line(const float slope, const float intercept) + : slope_(slope), intercept_(intercept) {} + + constexpr float slope() const { return slope_; } + constexpr float intercept() const { return intercept_; } + + // Robust line fit using Siegel's repeated-median algorithm. + explicit Line(const std::vector& points) { + const size_t N = points.size(); + // This straightforward N^2 implementation is OK for small N. + HWY_ASSERT(N < 10 * 1000); + + // One for every point i. + std::vector medians; + medians.reserve(N); + + // One for every j != i. Never cleared to avoid reallocation. + std::vector slopes(N - 1); + + for (size_t i = 0; i < N; ++i) { + // Index within slopes[] (avoids the hole where j == i). + size_t idx_slope = 0; + + for (size_t j = 0; j < N; ++j) { + if (j == i) continue; + + const float dy = points[j].y - points[i].y; + const float dx = points[j].x - points[i].x; + HWY_ASSERT(std::abs(dx) > 1E-7f); // x must be distinct + slopes[idx_slope++] = dy / dx; + } + HWY_ASSERT(idx_slope == N - 1); + + const float median = Median(&slopes); + medians.push_back(median); + } + + slope_ = Median(&medians); + + // Solve for intercept, overwriting medians[]. + for (size_t i = 0; i < N; ++i) { + medians[i] = points[i].y - slope_ * points[i].x; + } + intercept_ = Median(&medians); + } + + constexpr float operator()(float x) const { return x * slope_ + intercept_; } + + private: + float slope_; + float intercept_; +}; + +static inline void EvaluateQuality(const Line& line, + const std::vector& points, + float* HWY_RESTRICT max_l1, + float* HWY_RESTRICT median_abs_deviation) { + // For computing median_abs_deviation. 
+ std::vector abs_deviations; + abs_deviations.reserve(points.size()); + + *max_l1 = 0.0f; + for (const Bivariate& point : points) { + const float l1 = std::abs(line(point.x) - point.y); + *max_l1 = std::max(*max_l1, l1); + abs_deviations.push_back(l1); + } + + *median_abs_deviation = Median(&abs_deviations); +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_ROBUST_STATISTICS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/span.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/span.h new file mode 100644 index 0000000000..f9e59b3710 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/span.h @@ -0,0 +1,58 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_SPAN_H_ +#define LIB_JXL_BASE_SPAN_H_ + +// Span (array view) is a non-owning container that provides cheap "cut" +// operations and could be used as "ArrayLike" data source for PaddedBytes. 
+ +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { + +template +class Span { + public: + constexpr Span() noexcept : Span(nullptr, 0) {} + + constexpr Span(T* array, size_t length) noexcept + : ptr_(array), len_(length) {} + + template + explicit constexpr Span(T (&a)[N]) noexcept : Span(a, N) {} + + template + explicit constexpr Span(const ArrayLike& other) noexcept + : Span(reinterpret_cast(other.data()), other.size()) { + static_assert(sizeof(*other.data()) == sizeof(T), + "Incompatible type of source."); + } + + constexpr T* data() const noexcept { return ptr_; } + + constexpr size_t size() const noexcept { return len_; } + + constexpr T& operator[](size_t i) const noexcept { + // MSVC 2015 accepts this as constexpr, but not ptr_[i] + return *(data() + i); + } + + void remove_prefix(size_t n) noexcept { + JXL_ASSERT(size() >= n); + ptr_ += n; + len_ -= n; + } + + private: + T* ptr_; + size_t len_; +}; + +} // namespace jxl + +#endif // LIB_JXL_BASE_SPAN_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc new file mode 100644 index 0000000000..9a94345912 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.cc @@ -0,0 +1,46 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/status.h" + +#include +#include +#include + +#include + +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) +#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace +#endif // defined(*_SANITIZER) + +namespace jxl { + +bool Debug(const char* format, ...) 
{ + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + return false; +} + +bool Abort() { +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) + // If compiled with any sanitizer print a stack trace. This call doesn't crash + // the program, instead the trap below will crash it also allowing gdb to + // break there. + __sanitizer_print_stack_trace(); +#endif // defined(*_SANITIZER) + +#if JXL_COMPILER_MSVC + __debugbreak(); + abort(); +#else + __builtin_trap(); +#endif +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.h new file mode 100644 index 0000000000..e57e6b0632 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/status.h @@ -0,0 +1,299 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_STATUS_H_ +#define LIB_JXL_BASE_STATUS_H_ + +// Error handling: Status return type + helper macros. + +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +// Uncomment to abort when JXL_FAILURE or JXL_STATUS with a fatal error is +// reached: +// #define JXL_CRASH_ON_ERROR + +#ifndef JXL_ENABLE_ASSERT +#define JXL_ENABLE_ASSERT 1 +#endif + +#ifndef JXL_ENABLE_CHECK +#define JXL_ENABLE_CHECK 1 +#endif + +// Pass -DJXL_DEBUG_ON_ERROR at compile time to print debug messages when a +// function returns JXL_FAILURE or calls JXL_NOTIFY_ERROR. Note that this is +// irrelevant if you also pass -DJXL_CRASH_ON_ERROR. 
+#if defined(JXL_DEBUG_ON_ERROR) || defined(JXL_CRASH_ON_ERROR) +#undef JXL_DEBUG_ON_ERROR +#define JXL_DEBUG_ON_ERROR 1 +#else // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR +#ifdef NDEBUG +#define JXL_DEBUG_ON_ERROR 0 +#else // NDEBUG +#define JXL_DEBUG_ON_ERROR 1 +#endif // NDEBUG +#endif // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR + +// Pass -DJXL_DEBUG_ON_ALL_ERROR at compile time to print debug messages on +// all error (fatal and non-fatal) status. This implies JXL_DEBUG_ON_ERROR. +#if defined(JXL_DEBUG_ON_ALL_ERROR) +#undef JXL_DEBUG_ON_ALL_ERROR +#define JXL_DEBUG_ON_ALL_ERROR 1 +// JXL_DEBUG_ON_ALL_ERROR implies JXL_DEBUG_ON_ERROR too. +#undef JXL_DEBUG_ON_ERROR +#define JXL_DEBUG_ON_ERROR 1 +#else // JXL_DEBUG_ON_ALL_ERROR +#define JXL_DEBUG_ON_ALL_ERROR 0 +#endif // JXL_DEBUG_ON_ALL_ERROR + +// The Verbose level for the library +#ifndef JXL_DEBUG_V_LEVEL +#define JXL_DEBUG_V_LEVEL 0 +#endif // JXL_DEBUG_V_LEVEL + +// Pass -DJXL_DEBUG_ON_ABORT=0 to disable the debug messages on JXL_ASSERT, +// JXL_CHECK and JXL_ABORT. +#ifndef JXL_DEBUG_ON_ABORT +#define JXL_DEBUG_ON_ABORT 1 +#endif // JXL_DEBUG_ON_ABORT + +// Print a debug message on standard error. You should use the JXL_DEBUG macro +// instead of calling Debug directly. This function returns false, so it can be +// used as a return value in JXL_FAILURE. +JXL_FORMAT(1, 2) +bool Debug(const char* format, ...); + +// Print a debug message on standard error if "enabled" is true. "enabled" is +// normally a macro that evaluates to 0 or 1 at compile time, so the Debug +// function is never called and optimized out in release builds. Note that the +// arguments are compiled but not evaluated when enabled is false. 
The format +// string must be an explicit string in the call, for example: +// JXL_DEBUG(JXL_DEBUG_MYMODULE, "my module message: %d", some_var); +// Add a header at the top of your module's .cc or .h file (depending on whether +// you have JXL_DEBUG calls from the .h as well) like this: +// #ifndef JXL_DEBUG_MYMODULE +// #define JXL_DEBUG_MYMODULE 0 +// #endif // JXL_DEBUG_MYMODULE +#define JXL_DEBUG(enabled, format, ...) \ + do { \ + if (enabled) { \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, \ + ##__VA_ARGS__); \ + } \ + } while (0) + +// JXL_DEBUG version that prints the debug message if the global verbose level +// defined at compile time by JXL_DEBUG_V_LEVEL is greater than or equal to the +// passed level. +#define JXL_DEBUG_V(level, format, ...) \ + JXL_DEBUG(level <= JXL_DEBUG_V_LEVEL, format, ##__VA_ARGS__) + +// Warnings (via JXL_WARNING) are enabled by default in debug builds (opt and +// debug). +#ifdef JXL_DEBUG_WARNING +#undef JXL_DEBUG_WARNING +#define JXL_DEBUG_WARNING 1 +#else // JXL_DEBUG_WARNING +#ifdef NDEBUG +#define JXL_DEBUG_WARNING 0 +#else // NDEBUG +#define JXL_DEBUG_WARNING 1 +#endif // NDEBUG +#endif // JXL_DEBUG_WARNING +#define JXL_WARNING(format, ...) \ + JXL_DEBUG(JXL_DEBUG_WARNING, format, ##__VA_ARGS__) + +// Exits the program after printing a stack trace when possible. +JXL_NORETURN bool Abort(); + +// Exits the program after printing file/line plus a formatted string. +#define JXL_ABORT(format, ...) \ + ((JXL_DEBUG_ON_ABORT) && ::jxl::Debug(("%s:%d: JXL_ABORT: " format "\n"), \ + __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort()) + +// Does not guarantee running the code, use only for debug mode checks. 
+#if JXL_ENABLE_ASSERT +#define JXL_ASSERT(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_ASSERT: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_ASSERT(condition) \ + do { \ + } while (0) +#endif + +// Define JXL_IS_DEBUG_BUILD that denotes asan, msan and other debug builds, +// but not opt or release. +#ifndef JXL_IS_DEBUG_BUILD +#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \ + defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER) || \ + defined(__clang_analyzer__) +#define JXL_IS_DEBUG_BUILD 1 +#else +#define JXL_IS_DEBUG_BUILD 0 +#endif +#endif // JXL_IS_DEBUG_BUILD + +// Same as above, but only runs in debug builds (builds where NDEBUG is not +// defined). This is useful for slower asserts that we want to run more rarely +// than usual. These will run on asan, msan and other debug builds, but not in +// opt or release. +#if JXL_IS_DEBUG_BUILD +#define JXL_DASSERT(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_DASSERT: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_DASSERT(condition) \ + do { \ + } while (0) +#endif + +// Always runs the condition, so can be used for non-debug calls. +#if JXL_ENABLE_CHECK +#define JXL_CHECK(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_CHECK: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_CHECK(condition) \ + do { \ + (void)(condition); \ + } while (0) +#endif + +// A jxl::Status value from a StatusCode or Status which prints a debug message +// when enabled. +#define JXL_STATUS(status, format, ...) \ + ::jxl::StatusMessage(::jxl::Status(status), "%s:%d: " format "\n", __FILE__, \ + __LINE__, ##__VA_ARGS__) + +// Notify of an error but discard the resulting Status value. This is only +// useful for debug builds or when building with JXL_CRASH_ON_ERROR. +#define JXL_NOTIFY_ERROR(format, ...) 
\ + (void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_ERROR: " format, \ + ##__VA_ARGS__) + +// An error Status with a message. The JXL_STATUS() macro will return a Status +// object with a kGenericError code, but the comma operator helps with +// clang-tidy inference and potentially with optimizations. +#define JXL_FAILURE(format, ...) \ + ((void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_FAILURE: " format, \ + ##__VA_ARGS__), \ + ::jxl::Status(::jxl::StatusCode::kGenericError)) + +// Always evaluates the status exactly once, so can be used for non-debug calls. +// Returns from the current context if the passed Status expression is an error +// (fatal or non-fatal). The return value is the passed Status. +#define JXL_RETURN_IF_ERROR(status) \ + do { \ + ::jxl::Status jxl_return_if_error_status = (status); \ + if (!jxl_return_if_error_status) { \ + (void)::jxl::StatusMessage( \ + jxl_return_if_error_status, \ + "%s:%d: JXL_RETURN_IF_ERROR code=%d: %s\n", __FILE__, __LINE__, \ + static_cast(jxl_return_if_error_status.code()), #status); \ + return jxl_return_if_error_status; \ + } \ + } while (0) + +// As above, but without calling StatusMessage. Intended for bundles (see +// fields.h), which have numerous call sites (-> relevant for code size) and do +// not want to generate excessive messages when decoding partial headers. +#define JXL_QUIET_RETURN_IF_ERROR(status) \ + do { \ + ::jxl::Status jxl_return_if_error_status = (status); \ + if (!jxl_return_if_error_status) { \ + return jxl_return_if_error_status; \ + } \ + } while (0) + +enum class StatusCode : int32_t { + // Non-fatal errors (negative values). + kNotEnoughBytes = -1, + + // The only non-error status code. + kOk = 0, + + // Fatal-errors (positive values) + kGenericError = 1, +}; + +// Drop-in replacement for bool that raises compiler warnings if not used +// after being returned from a function. Example: +// Status LoadFile(...) 
{ return true; } is more compact than +// bool JXL_MUST_USE_RESULT LoadFile(...) { return true; } +// In case of error, the status can carry an extra error code in its value which +// is split between fatal and non-fatal error codes. +class JXL_MUST_USE_RESULT Status { + public: + // We want implicit constructor from bool to allow returning "true" or "false" + // on a function when using Status. "true" means kOk while "false" means a + // generic fatal error. + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr Status(bool ok) + : code_(ok ? StatusCode::kOk : StatusCode::kGenericError) {} + + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr Status(StatusCode code) : code_(code) {} + + // We also want implicit cast to bool to check for return values of functions. + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr operator bool() const { return code_ == StatusCode::kOk; } + + constexpr StatusCode code() const { return code_; } + + // Returns whether the status code is a fatal error. + constexpr bool IsFatalError() const { + return static_cast(code_) > 0; + } + + private: + StatusCode code_; +}; + +// Helper function to create a Status and print the debug message or abort when +// needed. +inline JXL_FORMAT(2, 3) Status + StatusMessage(const Status status, const char* format, ...) { + // This block will be optimized out when JXL_DEBUG_ON_ERROR and + // JXL_DEBUG_ON_ALL_ERROR are both disabled. + if ((JXL_DEBUG_ON_ERROR && status.IsFatalError()) || + (JXL_DEBUG_ON_ALL_ERROR && !status)) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + } +#ifdef JXL_CRASH_ON_ERROR + // JXL_CRASH_ON_ERROR means to Abort() only on fatal errors. 
+ if (status.IsFatalError()) { + Abort(); + } +#endif // JXL_CRASH_ON_ERROR + return status; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_STATUS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/thread_pool_internal.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/thread_pool_internal.h new file mode 100644 index 0000000000..6e23a335a7 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/base/thread_pool_internal.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_THREAD_POOL_INTERNAL_H_ +#define LIB_JXL_BASE_THREAD_POOL_INTERNAL_H_ + +#include + +#include + +#include "jxl/parallel_runner.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/threads/thread_parallel_runner_internal.h" + +namespace jxl { + +// Helper class to pass an internal ThreadPool-like object using threads. This +// is only suitable for tests or tools that access the internal API of JPEG XL. +// In other cases the caller will provide a JxlParallelRunner() for handling +// this. This class uses jpegxl::ThreadParallelRunner (from jpegxl_threads +// library). For interface details check jpegxl::ThreadParallelRunner. +class ThreadPoolInternal : public ThreadPool { + public: + // Starts the given number of worker threads and blocks until they are ready. + // "num_worker_threads" defaults to one per hyperthread. If zero, all tasks + // run on the main thread. 
+ explicit ThreadPoolInternal( + int num_worker_threads = std::thread::hardware_concurrency()) + : ThreadPool(&jpegxl::ThreadParallelRunner::Runner, + static_cast(&runner_)), + runner_(num_worker_threads) {} + + ThreadPoolInternal(const ThreadPoolInternal&) = delete; + ThreadPoolInternal& operator&(const ThreadPoolInternal&) = delete; + + size_t NumThreads() const { return runner_.NumThreads(); } + size_t NumWorkerThreads() const { return runner_.NumWorkerThreads(); } + + template + void RunOnEachThread(const Func& func) { + runner_.RunOnEachThread(func); + } + + private: + jpegxl::ThreadParallelRunner runner_; +}; + +} // namespace jxl + +#endif // LIB_JXL_BASE_THREAD_POOL_INTERNAL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/bit_reader_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/bit_reader_test.cc new file mode 100644 index 0000000000..c962853190 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/bit_reader_test.cc @@ -0,0 +1,260 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include + +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { +namespace { + +TEST(BitReaderTest, ExtendsWithZeroes) { + for (size_t size = 4; size < 32; ++size) { + std::vector data(size, 0xff); + + for (size_t n_bytes = 0; n_bytes < size; n_bytes++) { + BitReader br(Span(data.data(), n_bytes)); + // Read all the bits + for (size_t i = 0; i < n_bytes * kBitsPerByte; i++) { + ASSERT_EQ(br.ReadBits(1), 1) << "n_bytes=" << n_bytes << " i=" << i; + } + + // PEEK more than the declared size - all will be zero. Cannot consume. + for (size_t i = 0; i < BitReader::kMaxBitsPerCall; i++) { + ASSERT_EQ(br.PeekBits(i), 0) + << "size=" << size << "n_bytes=" << n_bytes << " i=" << i; + } + + EXPECT_TRUE(br.Close()); + } + } +} + +struct Symbol { + uint32_t num_bits; + uint32_t value; +}; + +// Reading from output gives the same values. 
+TEST(BitReaderTest, TestRoundTrip) { + ThreadPoolInternal pool(8); + pool.Run(0, 1000, ThreadPool::SkipInit(), + [](const int task, const int /* thread */) { + constexpr size_t kMaxBits = 8000; + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + + std::vector symbols; + symbols.reserve(1000); + + std::mt19937 rng(55537 + 129 * task); + std::uniform_int_distribution<> dist(1, 32); // closed interval + + for (;;) { + const uint32_t num_bits = dist(rng); + if (writer.BitsWritten() + num_bits > kMaxBits) break; + const uint32_t value = rng() >> (32 - num_bits); + symbols.push_back({num_bits, value}); + writer.Write(num_bits, value); + } + + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + BitReader reader(writer.GetSpan()); + for (const Symbol& s : symbols) { + EXPECT_EQ(s.value, reader.ReadBits(s.num_bits)); + } + EXPECT_TRUE(reader.Close()); + }); +} + +// SkipBits is the same as reading that many bits. +TEST(BitReaderTest, TestSkip) { + ThreadPoolInternal pool(8); + pool.Run( + 0, 96, ThreadPool::SkipInit(), + [](const int task, const int /* thread */) { + constexpr size_t kSize = 100; + + for (size_t skip = 0; skip < 128; ++skip) { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kSize * kBitsPerByte); + // Start with "task" 1-bits. 
+ for (int i = 0; i < task; ++i) { + writer.Write(1, 1); + } + + // Write 0-bits that we will skip over + for (size_t i = 0; i < skip; ++i) { + writer.Write(1, 0); + } + + // Write terminator bits '101' + writer.Write(3, 5); + EXPECT_EQ(task + skip + 3, writer.BitsWritten()); + writer.ZeroPadToByte(); + AuxOut aux_out; + ReclaimAndCharge(&writer, &allotment, 0, &aux_out); + EXPECT_LT(aux_out.layers[0].total_bits, kSize * 8); + + BitReader reader1(writer.GetSpan()); + BitReader reader2(writer.GetSpan()); + // Verify initial 1-bits + for (int i = 0; i < task; ++i) { + EXPECT_EQ(1, reader1.ReadBits(1)); + EXPECT_EQ(1, reader2.ReadBits(1)); + } + + // SkipBits or manually read "skip" bits + reader1.SkipBits(skip); + for (size_t i = 0; i < skip; ++i) { + EXPECT_EQ(0, reader2.ReadBits(1)) << " skip=" << skip << " i=" << i; + } + EXPECT_EQ(reader1.TotalBitsConsumed(), reader2.TotalBitsConsumed()); + + // Ensure both readers see the terminator bits. + EXPECT_EQ(5, reader1.ReadBits(3)); + EXPECT_EQ(5, reader2.ReadBits(3)); + + EXPECT_TRUE(reader1.Close()); + EXPECT_TRUE(reader2.Close()); + } + }); +} + +// Verifies byte order and different groupings of bits. 
+TEST(BitReaderTest, TestOrder) { + constexpr size_t kMaxBits = 16; + + // u(1) - bits written into LSBs of first byte + { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + for (size_t i = 0; i < 5; ++i) { + writer.Write(1, 1); + } + for (size_t i = 0; i < 5; ++i) { + writer.Write(1, 0); + } + for (size_t i = 0; i < 6; ++i) { + writer.Write(1, 1); + } + + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + BitReader reader(writer.GetSpan()); + EXPECT_EQ(0x1F, reader.ReadFixedBits<8>()); + EXPECT_EQ(0xFC, reader.ReadFixedBits<8>()); + EXPECT_TRUE(reader.Close()); + } + + // u(8) - get bytes in the same order + { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + writer.Write(8, 0xF8); + writer.Write(8, 0x3F); + + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + BitReader reader(writer.GetSpan()); + EXPECT_EQ(0xF8, reader.ReadFixedBits<8>()); + EXPECT_EQ(0x3F, reader.ReadFixedBits<8>()); + EXPECT_TRUE(reader.Close()); + } + + // u(16) - little-endian bytes + { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + writer.Write(16, 0xF83F); + + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + BitReader reader(writer.GetSpan()); + EXPECT_EQ(0x3F, reader.ReadFixedBits<8>()); + EXPECT_EQ(0xF8, reader.ReadFixedBits<8>()); + EXPECT_TRUE(reader.Close()); + } + + // Non-byte-aligned, mixed sizes + { + BitWriter writer; + BitWriter::Allotment allotment(&writer, kMaxBits); + writer.Write(1, 1); + writer.Write(3, 6); + writer.Write(8, 0xDB); + writer.Write(4, 8); + + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + BitReader reader(writer.GetSpan()); + EXPECT_EQ(0xBD, reader.ReadFixedBits<8>()); + EXPECT_EQ(0x8D, reader.ReadFixedBits<8>()); + EXPECT_TRUE(reader.Close()); + } +} + +TEST(BitReaderTest, TotalCountersTest) { + uint8_t buf[8] = {1, 2, 3, 4}; + BitReader reader(Span(buf, 
sizeof(buf))); + + EXPECT_EQ(sizeof(buf), reader.TotalBytes()); + EXPECT_EQ(0, reader.TotalBitsConsumed()); + reader.ReadFixedBits<1>(); + EXPECT_EQ(1, reader.TotalBitsConsumed()); + + reader.ReadFixedBits<10>(); + EXPECT_EQ(11, reader.TotalBitsConsumed()); + + reader.ReadFixedBits<4>(); + EXPECT_EQ(15, reader.TotalBitsConsumed()); + + reader.ReadFixedBits<1>(); + EXPECT_EQ(16, reader.TotalBitsConsumed()); + + reader.ReadFixedBits<16>(); + EXPECT_EQ(32, reader.TotalBitsConsumed()); + + EXPECT_TRUE(reader.Close()); +} + +TEST(BitReaderTest, MoveTest) { + uint8_t buf[8] = {1, 2, 3, 4}; + BitReader reader2; + { + BitReader reader1(Span(buf, sizeof(buf))); + + EXPECT_EQ(0, reader1.TotalBitsConsumed()); + reader1.ReadFixedBits<16>(); + EXPECT_EQ(16, reader1.TotalBitsConsumed()); + + reader2 = std::move(reader1); + // From this point reader1 is invalid, but can continue to access reader2 + // and we don't need to call Close() on reader1. + } + + EXPECT_EQ(16, reader2.TotalBitsConsumed()); + EXPECT_EQ(3U, reader2.ReadFixedBits<8>()); + EXPECT_EQ(24, reader2.TotalBitsConsumed()); + + EXPECT_TRUE(reader2.Close()); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/bits_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/bits_test.cc new file mode 100644 index 0000000000..9c109cb772 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/bits_test.cc @@ -0,0 +1,79 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/bits.h" + +#include "gtest/gtest.h" + +namespace jxl { +namespace { + +TEST(BitsTest, TestNumZeroBits) { + // Zero input is well-defined. 
+ EXPECT_EQ(32, Num0BitsAboveMS1Bit(0u)); + EXPECT_EQ(64, Num0BitsAboveMS1Bit(0ull)); + EXPECT_EQ(32, Num0BitsBelowLS1Bit(0u)); + EXPECT_EQ(64, Num0BitsBelowLS1Bit(0ull)); + + EXPECT_EQ(31, Num0BitsAboveMS1Bit(1u)); + EXPECT_EQ(30, Num0BitsAboveMS1Bit(2u)); + EXPECT_EQ(63, Num0BitsAboveMS1Bit(1ull)); + EXPECT_EQ(62, Num0BitsAboveMS1Bit(2ull)); + + EXPECT_EQ(0, Num0BitsBelowLS1Bit(1u)); + EXPECT_EQ(0, Num0BitsBelowLS1Bit(1ull)); + EXPECT_EQ(1, Num0BitsBelowLS1Bit(2u)); + EXPECT_EQ(1, Num0BitsBelowLS1Bit(2ull)); + + EXPECT_EQ(0, Num0BitsAboveMS1Bit(0x80000000u)); + EXPECT_EQ(0, Num0BitsAboveMS1Bit(0x8000000000000000ull)); + EXPECT_EQ(31, Num0BitsBelowLS1Bit(0x80000000u)); + EXPECT_EQ(63, Num0BitsBelowLS1Bit(0x8000000000000000ull)); +} + +TEST(BitsTest, TestFloorLog2) { + // for input = [1, 7] + const int expected[7] = {0, 1, 1, 2, 2, 2, 2}; + for (uint32_t i = 1; i <= 7; ++i) { + EXPECT_EQ(expected[i - 1], FloorLog2Nonzero(i)) << " " << i; + EXPECT_EQ(expected[i - 1], FloorLog2Nonzero(uint64_t(i))) << " " << i; + } + + EXPECT_EQ(31, FloorLog2Nonzero(0x80000000u)); + EXPECT_EQ(31, FloorLog2Nonzero(0x80000001u)); + EXPECT_EQ(31, FloorLog2Nonzero(0xFFFFFFFFu)); + + EXPECT_EQ(31, FloorLog2Nonzero(0x80000000ull)); + EXPECT_EQ(31, FloorLog2Nonzero(0x80000001ull)); + EXPECT_EQ(31, FloorLog2Nonzero(0xFFFFFFFFull)); + + EXPECT_EQ(63, FloorLog2Nonzero(0x8000000000000000ull)); + EXPECT_EQ(63, FloorLog2Nonzero(0x8000000000000001ull)); + EXPECT_EQ(63, FloorLog2Nonzero(0xFFFFFFFFFFFFFFFFull)); +} + +TEST(BitsTest, TestCeilLog2) { + // for input = [1, 7] + const int expected[7] = {0, 1, 2, 2, 3, 3, 3}; + for (uint32_t i = 1; i <= 7; ++i) { + EXPECT_EQ(expected[i - 1], CeilLog2Nonzero(i)) << " " << i; + EXPECT_EQ(expected[i - 1], CeilLog2Nonzero(uint64_t(i))) << " " << i; + } + + EXPECT_EQ(31, CeilLog2Nonzero(0x80000000u)); + EXPECT_EQ(32, CeilLog2Nonzero(0x80000001u)); + EXPECT_EQ(32, CeilLog2Nonzero(0xFFFFFFFFu)); + + EXPECT_EQ(31, CeilLog2Nonzero(0x80000000ull)); + EXPECT_EQ(32, 
CeilLog2Nonzero(0x80000001ull)); + EXPECT_EQ(32, CeilLog2Nonzero(0xFFFFFFFFull)); + + EXPECT_EQ(63, CeilLog2Nonzero(0x8000000000000000ull)); + EXPECT_EQ(64, CeilLog2Nonzero(0x8000000000000001ull)); + EXPECT_EQ(64, CeilLog2Nonzero(0xFFFFFFFFFFFFFFFFull)); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc new file mode 100644 index 0000000000..6cf2502bf9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending.cc @@ -0,0 +1,383 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/blending.h" + +#include "lib/jxl/alpha.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +bool ImageBlender::NeedsBlending(PassesDecoderState* dec_state) { + const PassesSharedState& state = *dec_state->shared; + if (!(state.frame_header.frame_type == FrameType::kRegularFrame || + state.frame_header.frame_type == FrameType::kSkipProgressive)) { + return false; + } + const auto& info = state.frame_header.blending_info; + bool replace_all = (info.mode == BlendMode::kReplace); + for (const auto& ec_i : state.frame_header.extra_channel_blending_info) { + if (ec_i.mode != BlendMode::kReplace) { + replace_all = false; + } + } + // Replace the full frame: nothing to do. 
+ if (!state.frame_header.custom_size_or_origin && replace_all) { + return false; + } + return true; +} + +Status ImageBlender::PrepareBlending( + PassesDecoderState* dec_state, FrameOrigin foreground_origin, + size_t foreground_xsize, size_t foreground_ysize, + const std::vector* extra_channel_info, + const ColorEncoding& frame_color_encoding, const Rect& frame_rect, + Image3F* output, const Rect& output_rect, + std::vector* output_extra_channels, + std::vector output_extra_channels_rects) { + const PassesSharedState& state = *dec_state->shared; + info_ = state.frame_header.blending_info; + + ec_info_ = &state.frame_header.extra_channel_blending_info; + + extra_channel_info_ = extra_channel_info; + output_ = output; + output_rect_ = output_rect; + output_extra_channels_ = output_extra_channels; + output_extra_channels_rects_ = std::move(output_extra_channels_rects); + + size_t image_xsize = state.frame_header.nonserialized_metadata->xsize(); + size_t image_ysize = state.frame_header.nonserialized_metadata->ysize(); + + // the rect in the canvas that needs to be updated + cropbox_ = frame_rect; + // the rect of this frame that overlaps with the canvas + overlap_ = cropbox_; + o_ = foreground_origin; + o_.x0 -= frame_rect.x0(); + o_.y0 -= frame_rect.y0(); + int x0 = (o_.x0 >= 0 ? o_.x0 : 0); + int y0 = (o_.y0 >= 0 ? o_.y0 : 0); + int xsize = foreground_xsize; + if (o_.x0 < 0) xsize += o_.x0; + int ysize = foreground_ysize; + if (o_.y0 < 0) ysize += o_.y0; + xsize = Clamp1(xsize, 0, (int)cropbox_.xsize() - x0); + ysize = Clamp1(ysize, 0, (int)cropbox_.ysize() - y0); + cropbox_ = Rect(x0, y0, xsize, ysize); + x0 = (o_.x0 < 0 ? -o_.x0 : 0); + y0 = (o_.y0 < 0 ? -o_.y0 : 0); + overlap_ = Rect(x0, y0, xsize, ysize); + + // Image to write to. 
+ ImageBundle& bg = *state.reference_frames[info_.source].frame; + if (bg.xsize() == 0 && bg.ysize() == 0) { + // there is no background, assume it to be all zeroes + ImageBundle empty(&state.metadata->m); + Image3F color(image_xsize, image_ysize); + ZeroFillImage(&color); + empty.SetFromImage(std::move(color), frame_color_encoding); + if (!output_extra_channels_->empty()) { + std::vector ec; + for (size_t i = 0; i < output_extra_channels_->size(); ++i) { + ImageF eci(image_xsize, image_ysize); + ZeroFillImage(&eci); + ec.push_back(std::move(eci)); + } + empty.SetExtraChannels(std::move(ec)); + } + bg = std::move(empty); + } else if (state.reference_frames[info_.source].ib_is_in_xyb) { + return JXL_FAILURE( + "Trying to blend XYB reference frame %i and non-XYB frame", + info_.source); + } + + if (bg.xsize() < image_xsize || bg.ysize() < image_ysize || + bg.origin.x0 != 0 || bg.origin.y0 != 0) { + return JXL_FAILURE("Trying to use a %zux%zu crop as a background", + bg.xsize(), bg.ysize()); + } + if (state.metadata->m.xyb_encoded) { + if (!dec_state->output_encoding_info.color_encoding_is_original) { + return JXL_FAILURE("Blending in unsupported color space"); + } + } + + if (!overlap_.IsInside(Rect(0, 0, foreground_xsize, foreground_ysize))) { + return JXL_FAILURE("Trying to use a %zux%zu crop as a foreground", + foreground_xsize, foreground_ysize); + } + + if (!cropbox_.IsInside(bg)) { + return JXL_FAILURE( + "Trying blend %zux%zu to (%zu,%zu), but background is %zux%zu", + cropbox_.xsize(), cropbox_.ysize(), cropbox_.x0(), cropbox_.y0(), + bg.xsize(), bg.ysize()); + } + + CopyImageTo(frame_rect, *bg.color(), output_rect, output); + for (size_t i = 0; i < ec_info_->size(); ++i) { + const auto& eci = (*ec_info_)[i]; + const auto& src = *state.reference_frames[eci.source].frame; + if (src.xsize() == 0 && src.ysize() == 0) { + ZeroFillPlane(&(*output_extra_channels_)[i], + output_extra_channels_rects_[i]); + } else { + if (src.extra_channels()[i].xsize() < image_xsize 
|| + src.extra_channels()[i].ysize() < image_ysize || src.origin.x0 != 0 || + src.origin.y0 != 0) { + return JXL_FAILURE( + "Invalid size %zux%zu or origin %+d%+d for extra channel %zu of " + "reference frame %zu, expected at least %zux%zu+0+0", + src.extra_channels()[i].xsize(), src.extra_channels()[i].ysize(), + static_cast(src.origin.x0), static_cast(src.origin.y0), i, + static_cast(eci.source), image_xsize, image_ysize); + } + CopyImageTo(frame_rect, src.extra_channels()[i], + output_extra_channels_rects_[i], + &(*output_extra_channels_)[i]); + } + } + + return true; +} + +ImageBlender::RectBlender ImageBlender::PrepareRect( + const Rect& rect, const Image3F& foreground, + const std::vector& extra_channels, const Rect& input_rect) const { + JXL_DASSERT(rect.xsize() == input_rect.xsize()); + JXL_DASSERT(rect.ysize() == input_rect.ysize()); + JXL_DASSERT(input_rect.IsInside(foreground)); + + RectBlender blender(false); + blender.extra_channel_info_ = extra_channel_info_; + + blender.current_overlap_ = rect.Intersection(overlap_); + if (blender.current_overlap_.xsize() == 0 || + blender.current_overlap_.ysize() == 0) { + blender.done_ = true; + return blender; + } + + blender.current_cropbox_ = + Rect(o_.x0 + blender.current_overlap_.x0(), + o_.y0 + blender.current_overlap_.y0(), + blender.current_overlap_.xsize(), blender.current_overlap_.ysize()); + + // Turn current_overlap_ from being relative to the full foreground to being + // relative to the rect or input_rect. + blender.current_overlap_ = + Rect(blender.current_overlap_.x0() - rect.x0(), + blender.current_overlap_.y0() - rect.y0(), + blender.current_overlap_.xsize(), blender.current_overlap_.ysize()); + + // And this one is relative to the `foreground` subimage. 
+ const Rect input_overlap(blender.current_overlap_.x0() + input_rect.x0(), + blender.current_overlap_.y0() + input_rect.y0(), + blender.current_overlap_.xsize(), + blender.current_overlap_.ysize()); + + blender.blending_info_.resize(extra_channels.size() + 1); + auto make_blending = [&](const BlendingInfo& info, PatchBlending* pb) { + pb->alpha_channel = info.alpha_channel; + pb->clamp = info.clamp; + switch (info.mode) { + case BlendMode::kReplace: { + pb->mode = PatchBlendMode::kReplace; + break; + } + case BlendMode::kAdd: { + pb->mode = PatchBlendMode::kAdd; + break; + } + case BlendMode::kMul: { + pb->mode = PatchBlendMode::kMul; + break; + } + case BlendMode::kBlend: { + pb->mode = PatchBlendMode::kBlendAbove; + break; + } + case BlendMode::kAlphaWeightedAdd: { + pb->mode = PatchBlendMode::kAlphaWeightedAddAbove; + break; + } + default: { + JXL_ABORT("Invalid blend mode"); // should have failed to decode + } + } + }; + make_blending(info_, &blender.blending_info_[0]); + for (size_t i = 0; i < extra_channels.size(); i++) { + make_blending((*ec_info_)[i], &blender.blending_info_[1 + i]); + } + + Rect cropbox_row = blender.current_cropbox_.Line(0); + Rect overlap_row = input_overlap.Line(0); + const auto num_ptrs = 3 + extra_channels.size(); + blender.fg_ptrs_.reserve(num_ptrs); + blender.fg_strides_.reserve(num_ptrs); + blender.bg_ptrs_.reserve(num_ptrs); + blender.bg_strides_.reserve(num_ptrs); + for (size_t c = 0; c < 3; c++) { + blender.fg_ptrs_.push_back(overlap_row.ConstPlaneRow(foreground, c, 0)); + blender.fg_strides_.push_back(foreground.PixelsPerRow()); + blender.bg_ptrs_.push_back( + cropbox_row.Translate(output_rect_.x0(), output_rect_.y0()) + .PlaneRow(output_, c, 0)); + blender.bg_strides_.push_back(output_->PixelsPerRow()); + } + for (size_t c = 0; c < extra_channels.size(); c++) { + blender.fg_ptrs_.push_back(overlap_row.ConstRow(extra_channels[c], 0)); + blender.fg_strides_.push_back(extra_channels[c].PixelsPerRow()); + 
blender.bg_ptrs_.push_back( + cropbox_row + .Translate(output_extra_channels_rects_[c].x0(), + output_extra_channels_rects_[c].y0()) + .Row(&(*output_extra_channels_)[c], 0)); + blender.bg_strides_.push_back((*output_extra_channels_)[c].PixelsPerRow()); + } + + return blender; +} + +Status PerformBlending( + const float* const* bg, const float* const* fg, float* const* out, + size_t xsize, const PatchBlending& color_blending, + const PatchBlending* ec_blending, + const std::vector& extra_channel_info) { + bool has_alpha = false; + size_t num_ec = extra_channel_info.size(); + for (size_t i = 0; i < num_ec; i++) { + if (extra_channel_info[i].type == jxl::ExtraChannel::kAlpha) { + has_alpha = true; + break; + } + } + ImageF tmp(xsize, 3 + num_ec); + // Blend extra channels first so that we use the pre-blending alpha. + for (size_t i = 0; i < num_ec; i++) { + if (ec_blending[i].mode == PatchBlendMode::kAdd) { + for (size_t x = 0; x < xsize; x++) { + tmp.Row(3 + i)[x] = bg[3 + i][x] + fg[3 + i][x]; + } + } else if (ec_blending[i].mode == PatchBlendMode::kBlendAbove) { + size_t alpha = ec_blending[i].alpha_channel; + bool is_premultiplied = extra_channel_info[alpha].alpha_associated; + PerformAlphaBlending(bg[3 + i], bg[3 + alpha], fg[3 + i], fg[3 + alpha], + tmp.Row(3 + i), xsize, is_premultiplied, + ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kBlendBelow) { + size_t alpha = ec_blending[i].alpha_channel; + bool is_premultiplied = extra_channel_info[alpha].alpha_associated; + PerformAlphaBlending(fg[3 + i], fg[3 + alpha], bg[3 + i], bg[3 + alpha], + tmp.Row(3 + i), xsize, is_premultiplied, + ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kAlphaWeightedAddAbove) { + size_t alpha = ec_blending[i].alpha_channel; + PerformAlphaWeightedAdd(bg[3 + i], fg[3 + i], fg[3 + alpha], + tmp.Row(3 + i), xsize, ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kAlphaWeightedAddBelow) { + size_t alpha = 
ec_blending[i].alpha_channel; + PerformAlphaWeightedAdd(fg[3 + i], bg[3 + i], bg[3 + alpha], + tmp.Row(3 + i), xsize, ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kMul) { + PerformMulBlending(bg[3 + i], fg[3 + i], tmp.Row(3 + i), xsize, + ec_blending[i].clamp); + } else if (ec_blending[i].mode == PatchBlendMode::kReplace) { + memcpy(tmp.Row(3 + i), fg[3 + i], xsize * sizeof(**fg)); + } else if (ec_blending[i].mode == PatchBlendMode::kNone) { + memcpy(tmp.Row(3 + i), bg[3 + i], xsize * sizeof(**fg)); + } else { + JXL_ABORT("Unreachable"); + } + } + size_t alpha = color_blending.alpha_channel; + + if (color_blending.mode == PatchBlendMode::kAdd || + (color_blending.mode == PatchBlendMode::kAlphaWeightedAddAbove && + !has_alpha) || + (color_blending.mode == PatchBlendMode::kAlphaWeightedAddBelow && + !has_alpha)) { + for (int p = 0; p < 3; p++) { + float* out = tmp.Row(p); + for (size_t x = 0; x < xsize; x++) { + out[x] = bg[p][x] + fg[p][x]; + } + } + } else if (color_blending.mode == PatchBlendMode::kBlendAbove + // blend without alpha is just replace + && has_alpha) { + bool is_premultiplied = extra_channel_info[alpha].alpha_associated; + PerformAlphaBlending( + {bg[0], bg[1], bg[2], bg[3 + alpha]}, + {fg[0], fg[1], fg[2], fg[3 + alpha]}, + {tmp.Row(0), tmp.Row(1), tmp.Row(2), tmp.Row(3 + alpha)}, xsize, + is_premultiplied, color_blending.clamp); + } else if (color_blending.mode == PatchBlendMode::kBlendBelow + // blend without alpha is just replace + && has_alpha) { + bool is_premultiplied = extra_channel_info[alpha].alpha_associated; + PerformAlphaBlending( + {fg[0], fg[1], fg[2], fg[3 + alpha]}, + {bg[0], bg[1], bg[2], bg[3 + alpha]}, + {tmp.Row(0), tmp.Row(1), tmp.Row(2), tmp.Row(3 + alpha)}, xsize, + is_premultiplied, color_blending.clamp); + } else if (color_blending.mode == PatchBlendMode::kAlphaWeightedAddAbove) { + JXL_DASSERT(has_alpha); + for (size_t c = 0; c < 3; c++) { + PerformAlphaWeightedAdd(bg[c], fg[c], fg[3 + alpha], 
tmp.Row(c), xsize, + color_blending.clamp); + } + } else if (color_blending.mode == PatchBlendMode::kAlphaWeightedAddBelow) { + JXL_DASSERT(has_alpha); + for (size_t c = 0; c < 3; c++) { + PerformAlphaWeightedAdd(fg[c], bg[c], bg[3 + alpha], tmp.Row(c), xsize, + color_blending.clamp); + } + } else if (color_blending.mode == PatchBlendMode::kMul) { + for (int p = 0; p < 3; p++) { + PerformMulBlending(bg[p], fg[p], tmp.Row(p), xsize, color_blending.clamp); + } + } else if (color_blending.mode == PatchBlendMode::kReplace || + color_blending.mode == PatchBlendMode::kBlendAbove || + color_blending.mode == PatchBlendMode::kBlendBelow) { // kReplace + for (size_t p = 0; p < 3; p++) { + memcpy(tmp.Row(p), fg[p], xsize * sizeof(**fg)); + } + } else if (color_blending.mode == PatchBlendMode::kNone) { + for (size_t p = 0; p < 3; p++) { + memcpy(tmp.Row(p), bg[p], xsize * sizeof(**fg)); + } + } else { + JXL_ABORT("Unreachable"); + } + for (size_t i = 0; i < 3 + num_ec; i++) { + memcpy(out[i], tmp.Row(i), xsize * sizeof(**out)); + } + return true; +} + +Status ImageBlender::RectBlender::DoBlending(size_t y) { + if (done_ || y < current_overlap_.y0() || + y >= current_overlap_.y0() + current_overlap_.ysize()) { + return true; + } + y -= current_overlap_.y0(); + fg_row_ptrs_.resize(fg_ptrs_.size()); + bg_row_ptrs_.resize(bg_ptrs_.size()); + for (size_t c = 0; c < fg_row_ptrs_.size(); c++) { + fg_row_ptrs_[c] = fg_ptrs_[c] + y * fg_strides_[c]; + bg_row_ptrs_[c] = bg_ptrs_[c] + y * bg_strides_[c]; + } + return PerformBlending(bg_row_ptrs_.data(), fg_row_ptrs_.data(), + bg_row_ptrs_.data(), current_overlap_.xsize(), + blending_info_[0], blending_info_.data() + 1, + *extra_channel_info_); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending.h new file mode 100644 index 0000000000..5e60b146bf --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending.h @@ -0,0 +1,91 @@ +// 
Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BLENDING_H_ +#define LIB_JXL_BLENDING_H_ +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +Status PerformBlending(const float* const* bg, const float* const* fg, + float* const* out, size_t xsize, + const PatchBlending& color_blending, + const PatchBlending* ec_blending, + const std::vector& extra_channel_info); + +class ImageBlender { + public: + class RectBlender { + public: + // Does the blending for a given row of the rect passed to + // ImageBlender::PrepareRect. + Status DoBlending(size_t y); + + // If this returns true, then nothing needs to be done for this rect and + // DoBlending can be skipped (but does not have to). + bool done() const { return done_; } + + private: + friend class ImageBlender; + explicit RectBlender(bool done) : done_(done) {} + + bool done_; + Rect current_overlap_; + Rect current_cropbox_; + const std::vector* extra_channel_info_; + std::vector fg_ptrs_; + std::vector fg_strides_; + std::vector bg_ptrs_; + std::vector bg_strides_; + std::vector fg_row_ptrs_; + std::vector bg_row_ptrs_; + std::vector blending_info_; + }; + + static bool NeedsBlending(PassesDecoderState* dec_state); + + Status PrepareBlending( + PassesDecoderState* dec_state, FrameOrigin foreground_origin, + size_t foreground_xsize, size_t foreground_ysize, + const std::vector* extra_channel_info, + const ColorEncoding& frame_color_encoding, const Rect& frame_rect, + Image3F* output, const Rect& output_rect, + std::vector* output_extra_channels, + std::vector output_extra_channels_rects); + // rect is relative to the full decoded foreground. + // But foreground here can be a subset of the full foreground, and input_rect + // indicates where that rect is in that subset. 
For example, if rect = + // Rect(10, 10, 20, 20), and foreground is subrect (7, 7, 30, 30) of the full + // foreground, then input_rect should be (3, 3, 20, 20), because that is where + // rect is relative to the foreground crop. + ImageBlender::RectBlender PrepareRect( + const Rect& rect, const Image3F& foreground, + const std::vector& extra_channels, const Rect& input_rect) const; + + // If this returns true, then it is not necessary to call further methods on + // this ImageBlender to achieve blending, although it is not forbidden either + // (those methods will just return immediately in that case). + bool done() const { return done_; } + + private: + BlendingInfo info_; + const std::vector* extra_channel_info_; + // Destination, as well as background before DoBlending is called. + Image3F* output_; + Rect output_rect_; + std::vector* output_extra_channels_; + std::vector output_extra_channels_rects_; + Rect cropbox_; + Rect overlap_; + bool done_ = false; + const std::vector* ec_info_; + FrameOrigin o_{}; +}; + +} // namespace jxl + +#endif // LIB_JXL_BLENDING_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending_test.cc new file mode 100644 index 0000000000..4ce66c2f17 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/blending_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/blending.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/jxl/dec_file.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testdata.h" + +namespace jxl { +namespace { + +using ::testing::SizeIs; + +TEST(BlendingTest, Crops) { + ThreadPool* pool = nullptr; + + const PaddedBytes compressed = + ReadTestData("jxl/blending/cropped_traffic_light.jxl"); + DecompressParams dparams; + CodecInOut decoded; + ASSERT_TRUE(DecodeFile(dparams, compressed, &decoded, pool)); + ASSERT_THAT(decoded.frames, SizeIs(4)); + + int i = 0; + for (const ImageBundle& ib : decoded.frames) { + std::ostringstream filename; + filename << "jxl/blending/cropped_traffic_light_frame-" << i << ".png"; + const PaddedBytes compressed_frame = ReadTestData(filename.str()); + CodecInOut frame; + ASSERT_TRUE(SetFromBytes(Span(compressed_frame), &frame)); + EXPECT_TRUE(SamePixels(ib.color(), *frame.Main().color())); + ++i; + } +} + +TEST(BlendingTest, Offset) { + const PaddedBytes background_bytes = ReadTestData("jxl/splines.png"); + CodecInOut background; + ASSERT_TRUE(SetFromBytes(Span(background_bytes), &background)); + const PaddedBytes foreground_bytes = + ReadTestData("jxl/grayscale_patches.png"); + CodecInOut foreground; + ASSERT_TRUE(SetFromBytes(Span(foreground_bytes), &foreground)); + + ImageBlender blender; + CodecMetadata nonserialized_metadata; + ASSERT_TRUE( + nonserialized_metadata.size.Set(background.xsize(), background.ysize())); + PassesSharedState state; + state.frame_header.blending_info.mode = BlendMode::kReplace; + state.frame_header.blending_info.source = 0; + state.frame_header.nonserialized_metadata = &nonserialized_metadata; + state.metadata = &background.metadata; + state.reference_frames[0].frame = &background.Main(); + PassesDecoderState dec_state; + dec_state.shared = &state; + const FrameOrigin foreground_origin = {-50, -50}; + ImageBundle output(&background.metadata.m); + 
output.SetFromImage(Image3F(background.xsize(), background.ysize()), + background.Main().c_current()); + ASSERT_TRUE(blender.PrepareBlending( + &dec_state, foreground_origin, foreground.xsize(), foreground.ysize(), + &nonserialized_metadata.m.extra_channel_info, + background.Main().c_current(), Rect(background), output.color(), + Rect(*output.color()), {}, {})); + + static constexpr int kStep = 20; + for (size_t x0 = 0; x0 < foreground.xsize(); x0 += kStep) { + for (size_t y0 = 0; y0 < foreground.ysize(); y0 += kStep) { + const Rect rect = + Rect(x0, y0, kStep, kStep).Intersection(Rect(foreground.Main())); + Image3F foreground_crop(rect.xsize(), rect.ysize()); + CopyImageTo(rect, *foreground.Main().color(), Rect(foreground_crop), + &foreground_crop); + auto rect_blender = + blender.PrepareRect(rect, foreground_crop, {}, Rect(foreground_crop)); + for (size_t y = 0; y < rect.ysize(); ++y) { + ASSERT_TRUE(rect_blender.DoBlending(y)); + } + } + } + + const PaddedBytes expected_bytes = + ReadTestData("jxl/blending/grayscale_patches_on_splines.png"); + CodecInOut expected; + ASSERT_TRUE(SetFromBytes(Span(expected_bytes), &expected)); + VerifyRelativeError(*expected.Main().color(), *output.color(), 1. / (2 * 255), + 0); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc new file mode 100644 index 0000000000..fc1ef2875c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.cc @@ -0,0 +1,2139 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+// +// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com) +// +// The physical architecture of butteraugli is based on the following naming +// convention: +// * Opsin - dynamics of the photosensitive chemicals in the retina +// with their immediate electrical processing +// * Xyb - hybrid opponent/trichromatic color space +// x is roughly red-subtract-green. +// y is yellow. +// b is blue. +// Xyb values are computed from Opsin mixing, not directly from rgb. +// * Mask - for visual masking +// * Hf - color modeling for spatially high-frequency features +// * Lf - color modeling for spatially low-frequency features +// * Diffmap - to cluster and build an image of error between the images +// * Blur - to hold the smoothing code + +#include "lib/jxl/butteraugli/butteraugli.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#if PROFILER_ENABLED +#include +#endif // PROFILER_ENABLED + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc" +#include + +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/gauss_blur.h" +#include "lib/jxl/image_ops.h" + +#ifndef JXL_BUTTERAUGLI_ONCE +#define JXL_BUTTERAUGLI_ONCE + +namespace jxl { + +std::vector ComputeKernel(float sigma) { + const float m = 2.25; // Accuracy increases when m is increased. + const double scaler = -1.0 / (2.0 * sigma * sigma); + const int diff = std::max(1, m * std::fabs(sigma)); + std::vector kernel(2 * diff + 1); + for (int i = -diff; i <= diff; ++i) { + kernel[i + diff] = std::exp(scaler * i * i); + } + return kernel; +} + +void ConvolveBorderColumn(const ImageF& in, const std::vector& kernel, + const size_t x, float* BUTTERAUGLI_RESTRICT row_out) { + const size_t offset = kernel.size() / 2; + int minx = x < offset ? 
0 : x - offset; + int maxx = std::min(in.xsize() - 1, x + offset); + float weight = 0.0f; + for (int j = minx; j <= maxx; ++j) { + weight += kernel[j - x + offset]; + } + float scale = 1.0f / weight; + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y); + float sum = 0.0f; + for (int j = minx; j <= maxx; ++j) { + sum += row_in[j] * kernel[j - x + offset]; + } + row_out[y] = sum * scale; + } +} + +// Computes a horizontal convolution and transposes the result. +void ConvolutionWithTranspose(const ImageF& in, + const std::vector& kernel, + ImageF* BUTTERAUGLI_RESTRICT out) { + PROFILER_FUNC; + JXL_CHECK(out->xsize() == in.ysize()); + JXL_CHECK(out->ysize() == in.xsize()); + const size_t len = kernel.size(); + const size_t offset = len / 2; + float weight_no_border = 0.0f; + for (size_t j = 0; j < len; ++j) { + weight_no_border += kernel[j]; + } + const float scale_no_border = 1.0f / weight_no_border; + const size_t border1 = std::min(in.xsize(), offset); + const size_t border2 = in.xsize() > offset ? 
in.xsize() - offset : 0; + std::vector scaled_kernel(len / 2 + 1); + for (size_t i = 0; i <= len / 2; ++i) { + scaled_kernel[i] = kernel[i] * scale_no_border; + } + + // middle + switch (len) { +#if 1 // speed-optimized version + case 7: { + PROFILER_ZONE("conv7"); + const float sk0 = scaled_kernel[0]; + const float sk1 = scaled_kernel[1]; + const float sk2 = scaled_kernel[2]; + const float sk3 = scaled_kernel[3]; + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + const float sum0 = (row_in[0] + row_in[6]) * sk0; + const float sum1 = (row_in[1] + row_in[5]) * sk1; + const float sum2 = (row_in[2] + row_in[4]) * sk2; + const float sum = (row_in[3]) * sk3 + sum0 + sum1 + sum2; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum; + } + } + } break; + case 13: { + PROFILER_ZONE("conv15"); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + float sum0 = (row_in[0] + row_in[12]) * scaled_kernel[0]; + float sum1 = (row_in[1] + row_in[11]) * scaled_kernel[1]; + float sum2 = (row_in[2] + row_in[10]) * scaled_kernel[2]; + float sum3 = (row_in[3] + row_in[9]) * scaled_kernel[3]; + sum0 += (row_in[4] + row_in[8]) * scaled_kernel[4]; + sum1 += (row_in[5] + row_in[7]) * scaled_kernel[5]; + const float sum = (row_in[6]) * scaled_kernel[6]; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum + sum0 + sum1 + sum2 + sum3; + } + } + break; + } + case 15: { + PROFILER_ZONE("conv15"); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + float sum0 = (row_in[0] + row_in[14]) * scaled_kernel[0]; + float sum1 = (row_in[1] + row_in[13]) * scaled_kernel[1]; + float sum2 = 
(row_in[2] + row_in[12]) * scaled_kernel[2]; + float sum3 = (row_in[3] + row_in[11]) * scaled_kernel[3]; + sum0 += (row_in[4] + row_in[10]) * scaled_kernel[4]; + sum1 += (row_in[5] + row_in[9]) * scaled_kernel[5]; + sum2 += (row_in[6] + row_in[8]) * scaled_kernel[6]; + const float sum = (row_in[7]) * scaled_kernel[7]; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum + sum0 + sum1 + sum2 + sum3; + } + } + break; + } + case 25: { + PROFILER_ZONE("conv25"); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + float sum0 = (row_in[0] + row_in[24]) * scaled_kernel[0]; + float sum1 = (row_in[1] + row_in[23]) * scaled_kernel[1]; + float sum2 = (row_in[2] + row_in[22]) * scaled_kernel[2]; + float sum3 = (row_in[3] + row_in[21]) * scaled_kernel[3]; + sum0 += (row_in[4] + row_in[20]) * scaled_kernel[4]; + sum1 += (row_in[5] + row_in[19]) * scaled_kernel[5]; + sum2 += (row_in[6] + row_in[18]) * scaled_kernel[6]; + sum3 += (row_in[7] + row_in[17]) * scaled_kernel[7]; + sum0 += (row_in[8] + row_in[16]) * scaled_kernel[8]; + sum1 += (row_in[9] + row_in[15]) * scaled_kernel[9]; + sum2 += (row_in[10] + row_in[14]) * scaled_kernel[10]; + sum3 += (row_in[11] + row_in[13]) * scaled_kernel[11]; + const float sum = (row_in[12]) * scaled_kernel[12]; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum + sum0 + sum1 + sum2 + sum3; + } + } + break; + } + case 33: { + PROFILER_ZONE("conv33"); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + float sum0 = (row_in[0] + row_in[32]) * scaled_kernel[0]; + float sum1 = (row_in[1] + row_in[31]) * scaled_kernel[1]; + float sum2 = (row_in[2] + row_in[30]) * scaled_kernel[2]; + float sum3 = (row_in[3] + row_in[29]) * scaled_kernel[3]; + sum0 += (row_in[4] + 
row_in[28]) * scaled_kernel[4]; + sum1 += (row_in[5] + row_in[27]) * scaled_kernel[5]; + sum2 += (row_in[6] + row_in[26]) * scaled_kernel[6]; + sum3 += (row_in[7] + row_in[25]) * scaled_kernel[7]; + sum0 += (row_in[8] + row_in[24]) * scaled_kernel[8]; + sum1 += (row_in[9] + row_in[23]) * scaled_kernel[9]; + sum2 += (row_in[10] + row_in[22]) * scaled_kernel[10]; + sum3 += (row_in[11] + row_in[21]) * scaled_kernel[11]; + sum0 += (row_in[12] + row_in[20]) * scaled_kernel[12]; + sum1 += (row_in[13] + row_in[19]) * scaled_kernel[13]; + sum2 += (row_in[14] + row_in[18]) * scaled_kernel[14]; + sum3 += (row_in[15] + row_in[17]) * scaled_kernel[15]; + const float sum = (row_in[16]) * scaled_kernel[16]; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum + sum0 + sum1 + sum2 + sum3; + } + } + break; + } + case 37: { + PROFILER_ZONE("conv37"); + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset; + for (size_t x = border1; x < border2; ++x, ++row_in) { + float sum0 = (row_in[0] + row_in[36]) * scaled_kernel[0]; + float sum1 = (row_in[1] + row_in[35]) * scaled_kernel[1]; + float sum2 = (row_in[2] + row_in[34]) * scaled_kernel[2]; + float sum3 = (row_in[3] + row_in[33]) * scaled_kernel[3]; + sum0 += (row_in[4] + row_in[32]) * scaled_kernel[4]; + sum0 += (row_in[5] + row_in[31]) * scaled_kernel[5]; + sum0 += (row_in[6] + row_in[30]) * scaled_kernel[6]; + sum0 += (row_in[7] + row_in[29]) * scaled_kernel[7]; + sum0 += (row_in[8] + row_in[28]) * scaled_kernel[8]; + sum1 += (row_in[9] + row_in[27]) * scaled_kernel[9]; + sum2 += (row_in[10] + row_in[26]) * scaled_kernel[10]; + sum3 += (row_in[11] + row_in[25]) * scaled_kernel[11]; + sum0 += (row_in[12] + row_in[24]) * scaled_kernel[12]; + sum1 += (row_in[13] + row_in[23]) * scaled_kernel[13]; + sum2 += (row_in[14] + row_in[22]) * scaled_kernel[14]; + sum3 += (row_in[15] + row_in[21]) * scaled_kernel[15]; + sum0 += (row_in[16] + row_in[20]) * 
scaled_kernel[16]; + sum1 += (row_in[17] + row_in[19]) * scaled_kernel[17]; + const float sum = (row_in[18]) * scaled_kernel[18]; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + row_out[y] = sum + sum0 + sum1 + sum2 + sum3; + } + } + break; + } + default: + printf("Warning: Unexpected kernel size! %zu\n", len); +#else + default: +#endif + for (size_t y = 0; y < in.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y); + for (size_t x = border1; x < border2; ++x) { + const int d = x - offset; + float* BUTTERAUGLI_RESTRICT row_out = out->Row(x); + float sum = 0.0f; + size_t j; + for (j = 0; j <= len / 2; ++j) { + sum += row_in[d + j] * scaled_kernel[j]; + } + for (; j < len; ++j) { + sum += row_in[d + j] * scaled_kernel[len - 1 - j]; + } + row_out[y] = sum; + } + } + } + // left border + for (size_t x = 0; x < border1; ++x) { + ConvolveBorderColumn(in, kernel, x, out->Row(x)); + } + + // right border + for (size_t x = border2; x < in.xsize(); ++x) { + ConvolveBorderColumn(in, kernel, x, out->Row(x)); + } +} + +// Separate horizontal and vertical (next function) convolution passes. 
+void BlurHorizontalConv(const ImageF& in, const intptr_t xbegin, + const intptr_t xend, const intptr_t ybegin, + const intptr_t yend, const std::vector& kernel, + ImageF* out) { + if (xbegin >= xend || ybegin >= yend) return; + const intptr_t xsize = in.xsize(); + const intptr_t ysize = in.ysize(); + JXL_ASSERT(0 <= xbegin && xend <= xsize); + JXL_ASSERT(0 <= ybegin && yend <= ysize); + (void)xsize; + (void)ysize; + const intptr_t radius = kernel.size() / 2; + + for (intptr_t y = ybegin; y < yend; ++y) { + float* JXL_RESTRICT row_out = out->Row(y); + for (intptr_t x = xbegin; x < xend; ++x) { + float sum = 0.0f; + float sum_weights = 0.0f; + const float* JXL_RESTRICT row_in = in.Row(y); + for (intptr_t ix = -radius; ix <= radius; ++ix) { + const intptr_t in_x = x + ix; + if (in_x < 0 || in_x >= xsize) continue; + const float weight_x = kernel[ix + radius]; + sum += row_in[in_x] * weight_x; + sum_weights += weight_x; + } + row_out[x] = sum / sum_weights; + } + } +} + +void BlurVerticalConv(const ImageF& in, const intptr_t xbegin, + const intptr_t xend, const intptr_t ybegin, + const intptr_t yend, const std::vector& kernel, + ImageF* out) { + if (xbegin >= xend || ybegin >= yend) return; + const intptr_t xsize = in.xsize(); + const intptr_t ysize = in.ysize(); + JXL_ASSERT(0 <= xbegin && xend <= xsize); + JXL_ASSERT(0 <= ybegin && yend <= ysize); + (void)xsize; + const intptr_t radius = kernel.size() / 2; + for (intptr_t y = ybegin; y < yend; ++y) { + float* JXL_RESTRICT row_out = out->Row(y); + for (intptr_t x = xbegin; x < xend; ++x) { + float sum = 0.0f; + float sum_weights = 0.0f; + for (intptr_t iy = -radius; iy <= radius; ++iy) { + const intptr_t in_y = y + iy; + if (in_y < 0 || in_y >= ysize) continue; + const float weight_y = kernel[iy + radius]; + sum += in.ConstRow(in_y)[x] * weight_y; + sum_weights += weight_y; + } + row_out[x] = sum / sum_weights; + } + } +} + +// A blur somewhat similar to a 2D Gaussian blur. 
+// See: https://en.wikipedia.org/wiki/Gaussian_blur +// +// This is a bottleneck because the sigma can be quite large (>7). We can use +// gauss_blur.cc (runtime independent of sigma, closer to a 4*sigma truncated +// Gaussian and our 2.25 in ComputeKernel), but its boundary conditions are +// zero-valued. This leads to noticeable differences at the edges of diffmaps. +// We retain a special case for 5x5 kernels (even faster than gauss_blur), +// optionally use gauss_blur followed by fixup of the borders for large images, +// or fall back to the previous truncated FIR followed by a transpose. +void Blur(const ImageF& in, float sigma, const ButteraugliParams& params, + BlurTemp* temp, ImageF* out) { + std::vector kernel = ComputeKernel(sigma); + // Separable5 does an in-place convolution, so this fast path is not safe if + // in aliases out. + if (kernel.size() == 5 && &in != out) { + float sum_weights = 0.0f; + for (const float w : kernel) { + sum_weights += w; + } + const float scale = 1.0f / sum_weights; + const float w0 = kernel[2] * scale; + const float w1 = kernel[1] * scale; + const float w2 = kernel[0] * scale; + const WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + }; + Separable5(in, Rect(in), weights, /*pool=*/nullptr, out); + return; + } + + const bool fast_gauss = params.approximate_border; + const bool kBorderFixup = fast_gauss && false; + // Fast+fixup is actually slower for small images that are all border. + const bool too_small_for_fast_gauss = + kBorderFixup && + in.xsize() * in.ysize() < 9 * kernel.size() * kernel.size(); + // If fast gaussian is disabled, use previous transposed convolution. 
+ if (!fast_gauss || too_small_for_fast_gauss) { + ImageF* JXL_RESTRICT temp_t = temp->GetTransposed(in); + ConvolutionWithTranspose(in, kernel, temp_t); + ConvolutionWithTranspose(*temp_t, kernel, out); + return; + } + auto rg = CreateRecursiveGaussian(sigma); + ImageF* JXL_RESTRICT temp_ = temp->Get(in); + ThreadPool* null_pool = nullptr; + FastGaussian(rg, in, null_pool, temp_, out); + + if (kBorderFixup) { + // Produce rg_radius extra pixels around each border + const intptr_t rg_radius = rg->radius; + const intptr_t radius = kernel.size() / 2; + const intptr_t xsize = in.xsize(); + const intptr_t ysize = in.ysize(); + const intptr_t yend_top = std::min(rg_radius + radius, ysize); + const intptr_t ybegin_bottom = + std::max(intptr_t(0), ysize - rg_radius - radius); + // Top (requires radius extra for the vertical pass) + BlurHorizontalConv(in, 0, xsize, 0, yend_top, kernel, temp_); + // Bottom + BlurHorizontalConv(in, 0, xsize, ybegin_bottom, ysize, kernel, temp_); + // Left/right columns between top and bottom + const intptr_t xbegin_right = std::max(intptr_t(0), xsize - rg_radius); + const intptr_t xend_left = std::min(rg_radius, xsize); + BlurHorizontalConv(in, 0, xend_left, yend_top, ybegin_bottom, kernel, + temp_); + BlurHorizontalConv(in, xbegin_right, xsize, yend_top, ybegin_bottom, kernel, + temp_); + + // Entire left/right columns + BlurVerticalConv(*temp_, 0, xend_left, 0, ysize, kernel, out); + BlurVerticalConv(*temp_, xbegin_right, xsize, 0, ysize, kernel, out); + // Top/bottom between left/right + const intptr_t ybegin_bottom2 = std::max(intptr_t(0), ysize - rg_radius); + const intptr_t yend_top2 = std::min(rg_radius, ysize); + BlurVerticalConv(*temp_, xend_left, xbegin_right, 0, yend_top2, kernel, + out); + BlurVerticalConv(*temp_, xend_left, xbegin_right, ybegin_bottom2, ysize, + kernel, out); + } +} + +// Allows PaddedMaltaUnit to call either function via overloading. 
+struct MaltaTagLF {}; +struct MaltaTag {}; + +} // namespace jxl + +#endif // JXL_BUTTERAUGLI_ONCE + +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Vec; + +template +HWY_INLINE V MaximumClamp(D d, V v, double kMaxVal) { + static const double kMul = 0.724216145665; + const V mul = Set(d, kMul); + const V maxval = Set(d, kMaxVal); + // If greater than maxval or less than -maxval, replace with if_*. + const V if_pos = MulAdd(v - maxval, mul, maxval); + const V if_neg = MulSub(v + maxval, mul, maxval); + const V pos_or_v = IfThenElse(v >= maxval, if_pos, v); + return IfThenElse(v < Neg(maxval), if_neg, pos_or_v); +} + +// Make area around zero less important (remove it). +template +HWY_INLINE V RemoveRangeAroundZero(const D d, const double kw, const V x) { + const auto w = Set(d, kw); + return IfThenElse(x > w, x - w, IfThenElseZero(x < Neg(w), x + w)); +} + +// Make area around zero more important (2x it until the limit). +template +HWY_INLINE V AmplifyRangeAroundZero(const D d, const double kw, const V x) { + const auto w = Set(d, kw); + return IfThenElse(x > w, x + w, IfThenElse(x < Neg(w), x - w, x + x)); +} + +// XybLowFreqToVals converts from low-frequency XYB space to the 'vals' space. +// Vals space can be converted to L2-norm space (Euclidean and normalized) +// through visual masking. 
+template +HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y, + const V& b_arg, V* HWY_RESTRICT valx, + V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) { + static const double xmuli = 32.2217497012; + static const double ymuli = 13.7697791434; + static const double bmuli = 47.504615728; + static const double y_to_b_muli = -0.362267051518; + const V xmul = Set(d, xmuli); + const V ymul = Set(d, ymuli); + const V bmul = Set(d, bmuli); + const V y_to_b_mul = Set(d, y_to_b_muli); + const V b = MulAdd(y_to_b_mul, y, b_arg); + *valb = b * bmul; + *valx = x * xmul; + *valy = y * ymul; +} + +void SuppressXByY(const ImageF& in_x, const ImageF& in_y, const double yw, + ImageF* HWY_RESTRICT out) { + JXL_DASSERT(SameSize(in_x, in_y) && SameSize(in_x, *out)); + const size_t xsize = in_x.xsize(); + const size_t ysize = in_x.ysize(); + + const HWY_FULL(float) d; + static const double s = 0.653020556257; + const auto sv = Set(d, s); + const auto one_minus_s = Set(d, 1.0 - s); + const auto ywv = Set(d, yw); + + for (size_t y = 0; y < ysize; ++y) { + const float* HWY_RESTRICT row_x = in_x.ConstRow(y); + const float* HWY_RESTRICT row_y = in_y.ConstRow(y); + float* HWY_RESTRICT row_out = out->Row(y); + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto vx = Load(d, row_x + x); + const auto vy = Load(d, row_y + x); + const auto scaler = MulAdd(ywv / MulAdd(vy, vy, ywv), one_minus_s, sv); + Store(scaler * vx, d, row_out + x); + } + } +} + +static void SeparateFrequencies(size_t xsize, size_t ysize, + const ButteraugliParams& params, + BlurTemp* blur_temp, const Image3F& xyb, + PsychoImage& ps) { + PROFILER_FUNC; + const HWY_FULL(float) d; + + // Extract lf ... 
+ static const double kSigmaLf = 7.15593339443; + static const double kSigmaHf = 3.22489901262; + static const double kSigmaUhf = 1.56416327805; + ps.mf = Image3F(xsize, ysize); + ps.hf[0] = ImageF(xsize, ysize); + ps.hf[1] = ImageF(xsize, ysize); + ps.lf = Image3F(xyb.xsize(), xyb.ysize()); + ps.mf = Image3F(xyb.xsize(), xyb.ysize()); + for (int i = 0; i < 3; ++i) { + Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &ps.lf.Plane(i)); + + // ... and keep everything else in mf. + for (size_t y = 0; y < ysize; ++y) { + const float* BUTTERAUGLI_RESTRICT row_xyb = xyb.PlaneRow(i, y); + const float* BUTTERAUGLI_RESTRICT row_lf = ps.lf.ConstPlaneRow(i, y); + float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto mf = Load(d, row_xyb + x) - Load(d, row_lf + x); + Store(mf, d, row_mf + x); + } + } + if (i == 2) { + Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i)); + break; + } + // Divide mf into mf and hf. + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + Store(Load(d, row_mf + x), d, row_hf + x); + } + } + Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i)); + static const double kRemoveMfRange = 0.29; + static const double kAddMfRange = 0.1; + if (i == 0) { + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(0, y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto mf = Load(d, row_mf + x); + auto hf = Load(d, row_hf + x) - mf; + mf = RemoveRangeAroundZero(d, kRemoveMfRange, mf); + Store(mf, d, row_mf + x); + Store(hf, d, row_hf + x); + } + } + } else { + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(1, y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y); + for (size_t 
x = 0; x < xsize; x += Lanes(d)) { + auto mf = Load(d, row_mf + x); + auto hf = Load(d, row_hf + x) - mf; + + mf = AmplifyRangeAroundZero(d, kAddMfRange, mf); + Store(mf, d, row_mf + x); + Store(hf, d, row_hf + x); + } + } + } + } + + // Temporarily used as output of SuppressXByY + ps.uhf[0] = ImageF(xsize, ysize); + ps.uhf[1] = ImageF(xsize, ysize); + + // Suppress red-green by intensity change in the high freq channels. + static const double suppress = 46.0; + SuppressXByY(ps.hf[0], ps.hf[1], suppress, &ps.uhf[0]); + // hf is the SuppressXByY output, uhf will be written below. + ps.hf[0].Swap(ps.uhf[0]); + + for (int i = 0; i < 2; ++i) { + // Divide hf into hf and uhf. + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[i].Row(y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_uhf[x] = row_hf[x]; + } + } + Blur(ps.hf[i], kSigmaUhf, params, blur_temp, &ps.hf[i]); + static const double kRemoveHfRange = 1.5; + static const double kAddHfRange = 0.132; + static const double kRemoveUhfRange = 0.04; + static const double kMaxclampHf = 28.4691806922; + static const double kMaxclampUhf = 5.19175294647; + static double kMulYHf = 2.155; + static double kMulYUhf = 2.69313763794; + if (i == 0) { + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[0].Row(y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto hf = Load(d, row_hf + x); + auto uhf = Load(d, row_uhf + x) - hf; + hf = RemoveRangeAroundZero(d, kRemoveHfRange, hf); + uhf = RemoveRangeAroundZero(d, kRemoveUhfRange, uhf); + Store(hf, d, row_hf + x); + Store(uhf, d, row_uhf + x); + } + } + } else { + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[1].Row(y); + float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto hf = Load(d, row_hf + x); + hf = 
MaximumClamp(d, hf, kMaxclampHf); + + auto uhf = Load(d, row_uhf + x) - hf; + uhf = MaximumClamp(d, uhf, kMaxclampUhf); + uhf *= Set(d, kMulYUhf); + Store(uhf, d, row_uhf + x); + + hf *= Set(d, kMulYHf); + hf = AmplifyRangeAroundZero(d, kAddHfRange, hf); + Store(hf, d, row_hf + x); + } + } + } + } + // Modify range around zero code only concerns the high frequency + // planes and only the X and Y channels. + // Convert low freq xyb to vals space so that we can do a simple squared sum + // diff on the low frequencies later. + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_x = ps.lf.PlaneRow(0, y); + float* BUTTERAUGLI_RESTRICT row_y = ps.lf.PlaneRow(1, y); + float* BUTTERAUGLI_RESTRICT row_b = ps.lf.PlaneRow(2, y); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto valx = Undefined(d); + auto valy = Undefined(d); + auto valb = Undefined(d); + XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x), + Load(d, row_b + x), &valx, &valy, &valb); + Store(valx, d, row_x + x); + Store(valy, d, row_y + x); + Store(valb, d, row_b + x); + } + } +} + +template +Vec MaltaUnit(MaltaTagLF /*tag*/, const D df, + const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) { + const intptr_t xs3 = 3 * xs; + + const auto center = LoadU(df, d); + + // x grows, y constant + const auto sum_yconst = LoadU(df, d - 4) + LoadU(df, d - 2) + center + + LoadU(df, d + 2) + LoadU(df, d + 4); + // Will return this, sum of all line kernels + auto retval = sum_yconst * sum_yconst; + { + // y grows, x constant + auto sum = LoadU(df, d - xs3 - xs) + LoadU(df, d - xs - xs) + center + + LoadU(df, d + xs + xs) + LoadU(df, d + xs3 + xs); + retval = MulAdd(sum, sum, retval); + } + { + // both grow + auto sum = LoadU(df, d - xs3 - 3) + LoadU(df, d - xs - xs - 2) + center + + LoadU(df, d + xs + xs + 2) + LoadU(df, d + xs3 + 3); + retval = MulAdd(sum, sum, retval); + } + { + // y grows, x shrinks + auto sum = LoadU(df, d - xs3 + 3) + LoadU(df, d - xs - xs + 2) + center + + 
LoadU(df, d + xs + xs - 2) + LoadU(df, d + xs3 - 3); + retval = MulAdd(sum, sum, retval); + } + { + // y grows -4 to 4, x shrinks 1 -> -1 + auto sum = LoadU(df, d - xs3 - xs + 1) + LoadU(df, d - xs - xs + 1) + + center + LoadU(df, d + xs + xs - 1) + + LoadU(df, d + xs3 + xs - 1); + retval = MulAdd(sum, sum, retval); + } + { + // y grows -4 to 4, x grows -1 -> 1 + auto sum = LoadU(df, d - xs3 - xs - 1) + LoadU(df, d - xs - xs - 1) + + center + LoadU(df, d + xs + xs + 1) + + LoadU(df, d + xs3 + xs + 1); + retval = MulAdd(sum, sum, retval); + } + { + // x grows -4 to 4, y grows -1 to 1 + auto sum = LoadU(df, d - 4 - xs) + LoadU(df, d - 2 - xs) + center + + LoadU(df, d + 2 + xs) + LoadU(df, d + 4 + xs); + retval = MulAdd(sum, sum, retval); + } + { + // x grows -4 to 4, y shrinks 1 to -1 + auto sum = LoadU(df, d - 4 + xs) + LoadU(df, d - 2 + xs) + center + + LoadU(df, d + 2 - xs) + LoadU(df, d + 4 - xs); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1__*______ + 2___*_____ + 3_________ + 4____0____ + 5_________ + 6_____*___ + 7______*__ + 8_________ */ + auto sum = LoadU(df, d - xs3 - 2) + LoadU(df, d - xs - xs - 1) + center + + LoadU(df, d + xs + xs + 1) + LoadU(df, d + xs3 + 2); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1______*__ + 2_____*___ + 3_________ + 4____0____ + 5_________ + 6___*_____ + 7__*______ + 8_________ */ + auto sum = LoadU(df, d - xs3 + 2) + LoadU(df, d - xs - xs + 1) + center + + LoadU(df, d + xs + xs - 1) + LoadU(df, d + xs3 - 2); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_*_______ + 3__*______ + 4____0____ + 5______*__ + 6_______*_ + 7_________ + 8_________ */ + auto sum = LoadU(df, d - xs - xs - 3) + LoadU(df, d - xs - 2) + center + + LoadU(df, d + xs + 2) + LoadU(df, d + xs + xs + 3); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_______*_ + 3______*__ + 4____0____ + 5__*______ + 6_*_______ + 7_________ + 8_________ */ + auto sum = LoadU(df, 
d - xs - xs + 3) + LoadU(df, d - xs + 2) + center + + LoadU(df, d + xs - 2) + LoadU(df, d + xs + xs - 3); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2________* + 3______*__ + 4____0____ + 5__*______ + 6*________ + 7_________ + 8_________ */ + + auto sum = LoadU(df, d + xs + xs - 4) + LoadU(df, d + xs - 2) + center + + LoadU(df, d - xs + 2) + LoadU(df, d - xs - xs + 4); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2*________ + 3__*______ + 4____0____ + 5______*__ + 6________* + 7_________ + 8_________ */ + auto sum = LoadU(df, d - xs - xs - 4) + LoadU(df, d - xs - 2) + center + + LoadU(df, d + xs + 2) + LoadU(df, d + xs + xs + 4); + retval = MulAdd(sum, sum, retval); + } + { + /* 0__*______ + 1_________ + 2___*_____ + 3_________ + 4____0____ + 5_________ + 6_____*___ + 7_________ + 8______*__ */ + auto sum = LoadU(df, d - xs3 - xs - 2) + LoadU(df, d - xs - xs - 1) + + center + LoadU(df, d + xs + xs + 1) + + LoadU(df, d + xs3 + xs + 2); + retval = MulAdd(sum, sum, retval); + } + { + /* 0______*__ + 1_________ + 2_____*___ + 3_________ + 4____0____ + 5_________ + 6___*_____ + 7_________ + 8__*______ */ + auto sum = LoadU(df, d - xs3 - xs + 2) + LoadU(df, d - xs - xs + 1) + + center + LoadU(df, d + xs + xs - 1) + + LoadU(df, d + xs3 + xs - 2); + retval = MulAdd(sum, sum, retval); + } + return retval; +} + +template +Vec MaltaUnit(MaltaTag /*tag*/, const D df, + const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) { + const intptr_t xs3 = 3 * xs; + + const auto center = LoadU(df, d); + + // x grows, y constant + const auto sum_yconst = LoadU(df, d - 4) + LoadU(df, d - 3) + + LoadU(df, d - 2) + LoadU(df, d - 1) + center + + LoadU(df, d + 1) + LoadU(df, d + 2) + + LoadU(df, d + 3) + LoadU(df, d + 4); + // Will return this, sum of all line kernels + auto retval = sum_yconst * sum_yconst; + + { + // y grows, x constant + auto sum = LoadU(df, d - xs3 - xs) + LoadU(df, d - xs3) + + LoadU(df, d - xs - xs) + 
LoadU(df, d - xs) + center + + LoadU(df, d + xs) + LoadU(df, d + xs + xs) + LoadU(df, d + xs3) + + LoadU(df, d + xs3 + xs); + retval = MulAdd(sum, sum, retval); + } + { + // both grow + auto sum = LoadU(df, d - xs3 - 3) + LoadU(df, d - xs - xs - 2) + + LoadU(df, d - xs - 1) + center + LoadU(df, d + xs + 1) + + LoadU(df, d + xs + xs + 2) + LoadU(df, d + xs3 + 3); + retval = MulAdd(sum, sum, retval); + } + { + // y grows, x shrinks + auto sum = LoadU(df, d - xs3 + 3) + LoadU(df, d - xs - xs + 2) + + LoadU(df, d - xs + 1) + center + LoadU(df, d + xs - 1) + + LoadU(df, d + xs + xs - 2) + LoadU(df, d + xs3 - 3); + retval = MulAdd(sum, sum, retval); + } + { + // y grows -4 to 4, x shrinks 1 -> -1 + auto sum = LoadU(df, d - xs3 - xs + 1) + LoadU(df, d - xs3 + 1) + + LoadU(df, d - xs - xs + 1) + LoadU(df, d - xs) + center + + LoadU(df, d + xs) + LoadU(df, d + xs + xs - 1) + + LoadU(df, d + xs3 - 1) + LoadU(df, d + xs3 + xs - 1); + retval = MulAdd(sum, sum, retval); + } + { + // y grows -4 to 4, x grows -1 -> 1 + auto sum = LoadU(df, d - xs3 - xs - 1) + LoadU(df, d - xs3 - 1) + + LoadU(df, d - xs - xs - 1) + LoadU(df, d - xs) + center + + LoadU(df, d + xs) + LoadU(df, d + xs + xs + 1) + + LoadU(df, d + xs3 + 1) + LoadU(df, d + xs3 + xs + 1); + retval = MulAdd(sum, sum, retval); + } + { + // x grows -4 to 4, y grows -1 to 1 + auto sum = LoadU(df, d - 4 - xs) + LoadU(df, d - 3 - xs) + + LoadU(df, d - 2 - xs) + LoadU(df, d - 1) + center + + LoadU(df, d + 1) + LoadU(df, d + 2 + xs) + + LoadU(df, d + 3 + xs) + LoadU(df, d + 4 + xs); + retval = MulAdd(sum, sum, retval); + } + { + // x grows -4 to 4, y shrinks 1 to -1 + auto sum = LoadU(df, d - 4 + xs) + LoadU(df, d - 3 + xs) + + LoadU(df, d - 2 + xs) + LoadU(df, d - 1) + center + + LoadU(df, d + 1) + LoadU(df, d + 2 - xs) + + LoadU(df, d + 3 - xs) + LoadU(df, d + 4 - xs); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1__*______ + 2___*_____ + 3___*_____ + 4____0____ + 5_____*___ + 6_____*___ + 7______*__ + 
8_________ */ + auto sum = LoadU(df, d - xs3 - 2) + LoadU(df, d - xs - xs - 1) + + LoadU(df, d - xs - 1) + center + LoadU(df, d + xs + 1) + + LoadU(df, d + xs + xs + 1) + LoadU(df, d + xs3 + 2); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1______*__ + 2_____*___ + 3_____*___ + 4____0____ + 5___*_____ + 6___*_____ + 7__*______ + 8_________ */ + auto sum = LoadU(df, d - xs3 + 2) + LoadU(df, d - xs - xs + 1) + + LoadU(df, d - xs + 1) + center + LoadU(df, d + xs - 1) + + LoadU(df, d + xs + xs - 1) + LoadU(df, d + xs3 - 2); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_*_______ + 3__**_____ + 4____0____ + 5_____**__ + 6_______*_ + 7_________ + 8_________ */ + auto sum = LoadU(df, d - xs - xs - 3) + LoadU(df, d - xs - 2) + + LoadU(df, d - xs - 1) + center + LoadU(df, d + xs + 1) + + LoadU(df, d + xs + 2) + LoadU(df, d + xs + xs + 3); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_______*_ + 3_____**__ + 4____0____ + 5__**_____ + 6_*_______ + 7_________ + 8_________ */ + auto sum = LoadU(df, d - xs - xs + 3) + LoadU(df, d - xs + 2) + + LoadU(df, d - xs + 1) + center + LoadU(df, d + xs - 1) + + LoadU(df, d + xs - 2) + LoadU(df, d + xs + xs - 3); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_________ + 3______*** + 4___*0*___ + 5***______ + 6_________ + 7_________ + 8_________ */ + + auto sum = LoadU(df, d + xs - 4) + LoadU(df, d + xs - 3) + + LoadU(df, d + xs - 2) + LoadU(df, d - 1) + center + + LoadU(df, d + 1) + LoadU(df, d - xs + 2) + + LoadU(df, d - xs + 3) + LoadU(df, d - xs + 4); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_________ + 1_________ + 2_________ + 3***______ + 4___*0*___ + 5______*** + 6_________ + 7_________ + 8_________ */ + auto sum = LoadU(df, d - xs - 4) + LoadU(df, d - xs - 3) + + LoadU(df, d - xs - 2) + LoadU(df, d - 1) + center + + LoadU(df, d + 1) + LoadU(df, d + xs + 2) + + LoadU(df, d + xs + 3) + LoadU(df, d + xs + 4); + 
retval = MulAdd(sum, sum, retval); + } + { + /* 0___*_____ + 1___*_____ + 2___*_____ + 3____*____ + 4____0____ + 5____*____ + 6_____*___ + 7_____*___ + 8_____*___ */ + auto sum = LoadU(df, d - xs3 - xs - 1) + LoadU(df, d - xs3 - 1) + + LoadU(df, d - xs - xs - 1) + LoadU(df, d - xs) + center + + LoadU(df, d + xs) + LoadU(df, d + xs + xs + 1) + + LoadU(df, d + xs3 + 1) + LoadU(df, d + xs3 + xs + 1); + retval = MulAdd(sum, sum, retval); + } + { + /* 0_____*___ + 1_____*___ + 2____ *___ + 3____*____ + 4____0____ + 5____*____ + 6___*_____ + 7___*_____ + 8___*_____ */ + auto sum = LoadU(df, d - xs3 - xs + 1) + LoadU(df, d - xs3 + 1) + + LoadU(df, d - xs - xs + 1) + LoadU(df, d - xs) + center + + LoadU(df, d + xs) + LoadU(df, d + xs + xs - 1) + + LoadU(df, d + xs3 - 1) + LoadU(df, d + xs3 + xs - 1); + retval = MulAdd(sum, sum, retval); + } + return retval; +} + +// Returns MaltaUnit. Avoids bounds-checks when x0 and y0 are known +// to be far enough from the image borders. "diffs" is a packed image. 
+template +static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs, + const size_t x0, + const size_t y0) { + const float* BUTTERAUGLI_RESTRICT d = diffs.ConstRow(y0) + x0; + const HWY_CAPPED(float, 1) df; + if ((x0 >= 4 && y0 >= 4 && x0 < (diffs.xsize() - 4) && + y0 < (diffs.ysize() - 4))) { + return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow())); + } + + PROFILER_ZONE("Padded Malta"); + float borderimage[12 * 9]; // round up to 4 + for (int dy = 0; dy < 9; ++dy) { + int y = y0 + dy - 4; + if (y < 0 || static_cast(y) >= diffs.ysize()) { + for (int dx = 0; dx < 12; ++dx) { + borderimage[dy * 12 + dx] = 0.0f; + } + continue; + } + + const float* row_diffs = diffs.ConstRow(y); + for (int dx = 0; dx < 9; ++dx) { + int x = x0 + dx - 4; + if (x < 0 || static_cast(x) >= diffs.xsize()) { + borderimage[dy * 12 + dx] = 0.0f; + } else { + borderimage[dy * 12 + dx] = row_diffs[x]; + } + } + std::fill(borderimage + dy * 12 + 9, borderimage + dy * 12 + 12, 0.0f); + } + return GetLane(MaltaUnit(Tag(), df, &borderimage[4 * 12 + 4], 12)); +} + +template +static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1, + const double w_0gt1, const double w_0lt1, + const double norm1, const double len, + const double mulli, ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + JXL_DASSERT(SameSize(lum0, lum1) && SameSize(lum0, *diffs)); + const size_t xsize_ = lum0.xsize(); + const size_t ysize_ = lum0.ysize(); + + const float kWeight0 = 0.5; + const float kWeight1 = 0.33; + + const double w_pre0gt1 = mulli * std::sqrt(kWeight0 * w_0gt1) / (len * 2 + 1); + const double w_pre0lt1 = mulli * std::sqrt(kWeight1 * w_0lt1) / (len * 2 + 1); + const float norm2_0gt1 = w_pre0gt1 * norm1; + const float norm2_0lt1 = w_pre0lt1 * norm1; + + for (size_t y = 0; y < ysize_; ++y) { + const float* HWY_RESTRICT row0 = lum0.ConstRow(y); + const float* HWY_RESTRICT row1 = lum1.ConstRow(y); + float* HWY_RESTRICT row_diffs = diffs->Row(y); + 
for (size_t x = 0; x < xsize_; ++x) { + const float absval = 0.5f * (std::abs(row0[x]) + std::abs(row1[x])); + const float diff = row0[x] - row1[x]; + const float scaler = norm2_0gt1 / (static_cast(norm1) + absval); + + // Primary symmetric quadratic objective. + row_diffs[x] = scaler * diff; + + const float scaler2 = norm2_0lt1 / (static_cast(norm1) + absval); + const double fabs0 = std::fabs(row0[x]); + + // Secondary half-open quadratic objectives. + const double too_small = 0.55 * fabs0; + const double too_big = 1.05 * fabs0; + + if (row0[x] < 0) { + if (row1[x] > -too_small) { + double impact = scaler2 * (row1[x] + too_small); + if (diff < 0) { + row_diffs[x] -= impact; + } else { + row_diffs[x] += impact; + } + } else if (row1[x] < -too_big) { + double impact = scaler2 * (-row1[x] - too_big); + if (diff < 0) { + row_diffs[x] -= impact; + } else { + row_diffs[x] += impact; + } + } + } else { + if (row1[x] < too_small) { + double impact = scaler2 * (too_small - row1[x]); + if (diff < 0) { + row_diffs[x] -= impact; + } else { + row_diffs[x] += impact; + } + } else if (row1[x] > too_big) { + double impact = scaler2 * (row1[x] - too_big); + if (diff < 0) { + row_diffs[x] -= impact; + } else { + row_diffs[x] += impact; + } + } + } + } + } + + size_t y0 = 0; + // Top + for (; y0 < 4; ++y0) { + float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0); + for (size_t x0 = 0; x0 < xsize_; ++x0) { + row_diff[x0] += PaddedMaltaUnit(*diffs, x0, y0); + } + } + + const HWY_FULL(float) df; + const size_t aligned_x = std::max(size_t(4), Lanes(df)); + const intptr_t stride = diffs->PixelsPerRow(); + + // Middle + for (; y0 < ysize_ - 4; ++y0) { + const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0); + float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0); + size_t x0 = 0; + for (; x0 < aligned_x; ++x0) { + row_diff[x0] += PaddedMaltaUnit(*diffs, x0, y0); + } + for (; x0 + Lanes(df) + 4 <= xsize_; x0 += Lanes(df)) { + auto diff = Load(df, 
row_diff + x0); + diff += MaltaUnit(Tag(), df, row_in + x0, stride); + Store(diff, df, row_diff + x0); + } + + for (; x0 < xsize_; ++x0) { + row_diff[x0] += PaddedMaltaUnit(*diffs, x0, y0); + } + } + + // Bottom + for (; y0 < ysize_; ++y0) { + float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0); + for (size_t x0 = 0; x0 < xsize_; ++x0) { + row_diff[x0] += PaddedMaltaUnit(*diffs, x0, y0); + } + } +} + +// Need non-template wrapper functions for HWY_EXPORT. +void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1, + const double w_0lt1, const double norm1, const double len, + const double mulli, ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, + diffs, block_diff_ac, c); +} + +void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1, + const double w_0lt1, const double norm1, const double len, + const double mulli, ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, + diffs, block_diff_ac, c); +} + +void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) { + PROFILER_FUNC; + const size_t xsize = xyb.xsize(); + const size_t ysize = xyb.ysize(); + const float bias = mul * bias_arg; + const float sqrt_bias = sqrt(bias); + for (size_t y = 0; y < ysize; ++y) { + const float* BUTTERAUGLI_RESTRICT row_in = xyb.Row(y); + float* BUTTERAUGLI_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < xsize; ++x) { + // kBias makes sqrt behave more linearly. 
+ row_out[x] = sqrt(mul * std::abs(row_in[x]) + bias) - sqrt_bias; + } + } +} + +// std::log(80.0) / std::log(255.0); +constexpr float kIntensityTargetNormalizationHack = 0.79079917404f; +static const float kInternalGoodQualityThreshold = + 17.1984479671f * kIntensityTargetNormalizationHack; +static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold; + +void StoreMin3(const float v, float& min0, float& min1, float& min2) { + if (v < min2) { + if (v < min0) { + min2 = min1; + min1 = min0; + min0 = v; + } else if (v < min1) { + min2 = min1; + min1 = v; + } else { + min2 = v; + } + } +} + +// Look for smooth areas near the area of degradation. +// If the areas area generally smooth, don't do masking. +void FuzzyErosion(const ImageF& from, ImageF* to) { + const size_t xsize = from.xsize(); + const size_t ysize = from.ysize(); + static const int kStep = 3; + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + float min0 = from.Row(y)[x]; + float min1 = 2 * min0; + float min2 = min1; + if (x >= kStep) { + float v = from.Row(y)[x - kStep]; + StoreMin3(v, min0, min1, min2); + if (y >= kStep) { + float v = from.Row(y - kStep)[x - kStep]; + StoreMin3(v, min0, min1, min2); + } + if (y < ysize - kStep) { + float v = from.Row(y + kStep)[x - kStep]; + StoreMin3(v, min0, min1, min2); + } + } + if (x < xsize - kStep) { + float v = from.Row(y)[x + kStep]; + StoreMin3(v, min0, min1, min2); + if (y >= kStep) { + float v = from.Row(y - kStep)[x + kStep]; + StoreMin3(v, min0, min1, min2); + } + if (y < ysize - kStep) { + float v = from.Row(y + kStep)[x + kStep]; + StoreMin3(v, min0, min1, min2); + } + } + if (y >= kStep) { + float v = from.Row(y - kStep)[x]; + StoreMin3(v, min0, min1, min2); + } + if (y < ysize - kStep) { + float v = from.Row(y + kStep)[x]; + StoreMin3(v, min0, min1, min2); + } + to->Row(y)[x] = (0.45f * min0 + 0.3f * min1 + 0.25f * min2); + } + } +} + +// Compute values of local frequency and dc masking based on the activity +// 
in the two images. img_diff_ac may be null. +void Mask(const ImageF& mask0, const ImageF& mask1, + const ButteraugliParams& params, BlurTemp* blur_temp, + ImageF* BUTTERAUGLI_RESTRICT mask, + ImageF* BUTTERAUGLI_RESTRICT diff_ac) { + // Only X and Y components are involved in masking. B's influence + // is considered less important in the high frequency area, and we + // don't model masking from lower frequency signals. + PROFILER_FUNC; + const size_t xsize = mask0.xsize(); + const size_t ysize = mask0.ysize(); + *mask = ImageF(xsize, ysize); + static const float kMul = 6.19424080439; + static const float kBias = 12.61050594197; + static const float kRadius = 2.7; + ImageF diff0(xsize, ysize); + ImageF diff1(xsize, ysize); + ImageF blurred0(xsize, ysize); + ImageF blurred1(xsize, ysize); + DiffPrecompute(mask0, kMul, kBias, &diff0); + DiffPrecompute(mask1, kMul, kBias, &diff1); + Blur(diff0, kRadius, params, blur_temp, &blurred0); + FuzzyErosion(blurred0, &diff0); + Blur(diff1, kRadius, params, blur_temp, &blurred1); + FuzzyErosion(blurred1, &diff1); + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + mask->Row(y)[x] = diff1.Row(y)[x]; + if (diff_ac != nullptr) { + static const float kMaskToErrorMul = 10.0; + float diff = blurred0.Row(y)[x] - blurred1.Row(y)[x]; + diff_ac->Row(y)[x] += kMaskToErrorMul * diff * diff; + } + } + } +} + +// `diff_ac` may be null. +void MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1, + const size_t xsize, const size_t ysize, + const ButteraugliParams& params, Image3F* temp, + BlurTemp* blur_temp, ImageF* BUTTERAUGLI_RESTRICT mask, + ImageF* BUTTERAUGLI_RESTRICT diff_ac) { + ImageF mask0(xsize, ysize); + ImageF mask1(xsize, ysize); + static const float muls[3] = { + 8.75000241361f, + 0.620978104816f, + 0.307585098253f, + }; + // Silly and unoptimized approach here. TODO(jyrki): rework this. 
+ for (size_t y = 0; y < ysize; ++y) { + const float* BUTTERAUGLI_RESTRICT row_y_hf0 = pi0.hf[1].Row(y); + const float* BUTTERAUGLI_RESTRICT row_y_hf1 = pi1.hf[1].Row(y); + const float* BUTTERAUGLI_RESTRICT row_y_uhf0 = pi0.uhf[1].Row(y); + const float* BUTTERAUGLI_RESTRICT row_y_uhf1 = pi1.uhf[1].Row(y); + const float* BUTTERAUGLI_RESTRICT row_x_hf0 = pi0.hf[0].Row(y); + const float* BUTTERAUGLI_RESTRICT row_x_hf1 = pi1.hf[0].Row(y); + const float* BUTTERAUGLI_RESTRICT row_x_uhf0 = pi0.uhf[0].Row(y); + const float* BUTTERAUGLI_RESTRICT row_x_uhf1 = pi1.uhf[0].Row(y); + float* BUTTERAUGLI_RESTRICT row0 = mask0.Row(y); + float* BUTTERAUGLI_RESTRICT row1 = mask1.Row(y); + for (size_t x = 0; x < xsize; ++x) { + float xdiff0 = (row_x_uhf0[x] + row_x_hf0[x]) * muls[0]; + float xdiff1 = (row_x_uhf1[x] + row_x_hf1[x]) * muls[0]; + float ydiff0 = row_y_uhf0[x] * muls[1] + row_y_hf0[x] * muls[2]; + float ydiff1 = row_y_uhf1[x] * muls[1] + row_y_hf1[x] * muls[2]; + row0[x] = xdiff0 * xdiff0 + ydiff0 * ydiff0; + row0[x] = sqrt(row0[x]); + row1[x] = xdiff1 * xdiff1 + ydiff1 * ydiff1; + row1[x] = sqrt(row1[x]); + } + } + Mask(mask0, mask1, params, blur_temp, mask, diff_ac); +} + +double MaskY(double delta) { + static const double offset = 0.829591754942; + static const double scaler = 0.451936922203; + static const double mul = 2.5485944793; + const double c = mul / ((scaler * delta) + offset); + const double retval = kGlobalScale * (1.0 + c); + return retval * retval; +} + +double MaskDcY(double delta) { + static const double offset = 0.20025578522; + static const double scaler = 3.87449418804; + static const double mul = 0.505054525019; + const double c = mul / ((scaler * delta) + offset); + const double retval = kGlobalScale * (1.0 + c); + return retval * retval; +} + +inline float MaskColor(const float color[3], const float mask) { + return color[0] * mask + color[1] * mask + color[2] * mask; +} + +// Diffmap := sqrt of sum{diff images by multiplied by X and Y/B masks} 
+void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc, + const Image3F& block_diff_ac, float xmul, + ImageF* result) { + PROFILER_FUNC; + JXL_CHECK(SameSize(mask, *result)); + size_t xsize = mask.xsize(); + size_t ysize = mask.ysize(); + for (size_t y = 0; y < ysize; ++y) { + float* BUTTERAUGLI_RESTRICT row_out = result->Row(y); + for (size_t x = 0; x < xsize; ++x) { + float val = mask.Row(y)[x]; + float maskval = MaskY(val); + float dc_maskval = MaskDcY(val); + float diff_dc[3]; + float diff_ac[3]; + for (int i = 0; i < 3; ++i) { + diff_dc[i] = block_diff_dc.PlaneRow(i, y)[x]; + diff_ac[i] = block_diff_ac.PlaneRow(i, y)[x]; + } + diff_ac[0] *= xmul; + diff_dc[0] *= xmul; + row_out[x] = + sqrt(MaskColor(diff_dc, dc_maskval) + MaskColor(diff_ac, maskval)); + } + } +} + +// Adds weighted L2 difference between i0 and i1 to diffmap. +static void L2Diff(const ImageF& i0, const ImageF& i1, const float w, + Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) { + if (w == 0) return; + + const HWY_FULL(float) d; + const auto weight = Set(d, w); + + for (size_t y = 0; y < i0.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y); + const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y); + float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y); + + for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) { + const auto diff = Load(d, row0 + x) - Load(d, row1 + x); + const auto diff2 = diff * diff; + const auto prev = Load(d, row_diff + x); + Store(MulAdd(diff2, weight, prev), d, row_diff + x); + } + } +} + +// Initializes diffmap to the weighted L2 difference between i0 and i1. 
+static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w, + Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) { + if (w == 0) return; + + const HWY_FULL(float) d; + const auto weight = Set(d, w); + + for (size_t y = 0; y < i0.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y); + const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y); + float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y); + + for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) { + const auto diff = Load(d, row0 + x) - Load(d, row1 + x); + const auto diff2 = diff * diff; + Store(diff2 * weight, d, row_diff + x); + } + } +} + +// i0 is the original image. +// i1 is the deformed copy. +static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1, + float w_0lt1, + Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) { + if (w_0gt1 == 0 && w_0lt1 == 0) { + return; + } + + const HWY_FULL(float) d; + const auto vw_0gt1 = Set(d, w_0gt1 * 0.8); + const auto vw_0lt1 = Set(d, w_0lt1 * 0.8); + + for (size_t y = 0; y < i0.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row0 = i0.Row(y); + const float* BUTTERAUGLI_RESTRICT row1 = i1.Row(y); + float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y); + + for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) { + const auto val0 = Load(d, row0 + x); + const auto val1 = Load(d, row1 + x); + + // Primary symmetric quadratic objective. + const auto diff = val0 - val1; + auto total = MulAdd(diff * diff, vw_0gt1, Load(d, row_diff + x)); + + // Secondary half-open quadratic objectives. 
+ const auto fabs0 = Abs(val0); + const auto too_small = Set(d, 0.4) * fabs0; + const auto too_big = fabs0; + + const auto if_neg = + IfThenElse(val1 > Neg(too_small), val1 + too_small, + IfThenElseZero(val1 < Neg(too_big), Neg(val1) - too_big)); + const auto if_pos = + IfThenElse(val1 < too_small, too_small - val1, + IfThenElseZero(val1 > too_big, val1 - too_big)); + const auto v = IfThenElse(val0 < Zero(d), if_neg, if_pos); + total += vw_0lt1 * v * v; + Store(total, d, row_diff + x); + } + } +} + +// A simple HDR compatible gamma function. +template +V Gamma(const DF df, V v) { + // ln(2) constant folded in because we want std::log but have FastLog2f. + const auto kRetMul = Set(df, 19.245013259874995f * 0.693147180559945f); + const auto kRetAdd = Set(df, -23.16046239805755); + // This should happen rarely, but may lead to a NaN in log, which is + // undesirable. Since negative photons don't exist we solve the NaNs by + // clamping here. + v = ZeroIfNegative(v); + + const auto biased = v + Set(df, 9.9710635769299145); + const auto log = FastLog2f(df, biased); + // We could fold this into a custom Log2 polynomial, but there would be + // relatively little gain. + return MulAdd(kRetMul, log, kRetAdd); +} + +template +BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1, + const V& in2, V* JXL_RESTRICT out0, + V* JXL_RESTRICT out1, + V* JXL_RESTRICT out2) { + // https://en.wikipedia.org/wiki/Photopsin absorbance modeling. 
+ static const double mixi0 = 0.29956550340058319; + static const double mixi1 = 0.63373087833825936; + static const double mixi2 = 0.077705617820981968; + static const double mixi3 = 1.7557483643287353; + static const double mixi4 = 0.22158691104574774; + static const double mixi5 = 0.69391388044116142; + static const double mixi6 = 0.0987313588422; + static const double mixi7 = 1.7557483643287353; + static const double mixi8 = 0.02; + static const double mixi9 = 0.02; + static const double mixi10 = 0.20480129041026129; + static const double mixi11 = 12.226454707163354; + + const V mix0 = Set(df, mixi0); + const V mix1 = Set(df, mixi1); + const V mix2 = Set(df, mixi2); + const V mix3 = Set(df, mixi3); + const V mix4 = Set(df, mixi4); + const V mix5 = Set(df, mixi5); + const V mix6 = Set(df, mixi6); + const V mix7 = Set(df, mixi7); + const V mix8 = Set(df, mixi8); + const V mix9 = Set(df, mixi9); + const V mix10 = Set(df, mixi10); + const V mix11 = Set(df, mixi11); + + *out0 = mix0 * in0 + mix1 * in1 + mix2 * in2 + mix3; + *out1 = mix4 * in0 + mix5 * in1 + mix6 * in2 + mix7; + *out2 = mix8 * in0 + mix9 * in1 + mix10 * in2 + mix11; + + if (Clamp) { + *out0 = Max(*out0, mix3); + *out1 = Max(*out1, mix7); + *out2 = Max(*out2, mix11); + } +} + +// `blurred` is a temporary image used inside this function and not returned. 
+Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params, + Image3F* blurred, BlurTemp* blur_temp) { + PROFILER_FUNC; + Image3F xyb(rgb.xsize(), rgb.ysize()); + const double kSigma = 1.2; + Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0)); + Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1)); + Blur(rgb.Plane(2), kSigma, params, blur_temp, &blurred->Plane(2)); + const HWY_FULL(float) df; + const auto intensity_target_multiplier = Set(df, params.intensity_target); + for (size_t y = 0; y < rgb.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_r = rgb.ConstPlaneRow(0, y); + const float* BUTTERAUGLI_RESTRICT row_g = rgb.ConstPlaneRow(1, y); + const float* BUTTERAUGLI_RESTRICT row_b = rgb.ConstPlaneRow(2, y); + const float* BUTTERAUGLI_RESTRICT row_blurred_r = + blurred->ConstPlaneRow(0, y); + const float* BUTTERAUGLI_RESTRICT row_blurred_g = + blurred->ConstPlaneRow(1, y); + const float* BUTTERAUGLI_RESTRICT row_blurred_b = + blurred->ConstPlaneRow(2, y); + float* BUTTERAUGLI_RESTRICT row_out_x = xyb.PlaneRow(0, y); + float* BUTTERAUGLI_RESTRICT row_out_y = xyb.PlaneRow(1, y); + float* BUTTERAUGLI_RESTRICT row_out_b = xyb.PlaneRow(2, y); + const auto min = Set(df, 1e-4f); + for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) { + auto sensitivity0 = Undefined(df); + auto sensitivity1 = Undefined(df); + auto sensitivity2 = Undefined(df); + { + // Calculate sensitivity based on the smoothed image gamma derivative. 
+ auto pre_mixed0 = Undefined(df); + auto pre_mixed1 = Undefined(df); + auto pre_mixed2 = Undefined(df); + OpsinAbsorbance( + df, Load(df, row_blurred_r + x) * intensity_target_multiplier, + Load(df, row_blurred_g + x) * intensity_target_multiplier, + Load(df, row_blurred_b + x) * intensity_target_multiplier, + &pre_mixed0, &pre_mixed1, &pre_mixed2); + pre_mixed0 = Max(pre_mixed0, min); + pre_mixed1 = Max(pre_mixed1, min); + pre_mixed2 = Max(pre_mixed2, min); + sensitivity0 = Gamma(df, pre_mixed0) / pre_mixed0; + sensitivity1 = Gamma(df, pre_mixed1) / pre_mixed1; + sensitivity2 = Gamma(df, pre_mixed2) / pre_mixed2; + sensitivity0 = Max(sensitivity0, min); + sensitivity1 = Max(sensitivity1, min); + sensitivity2 = Max(sensitivity2, min); + } + auto cur_mixed0 = Undefined(df); + auto cur_mixed1 = Undefined(df); + auto cur_mixed2 = Undefined(df); + OpsinAbsorbance(df, + Load(df, row_r + x) * intensity_target_multiplier, + Load(df, row_g + x) * intensity_target_multiplier, + Load(df, row_b + x) * intensity_target_multiplier, + &cur_mixed0, &cur_mixed1, &cur_mixed2); + cur_mixed0 *= sensitivity0; + cur_mixed1 *= sensitivity1; + cur_mixed2 *= sensitivity2; + // This is a kludge. The negative values should be zeroed away before + // blurring. Ideally there would be no negative values in the first place. + const auto min01 = Set(df, 1.7557483643287353f); + const auto min2 = Set(df, 12.226454707163354f); + cur_mixed0 = Max(cur_mixed0, min01); + cur_mixed1 = Max(cur_mixed1, min01); + cur_mixed2 = Max(cur_mixed2, min2); + + Store(cur_mixed0 - cur_mixed1, df, row_out_x + x); + Store(cur_mixed0 + cur_mixed1, df, row_out_y + x); + Store(cur_mixed2, df, row_out_b + x); + } + } + return xyb; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(SeparateFrequencies); // Local function. +HWY_EXPORT(MaskPsychoImage); // Local function. 
+HWY_EXPORT(L2DiffAsymmetric); // Local function. +HWY_EXPORT(L2Diff); // Local function. +HWY_EXPORT(SetL2Diff); // Local function. +HWY_EXPORT(CombineChannelsToDiffmap); // Local function. +HWY_EXPORT(MaltaDiffMap); // Local function. +HWY_EXPORT(MaltaDiffMapLF); // Local function. +HWY_EXPORT(OpsinDynamicsImage); // Local function. + +#if BUTTERAUGLI_ENABLE_CHECKS + +static inline bool IsNan(const float x) { + uint32_t bits; + memcpy(&bits, &x, sizeof(bits)); + const uint32_t bitmask_exp = 0x7F800000; + return (bits & bitmask_exp) == bitmask_exp && (bits & 0x7FFFFF); +} + +static inline bool IsNan(const double x) { + uint64_t bits; + memcpy(&bits, &x, sizeof(bits)); + return (0x7ff0000000000001ULL <= bits && bits <= 0x7fffffffffffffffULL) || + (0xfff0000000000001ULL <= bits && bits <= 0xffffffffffffffffULL); +} + +static inline void CheckImage(const ImageF& image, const char* name) { + PROFILER_FUNC; + for (size_t y = 0; y < image.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row = image.Row(y); + for (size_t x = 0; x < image.xsize(); ++x) { + if (IsNan(row[x])) { + printf("NAN: Image %s @ %zu,%zu (of %zu,%zu)\n", name, x, y, + image.xsize(), image.ysize()); + exit(1); + } + } + } +} + +#define CHECK_NAN(x, str) \ + do { \ + if (IsNan(x)) { \ + printf("%d: %s\n", __LINE__, str); \ + abort(); \ + } \ + } while (0) + +#define CHECK_IMAGE(image, name) CheckImage(image, name) + +#else // BUTTERAUGLI_ENABLE_CHECKS + +#define CHECK_NAN(x, str) +#define CHECK_IMAGE(image, name) + +#endif // BUTTERAUGLI_ENABLE_CHECKS + +// Calculate a 2x2 subsampled image for purposes of recursive butteraugli at +// multiresolution. 
+static Image3F SubSample2x(const Image3F& in) { + size_t xs = (in.xsize() + 1) / 2; + size_t ys = (in.ysize() + 1) / 2; + Image3F retval(xs, ys); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ys; ++y) { + for (size_t x = 0; x < xs; ++x) { + retval.PlaneRow(c, y)[x] = 0; + } + } + } + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < in.ysize(); ++y) { + for (size_t x = 0; x < in.xsize(); ++x) { + retval.PlaneRow(c, y / 2)[x / 2] += 0.25f * in.PlaneRow(c, y)[x]; + } + } + if ((in.xsize() & 1) != 0) { + for (size_t y = 0; y < retval.ysize(); ++y) { + size_t last_column = retval.xsize() - 1; + retval.PlaneRow(c, y)[last_column] *= 2.0f; + } + } + if ((in.ysize() & 1) != 0) { + for (size_t x = 0; x < retval.xsize(); ++x) { + size_t last_row = retval.ysize() - 1; + retval.PlaneRow(c, last_row)[x] *= 2.0f; + } + } + } + return retval; +} + +// Supersample src by 2x and add it to dest. +static void AddSupersampled2x(const ImageF& src, float w, ImageF& dest) { + for (size_t y = 0; y < dest.ysize(); ++y) { + for (size_t x = 0; x < dest.xsize(); ++x) { + // There will be less errors from the more averaged images. + // We take it into account to some extent using a scaler. 
+ static const double kHeuristicMixingValue = 0.3; + dest.Row(y)[x] *= 1.0 - kHeuristicMixingValue * w; + dest.Row(y)[x] += w * src.Row(y / 2)[x / 2]; + } + } +} + +Image3F* ButteraugliComparator::Temp() const { + bool was_in_use = temp_in_use_.test_and_set(std::memory_order_acq_rel); + JXL_ASSERT(!was_in_use); + (void)was_in_use; + return &temp_; +} + +void ButteraugliComparator::ReleaseTemp() const { temp_in_use_.clear(); } + +ButteraugliComparator::ButteraugliComparator(const Image3F& rgb0, + const ButteraugliParams& params) + : xsize_(rgb0.xsize()), + ysize_(rgb0.ysize()), + params_(params), + temp_(xsize_, ysize_) { + if (xsize_ < 8 || ysize_ < 8) { + return; + } + + Image3F xyb0 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(rgb0, params, Temp(), + &blur_temp_); + ReleaseTemp(); + HWY_DYNAMIC_DISPATCH(SeparateFrequencies) + (xsize_, ysize_, params_, &blur_temp_, xyb0, pi0_); + + // Awful recursive construction of samples of different resolution. + // This is an after-thought and possibly somewhat parallel in + // functionality with the PsychoImage multi-resolution approach. 
+ sub_.reset(new ButteraugliComparator(SubSample2x(rgb0), params)); +} + +void ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const { + HWY_DYNAMIC_DISPATCH(MaskPsychoImage) + (pi0_, pi0_, xsize_, ysize_, params_, Temp(), &blur_temp_, mask, nullptr); + ReleaseTemp(); +} + +void ButteraugliComparator::Diffmap(const Image3F& rgb1, ImageF& result) const { + PROFILER_FUNC; + if (xsize_ < 8 || ysize_ < 8) { + ZeroFillImage(&result); + return; + } + const Image3F xyb1 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)( + rgb1, params_, Temp(), &blur_temp_); + ReleaseTemp(); + DiffmapOpsinDynamicsImage(xyb1, result); + if (sub_) { + if (sub_->xsize_ < 8 || sub_->ysize_ < 8) { + return; + } + const Image3F sub_xyb = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)( + SubSample2x(rgb1), params_, sub_->Temp(), &sub_->blur_temp_); + sub_->ReleaseTemp(); + ImageF subresult; + sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult); + AddSupersampled2x(subresult, 0.5, result); + } +} + +void ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1, + ImageF& result) const { + PROFILER_FUNC; + if (xsize_ < 8 || ysize_ < 8) { + ZeroFillImage(&result); + return; + } + PsychoImage pi1; + HWY_DYNAMIC_DISPATCH(SeparateFrequencies) + (xsize_, ysize_, params_, &blur_temp_, xyb1, pi1); + result = ImageF(xsize_, ysize_); + DiffmapPsychoImage(pi1, result); +} + +namespace { + +void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1, + const double w_0lt1, const double norm1, + ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { + PROFILER_FUNC; + const double len = 3.75; + static const double mulli = 0.39905817637; + HWY_DYNAMIC_DISPATCH(MaltaDiffMap) + (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c); +} + +void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1, + const double w_0lt1, const double norm1, + ImageF* HWY_RESTRICT diffs, + Image3F* HWY_RESTRICT block_diff_ac, size_t c) { 
+ PROFILER_FUNC; + const double len = 3.75; + static const double mulli = 0.611612573796; + HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF) + (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c); +} + +} // namespace + +void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1, + ImageF& diffmap) const { + PROFILER_FUNC; + if (xsize_ < 8 || ysize_ < 8) { + ZeroFillImage(&diffmap); + return; + } + + const float hf_asymmetry_ = params_.hf_asymmetry; + const float xmul_ = params_.xmul; + + ImageF diffs(xsize_, ysize_); + Image3F block_diff_ac(xsize_, ysize_); + ZeroFillImage(&block_diff_ac); + static const double wUhfMalta = 1.10039032555; + static const double norm1Uhf = 71.7800275169; + MaltaDiffMap(pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_, + wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1); + + static const double wUhfMaltaX = 173.5; + static const double norm1UhfX = 5.0; + MaltaDiffMap(pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_, + wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac, + 0); + + static const double wHfMalta = 18.7237414387; + static const double norm1Hf = 4498534.45232; + MaltaDiffMapLF(pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_), + wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs, + &block_diff_ac, 1); + + static const double wHfMaltaX = 6923.99476109; + static const double norm1HfX = 8051.15833247; + MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0], wHfMaltaX * std::sqrt(hf_asymmetry_), + wHfMaltaX / std::sqrt(hf_asymmetry_), norm1HfX, &diffs, + &block_diff_ac, 0); + + static const double wMfMalta = 37.0819870399; + static const double norm1Mf = 130262059.556; + MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1), wMfMalta, wMfMalta, norm1Mf, + &diffs, &block_diff_ac, 1); + + static const double wMfMaltaX = 8246.75321353; + static const double norm1MfX = 1009002.70582; + MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0), wMfMaltaX, wMfMaltaX, + norm1MfX, &diffs, &block_diff_ac, 0); 
+ + static const double wmul[9] = { + 400.0, 1.50815703118, 0, + 2150.0, 10.6195433239, 16.2176043152, + 29.2353797994, 0.844626970982, 0.703646627719, + }; + Image3F block_diff_dc(xsize_, ysize_); + for (size_t c = 0; c < 3; ++c) { + if (c < 2) { // No blue channel error accumulated at HF. + HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric) + (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_, + &block_diff_ac, c); + } + HWY_DYNAMIC_DISPATCH(L2Diff) + (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac, c); + HWY_DYNAMIC_DISPATCH(SetL2Diff) + (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc, c); + } + + ImageF mask; + HWY_DYNAMIC_DISPATCH(MaskPsychoImage) + (pi0_, pi1, xsize_, ysize_, params_, Temp(), &blur_temp_, &mask, + &block_diff_ac.Plane(1)); + ReleaseTemp(); + + HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap) + (mask, block_diff_dc, block_diff_ac, xmul_, &diffmap); +} + +double ButteraugliScoreFromDiffmap(const ImageF& diffmap, + const ButteraugliParams* params) { + PROFILER_FUNC; + // In approximate-border mode, skip pixels on the border likely to be affected + // by FastGauss' zero-valued-boundary behavior. The border is about half of + // the largest-diameter kernel (37x37 pixels), but only if the image is big. + size_t border = (params != nullptr && params->approximate_border) ? 
8 : 0; + if (diffmap.xsize() <= 2 * border || diffmap.ysize() <= 2 * border) { + border = 0; + } + float retval = 0.0f; + for (size_t y = border; y < diffmap.ysize() - border; ++y) { + const float* BUTTERAUGLI_RESTRICT row = diffmap.ConstRow(y); + for (size_t x = border; x < diffmap.xsize() - border; ++x) { + retval = std::max(retval, row[x]); + } + } + return retval; +} + +bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1, + double hf_asymmetry, double xmul, ImageF& diffmap) { + ButteraugliParams params; + params.hf_asymmetry = hf_asymmetry; + params.xmul = xmul; + return ButteraugliDiffmap(rgb0, rgb1, params, diffmap); +} + +bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1, + const ButteraugliParams& params, ImageF& diffmap) { + PROFILER_FUNC; + const size_t xsize = rgb0.xsize(); + const size_t ysize = rgb0.ysize(); + if (xsize < 1 || ysize < 1) { + return JXL_FAILURE("Zero-sized image"); + } + if (!SameSize(rgb0, rgb1)) { + return JXL_FAILURE("Size mismatch"); + } + static const int kMax = 8; + if (xsize < kMax || ysize < kMax) { + // Butteraugli values for small (where xsize or ysize is smaller + // than 8 pixels) images are non-sensical, but most likely it is + // less disruptive to try to compute something than just give up. + // Temporarily extend the borders of the image to fit 8 x 8 size. + size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0; + size_t yborder = ysize < kMax ? 
(kMax - ysize) / 2 : 0; + size_t xscaled = std::max(kMax, xsize); + size_t yscaled = std::max(kMax, ysize); + Image3F scaled0(xscaled, yscaled); + Image3F scaled1(xscaled, yscaled); + for (int i = 0; i < 3; ++i) { + for (size_t y = 0; y < yscaled; ++y) { + for (size_t x = 0; x < xscaled; ++x) { + size_t x2 = + std::min(xsize - 1, std::max(0, x - xborder)); + size_t y2 = + std::min(ysize - 1, std::max(0, y - yborder)); + scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2]; + scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2]; + } + } + } + ImageF diffmap_scaled; + const bool ok = + ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled); + diffmap = ImageF(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder]; + } + } + return ok; + } + ButteraugliComparator butteraugli(rgb0, params); + butteraugli.Diffmap(rgb1, diffmap); + return true; +} + +bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1, + float hf_asymmetry, float xmul, ImageF& diffmap, + double& diffvalue) { + ButteraugliParams params; + params.hf_asymmetry = hf_asymmetry; + params.xmul = xmul; + return ButteraugliInterface(rgb0, rgb1, params, diffmap, diffvalue); +} + +bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1, + const ButteraugliParams& params, ImageF& diffmap, + double& diffvalue) { +#if PROFILER_ENABLED + auto trace_start = std::chrono::steady_clock::now(); +#endif + if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) { + return false; + } +#if PROFILER_ENABLED + auto trace_end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = trace_end - trace_start; + const size_t mp = rgb0.xsize() * rgb0.ysize(); + printf("diff MP/s %f\n", mp / elapsed.count() * 1E-6); +#endif + diffvalue = ButteraugliScoreFromDiffmap(diffmap, ¶ms); + return true; +} + +double ButteraugliFuzzyClass(double score) { + static const double fuzzy_width_up = 4.8; + 
static const double fuzzy_width_down = 4.8; + static const double m0 = 2.0; + static const double scaler = 0.7777; + double val; + if (score < 1.0) { + // val in [scaler .. 2.0] + val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_down)); + val -= 1.0; // from [1 .. 2] to [0 .. 1] + val *= 2.0 - scaler; // from [0 .. 1] to [0 .. 2.0 - scaler] + val += scaler; // from [0 .. 2.0 - scaler] to [scaler .. 2.0] + } else { + // val in [0 .. scaler] + val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_up)); + val *= scaler; + } + return val; +} + +// #define PRINT_OUT_NORMALIZATION + +double ButteraugliFuzzyInverse(double seek) { + double pos = 0; + // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) + for (double range = 1.0; range >= 1e-10; range *= 0.5) { + double cur = ButteraugliFuzzyClass(pos); + if (cur < seek) { + pos -= range; + } else { + pos += range; + } + } +#ifdef PRINT_OUT_NORMALIZATION + if (seek == 1.0) { + fprintf(stderr, "Fuzzy inverse %g\n", pos); + } +#endif + return pos; +} + +#ifdef PRINT_OUT_NORMALIZATION +static double print_out_normalization = ButteraugliFuzzyInverse(1.0); +#endif + +namespace { + +void ScoreToRgb(double score, double good_threshold, double bad_threshold, + float rgb[3]) { + double heatmap[12][3] = { + {0, 0, 0}, {0, 0, 1}, + {0, 1, 1}, {0, 1, 0}, // Good level + {1, 1, 0}, {1, 0, 0}, // Bad level + {1, 0, 1}, {0.5, 0.5, 1.0}, + {1.0, 0.5, 0.5}, // Pastel colors for the very bad quality range. + {1.0, 1.0, 0.5}, {1, 1, 1}, + {1, 1, 1}, // Last color repeated to have a solid range of white. 
+ }; + if (score < good_threshold) { + score = (score / good_threshold) * 0.3; + } else if (score < bad_threshold) { + score = 0.3 + + (score - good_threshold) / (bad_threshold - good_threshold) * 0.15; + } else { + score = 0.45 + (score - bad_threshold) / (bad_threshold * 12) * 0.5; + } + static const int kTableSize = sizeof(heatmap) / sizeof(heatmap[0]); + score = std::min(std::max(score * (kTableSize - 1), 0.0), + kTableSize - 2); + int ix = static_cast(score); + ix = std::min(std::max(0, ix), kTableSize - 2); // Handle NaN + double mix = score - ix; + for (int i = 0; i < 3; ++i) { + double v = mix * heatmap[ix + 1][i] + (1 - mix) * heatmap[ix][i]; + rgb[i] = pow(v, 0.5); + } +} + +} // namespace + +Image3F CreateHeatMapImage(const ImageF& distmap, double good_threshold, + double bad_threshold) { + Image3F heatmap(distmap.xsize(), distmap.ysize()); + for (size_t y = 0; y < distmap.ysize(); ++y) { + const float* BUTTERAUGLI_RESTRICT row_distmap = distmap.ConstRow(y); + float* BUTTERAUGLI_RESTRICT row_h0 = heatmap.PlaneRow(0, y); + float* BUTTERAUGLI_RESTRICT row_h1 = heatmap.PlaneRow(1, y); + float* BUTTERAUGLI_RESTRICT row_h2 = heatmap.PlaneRow(2, y); + for (size_t x = 0; x < distmap.xsize(); ++x) { + const float d = row_distmap[x]; + float rgb[3]; + ScoreToRgb(d, good_threshold, bad_threshold, rgb); + row_h0[x] = rgb[0]; + row_h1[x] = rgb[1]; + row_h2[x] = rgb[2]; + } + } + return heatmap; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.h new file mode 100644 index 0000000000..d029722d13 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli/butteraugli.h @@ -0,0 +1,220 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+// +// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com) + +#ifndef LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_ +#define LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +#define BUTTERAUGLI_ENABLE_CHECKS 0 +#define BUTTERAUGLI_RESTRICT JXL_RESTRICT + +// This is the main interface to butteraugli image similarity +// analysis function. + +namespace jxl { + +struct ButteraugliParams { + // Multiplier for penalizing new HF artifacts more than blurring away + // features. 1.0=neutral. + float hf_asymmetry = 1.0f; + + // Multiplier for the psychovisual difference in the X channel. + float xmul = 1.0f; + + // Number of nits that correspond to 1.0f input values. + float intensity_target = 80.0f; + + bool approximate_border = false; +}; + +// ButteraugliInterface defines the public interface for butteraugli. +// +// It calculates the difference between rgb0 and rgb1. +// +// rgb0 and rgb1 contain the images. rgb0[c][px] and rgb1[c][px] contains +// the red image for c == 0, green for c == 1, blue for c == 2. Location index +// px is calculated as y * xsize + x. +// +// Value of pixels of images rgb0 and rgb1 need to be represented as raw +// intensity. Most image formats store gamma corrected intensity in pixel +// values. This gamma correction has to be removed, by applying the following +// function to values in the 0-1 range: +// butteraugli_val = pow(input_val, gamma); +// A typical value of gamma is 2.2. It is usually stored in the image header. +// Take care not to confuse that value with its inverse. The gamma value should +// be always greater than one. +// Butteraugli does not work as intended if the caller does not perform +// gamma correction. 
+// +// hf_asymmetry is a multiplier for penalizing new HF artifacts more than +// blurring away features (1.0 -> neutral). +// +// diffmap will contain an image of the size xsize * ysize, containing +// localized differences for values px (indexed with the px the same as rgb0 +// and rgb1). diffvalue will give a global score of similarity. +// +// A diffvalue smaller than kButteraugliGood indicates that images can be +// observed as the same image. +// diffvalue larger than kButteraugliBad indicates that a difference between +// the images can be observed. +// A diffvalue between kButteraugliGood and kButteraugliBad indicates that +// a subtle difference can be observed between the images. +// +// Returns true on success. +bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1, + const ButteraugliParams ¶ms, ImageF &diffmap, + double &diffvalue); + +// Deprecated (calls the previous function) +bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1, + float hf_asymmetry, float xmul, ImageF &diffmap, + double &diffvalue); + +// Converts the butteraugli score into fuzzy class values that are continuous +// at the class boundary. The class boundary location is based on human +// raters, but the slope is arbitrary. Particularly, it does not reflect +// the expectation value of probabilities of the human raters. It is just +// expected that a smoother class boundary will allow for higher-level +// optimization algorithms to work faster. +// +// Returns 2.0 for a perfect match, and 1.0 for 'ok', 0.0 for bad. Because the +// scoring is fuzzy, a butteraugli score of 0.96 would return a class of +// around 1.9. +double ButteraugliFuzzyClass(double score); + +// Input values should be in range 0 (bad) to 2 (good). Use +// kButteraugliNormalization as normalization. +double ButteraugliFuzzyInverse(double seek); + +// Implementation details, don't use anything below or your code will +// break in the future. 
+ +#ifdef _MSC_VER +#define BUTTERAUGLI_INLINE __forceinline +#else +#define BUTTERAUGLI_INLINE inline +#endif + +#ifdef __clang__ +// Early versions of Clang did not support __builtin_assume_aligned. +#define BUTTERAUGLI_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned) +#elif defined(__GNUC__) +#define BUTTERAUGLI_HAS_ASSUME_ALIGNED 1 +#else +#define BUTTERAUGLI_HAS_ASSUME_ALIGNED 0 +#endif + +// Returns a void* pointer which the compiler then assumes is N-byte aligned. +// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32); +// +// The assignment semantics are required by GCC/Clang. ICC provides an in-place +// __assume_aligned, whereas MSVC's __assume appears unsuitable. +#if BUTTERAUGLI_HAS_ASSUME_ALIGNED +#define BUTTERAUGLI_ASSUME_ALIGNED(ptr, align) \ + __builtin_assume_aligned((ptr), (align)) +#else +#define BUTTERAUGLI_ASSUME_ALIGNED(ptr, align) (ptr) +#endif // BUTTERAUGLI_HAS_ASSUME_ALIGNED + +struct PsychoImage { + ImageF uhf[2]; // XY + ImageF hf[2]; // XY + Image3F mf; // XYB + Image3F lf; // XYB +}; + +// Depending on implementation, Blur either needs a normal or transposed image. +// Hold one or both of them here and only allocate on demand to reduce memory +// usage. +struct BlurTemp { + ImageF *Get(const ImageF &in) { + if (temp.xsize() == 0) { + temp = ImageF(in.xsize(), in.ysize()); + } + return &temp; + } + + ImageF *GetTransposed(const ImageF &in) { + if (transposed_temp.xsize() == 0) { + transposed_temp = ImageF(in.ysize(), in.xsize()); + } + return &transposed_temp; + } + + ImageF temp; + ImageF transposed_temp; +}; + +class ButteraugliComparator { + public: + // Butteraugli is calibrated at xmul = 1.0. We add a multiplier here so that + // we can test the hypothesis that a higher weighing of the X channel would + // improve results at higher Butteraugli values. 
+ ButteraugliComparator(const Image3F &rgb0, const ButteraugliParams ¶ms); + virtual ~ButteraugliComparator() = default; + + // Computes the butteraugli map between the original image given in the + // constructor and the distorted image give here. + void Diffmap(const Image3F &rgb1, ImageF &result) const; + + // Same as above, but OpsinDynamicsImage() was already applied. + void DiffmapOpsinDynamicsImage(const Image3F &xyb1, ImageF &result) const; + + // Same as above, but the frequency decomposition was already applied. + void DiffmapPsychoImage(const PsychoImage &pi1, ImageF &diffmap) const; + + void Mask(ImageF *BUTTERAUGLI_RESTRICT mask) const; + + private: + Image3F *Temp() const; + void ReleaseTemp() const; + + const size_t xsize_; + const size_t ysize_; + ButteraugliParams params_; + PsychoImage pi0_; + + // Shared temporary image storage to reduce the number of allocations; + // obtained via Temp(), must call ReleaseTemp when no longer needed. + mutable Image3F temp_; + mutable std::atomic_flag temp_in_use_ = ATOMIC_FLAG_INIT; + + mutable BlurTemp blur_temp_; + std::unique_ptr sub_; +}; + +// Deprecated. +bool ButteraugliDiffmap(const Image3F &rgb0, const Image3F &rgb1, + double hf_asymmetry, double xmul, ImageF &diffmap); + +bool ButteraugliDiffmap(const Image3F &rgb0, const Image3F &rgb1, + const ButteraugliParams ¶ms, ImageF &diffmap); + +double ButteraugliScoreFromDiffmap(const ImageF &diffmap, + const ButteraugliParams *params = nullptr); + +// Generate rgb-representation of the distance between two images. 
+Image3F CreateHeatMapImage(const ImageF &distmap, double good_threshold, + double bad_threshold); + +} // namespace jxl + +#endif // LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli_test.cc new file mode 100644 index 0000000000..98ec7888aa --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli_test.cc @@ -0,0 +1,102 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "jxl/butteraugli.h" + +#include "gtest/gtest.h" +#include "jxl/butteraugli_cxx.h" +#include "lib/jxl/test_utils.h" + +TEST(ButteraugliTest, Lossless) { + uint32_t xsize = 171; + uint32_t ysize = 219; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr)); + JxlButteraugliResultPtr result(JxlButteraugliCompute( + api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(), + &pixel_format, pixels.data(), pixels.size())); + EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0)); +} + +TEST(ButteraugliTest, Distmap) { + uint32_t xsize = 171; + uint32_t ysize = 219; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr)); + JxlButteraugliResultPtr result(JxlButteraugliCompute( + api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(), + &pixel_format, pixels.data(), pixels.size())); + EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0)); + const float* distmap; + uint32_t row_stride; + JxlButteraugliResultGetDistmap(result.get(), &distmap, &row_stride); + for (uint32_t y = 0; y < 
ysize; y++) { + for (uint32_t x = 0; x < xsize; x++) { + EXPECT_EQ(0.0, distmap[y * row_stride + x]); + } + } +} + +TEST(ButteraugliTest, Distorted) { + uint32_t xsize = 171; + uint32_t ysize = 219; + std::vector orig_pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + std::vector dist_pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + dist_pixels[0] += 128; + + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr)); + JxlButteraugliResultPtr result(JxlButteraugliCompute( + api.get(), xsize, ysize, &pixel_format, orig_pixels.data(), + orig_pixels.size(), &pixel_format, dist_pixels.data(), + dist_pixels.size())); + EXPECT_NE(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0)); +} + +TEST(ButteraugliTest, Api) { + uint32_t xsize = 171; + uint32_t ysize = 219; + std::vector orig_pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + std::vector dist_pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + dist_pixels[0] += 128; + + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr)); + JxlButteraugliApiSetHFAsymmetry(api.get(), 1.0f); + JxlButteraugliApiSetIntensityTarget(api.get(), 250.0f); + JxlButteraugliResultPtr result(JxlButteraugliCompute( + api.get(), xsize, ysize, &pixel_format, orig_pixels.data(), + orig_pixels.size(), &pixel_format, dist_pixels.data(), + dist_pixels.size())); + double distance0 = JxlButteraugliResultGetDistance(result.get(), 8.0); + + JxlButteraugliApiSetHFAsymmetry(api.get(), 2.0f); + result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format, + orig_pixels.data(), orig_pixels.size(), + &pixel_format, dist_pixels.data(), + dist_pixels.size())); + double distance1 = JxlButteraugliResultGetDistance(result.get(), 8.0); + + EXPECT_NE(distance0, distance1); + + JxlButteraugliApiSetIntensityTarget(api.get(), 80.0f); + 
result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format, + orig_pixels.data(), orig_pixels.size(), + &pixel_format, dist_pixels.data(), + dist_pixels.size())); + double distance2 = JxlButteraugliResultGetDistance(result.get(), 8.0); + + EXPECT_NE(distance1, distance2); +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli_wrapper.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli_wrapper.cc new file mode 100644 index 0000000000..a2d2bc3c93 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/butteraugli_wrapper.cc @@ -0,0 +1,207 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include + +#include + +#include "jxl/butteraugli.h" +#include "jxl/parallel_runner.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_butteraugli_pnorm.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/memory_manager_internal.h" + +namespace { + +void SetMetadataFromPixelFormat(const JxlPixelFormat* pixel_format, + jxl::ImageMetadata* metadata) { + uint32_t potential_alpha_bits = 0; + switch (pixel_format->data_type) { + case JXL_TYPE_FLOAT: + metadata->SetFloat32Samples(); + potential_alpha_bits = 16; + break; + case JXL_TYPE_FLOAT16: + metadata->SetFloat16Samples(); + potential_alpha_bits = 16; + break; + case JXL_TYPE_UINT32: + metadata->SetUintSamples(32); + potential_alpha_bits = 16; + break; + case JXL_TYPE_UINT16: + metadata->SetUintSamples(16); + potential_alpha_bits = 16; + break; + case JXL_TYPE_UINT8: + metadata->SetUintSamples(8); + potential_alpha_bits = 8; + break; + case JXL_TYPE_BOOLEAN: + metadata->SetUintSamples(2); + potential_alpha_bits = 2; + 
break; + } + if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) { + metadata->SetAlphaBits(potential_alpha_bits); + } +} + +} // namespace + +struct JxlButteraugliResultStruct { + JxlMemoryManager memory_manager; + + jxl::ImageF distmap; + jxl::ButteraugliParams params; +}; + +struct JxlButteraugliApiStruct { + // Multiplier for penalizing new HF artifacts more than blurring away + // features. 1.0=neutral. + float hf_asymmetry = 1.0f; + + // Multiplier for the psychovisual difference in the X channel. + float xmul = 1.0f; + + // Number of nits that correspond to 1.0f input values. + float intensity_target = jxl::kDefaultIntensityTarget; + + bool approximate_border = false; + + JxlMemoryManager memory_manager; + std::unique_ptr thread_pool{nullptr}; +}; + +JxlButteraugliApi* JxlButteraugliApiCreate( + const JxlMemoryManager* memory_manager) { + JxlMemoryManager local_memory_manager; + if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) + return nullptr; + + void* alloc = + jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlButteraugliApi)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + JxlButteraugliApi* ret = new (alloc) JxlButteraugliApi(); + ret->memory_manager = local_memory_manager; + return ret; +} + +void JxlButteraugliApiSetParallelRunner(JxlButteraugliApi* api, + JxlParallelRunner parallel_runner, + void* parallel_runner_opaque) { + api->thread_pool = jxl::make_unique(parallel_runner, + parallel_runner_opaque); +} + +void JxlButteraugliApiSetHFAsymmetry(JxlButteraugliApi* api, float v) { + api->hf_asymmetry = v; +} + +void JxlButteraugliApiSetIntensityTarget(JxlButteraugliApi* api, float v) { + api->intensity_target = v; +} + +void JxlButteraugliApiDestroy(JxlButteraugliApi* api) { + if (api) { + // Call destructor directly since custom free function is used. 
+ api->~JxlButteraugliApi(); + jxl::MemoryManagerFree(&api->memory_manager, api); + } +} + +JxlButteraugliResult* JxlButteraugliCompute( + const JxlButteraugliApi* api, uint32_t xsize, uint32_t ysize, + const JxlPixelFormat* pixel_format_orig, const void* buffer_orig, + size_t size_orig, const JxlPixelFormat* pixel_format_dist, + const void* buffer_dist, size_t size_dist) { + jxl::ImageMetadata orig_metadata; + SetMetadataFromPixelFormat(pixel_format_orig, &orig_metadata); + jxl::ImageBundle orig_ib(&orig_metadata); + jxl::ColorEncoding c_current; + if (pixel_format_orig->data_type == JXL_TYPE_FLOAT) { + c_current = + jxl::ColorEncoding::LinearSRGB(pixel_format_orig->num_channels < 3); + } else { + c_current = jxl::ColorEncoding::SRGB(pixel_format_orig->num_channels < 3); + } + if (!jxl::BufferToImageBundle(*pixel_format_orig, xsize, ysize, buffer_orig, + size_orig, api->thread_pool.get(), c_current, + &orig_ib)) { + return nullptr; + } + + jxl::ImageMetadata dist_metadata; + SetMetadataFromPixelFormat(pixel_format_dist, &dist_metadata); + jxl::ImageBundle dist_ib(&dist_metadata); + if (pixel_format_dist->data_type == JXL_TYPE_FLOAT) { + c_current = + jxl::ColorEncoding::LinearSRGB(pixel_format_dist->num_channels < 3); + } else { + c_current = jxl::ColorEncoding::SRGB(pixel_format_dist->num_channels < 3); + } + if (!jxl::BufferToImageBundle(*pixel_format_dist, xsize, ysize, buffer_dist, + size_dist, api->thread_pool.get(), c_current, + &dist_ib)) { + return nullptr; + } + + void* alloc = jxl::MemoryManagerAlloc(&api->memory_manager, + sizeof(JxlButteraugliResult)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + JxlButteraugliResult* result = new (alloc) JxlButteraugliResult(); + result->memory_manager = api->memory_manager; + result->params.hf_asymmetry = api->hf_asymmetry; + result->params.xmul = api->xmul; + result->params.intensity_target = api->intensity_target; + result->params.approximate_border = api->approximate_border; + 
jxl::ButteraugliDistance(orig_ib, dist_ib, result->params, &result->distmap, + api->thread_pool.get()); + + return result; +} + +float JxlButteraugliResultGetDistance(const JxlButteraugliResult* result, + float pnorm) { + return static_cast( + jxl::ComputeDistanceP(result->distmap, result->params, pnorm)); +} + +void JxlButteraugliResultGetDistmap(const JxlButteraugliResult* result, + const float** buffer, + uint32_t* row_stride) { + *buffer = result->distmap.Row(0); + *row_stride = result->distmap.PixelsPerRow(); +} + +float JxlButteraugliResultGetMaxDistance(const JxlButteraugliResult* result) { + float max_distance = 0.0; + for (uint32_t y = 0; y < result->distmap.ysize(); y++) { + for (uint32_t x = 0; x < result->distmap.xsize(); x++) { + if (result->distmap.ConstRow(y)[x] > max_distance) { + max_distance = result->distmap.ConstRow(y)[x]; + } + } + } + return max_distance; +} + +void JxlButteraugliResultDestroy(JxlButteraugliResult* result) { + if (result) { + // Call destructor directly since custom free function is used. + result->~JxlButteraugliResult(); + jxl::MemoryManagerFree(&result->memory_manager, result); + } +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/byte_order_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/byte_order_test.cc new file mode 100644 index 0000000000..c1ea19f312 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/byte_order_test.cc @@ -0,0 +1,53 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/base/byte_order.h" + +#include "gtest/gtest.h" + +namespace jxl { +namespace { + +TEST(ByteOrderTest, TestRoundTripBE16) { + const uint32_t in = 0x1234; + uint8_t buf[2]; + StoreBE16(in, buf); + EXPECT_EQ(in, LoadBE16(buf)); + EXPECT_NE(in, LoadLE16(buf)); +} + +TEST(ByteOrderTest, TestRoundTripLE16) { + const uint32_t in = 0x1234; + uint8_t buf[2]; + StoreLE16(in, buf); + EXPECT_EQ(in, LoadLE16(buf)); + EXPECT_NE(in, LoadBE16(buf)); +} + +TEST(ByteOrderTest, TestRoundTripBE32) { + const uint32_t in = 0xFEDCBA98u; + uint8_t buf[4]; + StoreBE32(in, buf); + EXPECT_EQ(in, LoadBE32(buf)); + EXPECT_NE(in, LoadLE32(buf)); +} + +TEST(ByteOrderTest, TestRoundTripLE32) { + const uint32_t in = 0xFEDCBA98u; + uint8_t buf[4]; + StoreLE32(in, buf); + EXPECT_EQ(in, LoadLE32(buf)); + EXPECT_NE(in, LoadBE32(buf)); +} + +TEST(ByteOrderTest, TestRoundTripLE64) { + const uint64_t in = 0xFEDCBA9876543210ull; + uint8_t buf[8]; + StoreLE64(in, buf); + EXPECT_EQ(in, LoadLE64(buf)); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc new file mode 100644 index 0000000000..63d21cbb4b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.cc @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/chroma_from_luma.h" + +namespace jxl { + +ColorCorrelationMap::ColorCorrelationMap(size_t xsize, size_t ysize, bool XYB) + : ytox_map(DivCeil(xsize, kColorTileDim), DivCeil(ysize, kColorTileDim)), + ytob_map(DivCeil(xsize, kColorTileDim), DivCeil(ysize, kColorTileDim)) { + ZeroFillImage(&ytox_map); + ZeroFillImage(&ytob_map); + if (!XYB) { + base_correlation_b_ = 0; + } + RecomputeDCFactors(); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.h new file mode 100644 index 0000000000..cf2f90e43d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/chroma_from_luma.h @@ -0,0 +1,151 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_CHROMA_FROM_LUMA_H_ +#define LIB_JXL_CHROMA_FROM_LUMA_H_ + +// Chroma-from-luma, computed using heuristics to determine the best linear +// model for the X and B channels from the Y channel. + +#include +#include + +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +// Tile is the rectangular grid of blocks that share color correlation +// parameters ("factor_x/b" such that residual_b = blue - Y * factor_b). 
+static constexpr size_t kColorTileDim = 64; + +static_assert(kColorTileDim % kBlockDim == 0, + "Color tile dim should be divisible by block dim"); +static constexpr size_t kColorTileDimInBlocks = kColorTileDim / kBlockDim; + +static_assert(kGroupDimInBlocks % kColorTileDimInBlocks == 0, + "Group dim should be divisible by color tile dim"); + +static constexpr uint8_t kDefaultColorFactor = 84; + +// JPEG DCT coefficients are at most 1024. CfL constants are at most 127, and +// the ratio of two entries in a JPEG quantization table is at most 255. Thus, +// since the CfL denominator is 84, this leaves 12 bits of mantissa to be used. +// For extra caution, we use 11. +static constexpr uint8_t kCFLFixedPointPrecision = 11; + +static constexpr U32Enc kColorFactorDist(Val(kDefaultColorFactor), Val(256), + BitsOffset(8, 2), BitsOffset(16, 258)); + +struct ColorCorrelationMap { + ColorCorrelationMap() = default; + // xsize/ysize are in pixels + // set XYB=false to do something close to no-op cmap (needed for now since + // cmap is mandatory) + ColorCorrelationMap(size_t xsize, size_t ysize, bool XYB = true); + + float YtoXRatio(int32_t x_factor) const { + return base_correlation_x_ + x_factor * color_scale_; + } + + float YtoBRatio(int32_t b_factor) const { + return base_correlation_b_ + b_factor * color_scale_; + } + + Status DecodeDC(BitReader* br) { + if (br->ReadFixedBits<1>() == 1) { + // All default. 
+ return true; + } + SetColorFactor(U32Coder::Read(kColorFactorDist, br)); + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &base_correlation_x_)); + if (std::abs(base_correlation_x_) > 4.0f) { + return JXL_FAILURE("Base X correlation is out of range"); + } + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &base_correlation_b_)); + if (std::abs(base_correlation_b_) > 4.0f) { + return JXL_FAILURE("Base B correlation is out of range"); + } + ytox_dc_ = static_cast(br->ReadFixedBits()) + + std::numeric_limits::min(); + ytob_dc_ = static_cast(br->ReadFixedBits()) + + std::numeric_limits::min(); + RecomputeDCFactors(); + return true; + } + + // We consider a CfL map to be JPEG-reconstruction-compatible if base + // correlation is 0, no DC correlation is used, and we use the default color + // factor. + bool IsJPEGCompatible() const { + return base_correlation_x_ == 0 && base_correlation_b_ == 0 && + ytob_dc_ == 0 && ytox_dc_ == 0 && + color_factor_ == kDefaultColorFactor; + } + + int32_t RatioJPEG(int32_t factor) const { + return factor * (1 << kCFLFixedPointPrecision) / kDefaultColorFactor; + } + + void SetColorFactor(uint32_t factor) { + color_factor_ = factor; + color_scale_ = 1.0f / color_factor_; + RecomputeDCFactors(); + } + + void SetYToBDC(int32_t ytob_dc) { + ytob_dc_ = ytob_dc; + RecomputeDCFactors(); + } + void SetYToXDC(int32_t ytox_dc) { + ytox_dc_ = ytox_dc; + RecomputeDCFactors(); + } + + int32_t GetYToXDC() const { return ytox_dc_; } + int32_t GetYToBDC() const { return ytob_dc_; } + float GetColorFactor() const { return color_factor_; } + float GetBaseCorrelationX() const { return base_correlation_x_; } + float GetBaseCorrelationB() const { return base_correlation_b_; } + + const float* DCFactors() const { return dc_factors_; } + + void RecomputeDCFactors() { + dc_factors_[0] = YtoXRatio(ytox_dc_); + dc_factors_[2] = YtoBRatio(ytob_dc_); + } + + ImageSB ytox_map; + ImageSB ytob_map; + + private: + float dc_factors_[4] = {}; + // range of factor: -1.51 to +1.52 + 
uint32_t color_factor_ = kDefaultColorFactor; + float color_scale_ = 1.0f / color_factor_; + float base_correlation_x_ = 0.0f; + float base_correlation_b_ = kYToBRatio; + int32_t ytox_dc_ = 0; + int32_t ytob_dc_ = 0; +}; + +} // namespace jxl + +#endif // LIB_JXL_CHROMA_FROM_LUMA_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/codec_in_out.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/codec_in_out.h new file mode 100644 index 0000000000..2c2b767a66 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/codec_in_out.h @@ -0,0 +1,253 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_CODEC_IN_OUT_H_ +#define LIB_JXL_CODEC_IN_OUT_H_ + +// Holds inputs/outputs for decoding/encoding images. + +#include + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/common.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/luminance.h" + +namespace jxl { + +// Per-channel interval, used to convert between (full-range) external and +// (bounded or unbounded) temp values. See external_image.cc for the definitions +// of temp/external. +struct CodecInterval { + CodecInterval() = default; + constexpr CodecInterval(float min, float max) : min(min), width(max - min) {} + // Defaults for temp. + float min = 0.0f; + float width = 1.0f; +}; + +struct SizeConstraints { + // Upper limit on pixel dimensions/area, enforced by VerifyDimensions + // (called from decoders). Fuzzers set smaller values to limit memory use. 
+ uint32_t dec_max_xsize = 0xFFFFFFFFu; + uint32_t dec_max_ysize = 0xFFFFFFFFu; + uint64_t dec_max_pixels = 0xFFFFFFFFu; // Might be up to ~0ull +}; + +template ::value>::type> +Status VerifyDimensions(const SizeConstraints* constraints, T xs, T ys) { + if (!constraints) return true; + + if (xs == 0 || ys == 0) return JXL_FAILURE("Empty image."); + if (xs > constraints->dec_max_xsize) return JXL_FAILURE("Image too wide."); + if (ys > constraints->dec_max_ysize) return JXL_FAILURE("Image too tall."); + + const uint64_t num_pixels = static_cast(xs) * ys; + if (num_pixels > constraints->dec_max_pixels) { + return JXL_FAILURE("Image too big."); + } + + return true; +} + +using CodecIntervals = std::array; // RGB[A] or Y[A] + +// Allows passing arbitrary metadata to decoders (required for PNM). +class DecoderHints { + public: + // key=color_space, value=Description(c/pp): specify the ColorEncoding of + // the pixels for decoding. Otherwise, if the codec did not obtain an ICC + // profile from the image, assume sRGB. + // + // Strings are taken from the command line, so avoid spaces for convenience. + void Add(const std::string& key, const std::string& value) { + kv_.emplace_back(key, value); + } + + // Calls `func(key, value)` for each key/value in the order they were added, + // returning false immediately if `func` returns false. + template + Status Foreach(const Func& func) const { + for (const KeyValue& kv : kv_) { + Status ok = func(kv.key, kv.value); + if (!ok) { + return JXL_FAILURE("DecoderHints::Foreach returned false"); + } + } + return true; + } + + private: + // Splitting into key/value avoids parsing in each codec. + struct KeyValue { + KeyValue(std::string key, std::string value) + : key(std::move(key)), value(std::move(value)) {} + + std::string key; + std::string value; + }; + + std::vector kv_; +}; + +// Optional text/EXIF metadata. 
+struct Blobs { + PaddedBytes exif; + PaddedBytes iptc; + PaddedBytes jumbf; + PaddedBytes xmp; +}; + +// For Codec::kJPG, convert between JPEG and pixels or between JPEG and +// quantized DCT coefficients +// For pixel data, the nominal range is 0..1. +enum class DecodeTarget { kPixels, kQuantizedCoeffs }; + +// Holds a preview, a main image or one or more frames, plus the inputs/outputs +// to/from decoding/encoding. +class CodecInOut { + public: + CodecInOut() : preview_frame(&metadata.m) { + frames.reserve(1); + frames.emplace_back(&metadata.m); + } + + // Move-only. + CodecInOut(CodecInOut&&) = default; + CodecInOut& operator=(CodecInOut&&) = default; + + size_t LastStillFrame() const { + JXL_DASSERT(frames.size() > 0); + size_t last = 0; + for (size_t i = 0; i < frames.size(); i++) { + last = i; + if (frames[i].duration > 0) break; + } + return last; + } + + ImageBundle& Main() { return frames[LastStillFrame()]; } + const ImageBundle& Main() const { return frames[LastStillFrame()]; } + + // If c_current.IsGray(), all planes must be identical. + void SetFromImage(Image3F&& color, const ColorEncoding& c_current) { + Main().SetFromImage(std::move(color), c_current); + SetIntensityTarget(this); + SetSize(Main().xsize(), Main().ysize()); + } + + void SetSize(size_t xsize, size_t ysize) { + JXL_CHECK(metadata.size.Set(xsize, ysize)); + } + + void CheckMetadata() const { + JXL_CHECK(metadata.m.bit_depth.bits_per_sample != 0); + JXL_CHECK(!metadata.m.color_encoding.ICC().empty()); + + if (preview_frame.xsize() != 0) preview_frame.VerifyMetadata(); + JXL_CHECK(preview_frame.metadata() == &metadata.m); + + for (const ImageBundle& ib : frames) { + ib.VerifyMetadata(); + JXL_CHECK(ib.metadata() == &metadata.m); + } + } + + size_t xsize() const { return metadata.size.xsize(); } + size_t ysize() const { return metadata.size.ysize(); } + void ShrinkTo(size_t xsize, size_t ysize) { + // preview is unaffected. 
+ for (ImageBundle& ib : frames) { + ib.ShrinkTo(xsize, ysize); + } + SetSize(xsize, ysize); + } + + // Calls TransformTo for each ImageBundle (preview/frames). + Status TransformTo(const ColorEncoding& c_desired, + ThreadPool* pool = nullptr) { + if (metadata.m.have_preview) { + JXL_RETURN_IF_ERROR(preview_frame.TransformTo(c_desired, pool)); + } + for (ImageBundle& ib : frames) { + JXL_RETURN_IF_ERROR(ib.TransformTo(c_desired, pool)); + } + return true; + } + // Calls PremultiplyAlpha for each ImageBundle (preview/frames). + void PremultiplyAlpha() { + ExtraChannelInfo* eci = metadata.m.Find(ExtraChannel::kAlpha); + if (eci == nullptr || eci->alpha_associated) return; // nothing to do + if (metadata.m.have_preview) { + preview_frame.PremultiplyAlpha(); + } + for (ImageBundle& ib : frames) { + ib.PremultiplyAlpha(); + } + eci->alpha_associated = true; + return; + } + + // -- DECODER INPUT: + + SizeConstraints constraints; + // Used to set c_current for codecs that lack color space metadata. + DecoderHints dec_hints; + // Decode to pixels or keep JPEG as quantized DCT coefficients + DecodeTarget dec_target = DecodeTarget::kPixels; + + // Intended white luminance, in nits (cd/m^2). + // It is used by codecs that do not know the absolute luminance of their + // images. For those codecs, decoders map from white to this luminance. There + // is no other way of knowing the target brightness for those codecs - depends + // on source material. 709 typically targets 100 nits, BT.2100 PQ up to 10K, + // but HDR content is more typically mastered to 4K nits. Codecs that do know + // the absolute luminance of their images will typically ignore it as a + // decoder input. The corresponding decoder output and encoder input is the + // intensity target in the metadata. ALL decoders MUST set that metadata + // appropriately, but it does not have to be identical to this hint. 
Encoders + // for codecs that do not encode absolute luminance levels should use that + // metadata to decide on what to map to white. Encoders for codecs that *do* + // encode absolute luminance levels may use it to decide on encoding values, + // but not in a way that would affect the range of interpreted luminance. + // + // 0 means that it is up to the codec to decide on a reasonable value to use. + + float target_nits = 0; + + // -- DECODER OUTPUT: + + // Total number of pixels decoded (may differ from #frames * xsize * ysize + // if frames are cropped) + uint64_t dec_pixels = 0; + + // -- DECODER OUTPUT, ENCODER INPUT: + + // Metadata stored into / retrieved from bitstreams. + + Blobs blobs; + + CodecMetadata metadata; // applies to preview and all frames + + // If metadata.have_preview: + ImageBundle preview_frame; + + std::vector frames; // size=1 if !metadata.have_animation + + bool use_sjpeg = false; + // If the image should be written to a JPEG, use this quality for encoding. + size_t jpeg_quality; +}; + +} // namespace jxl + +#endif // LIB_JXL_CODEC_IN_OUT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc new file mode 100644 index 0000000000..e87728339d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.cc @@ -0,0 +1,154 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/coeff_order.h" + +#include + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/lehmer_code.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +void SetDefaultOrder(AcStrategy acs, coeff_order_t* JXL_RESTRICT order) { + PROFILER_FUNC; + const size_t size = + kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y(); + const coeff_order_t* natural_coeff_order = acs.NaturalCoeffOrder(); + for (size_t k = 0; k < size; ++k) { + order[k] = natural_coeff_order[k]; + } +} + +uint32_t CoeffOrderContext(uint32_t val) { + uint32_t token, nbits, bits; + HybridUintConfig(0, 0, 0).Encode(val, &token, &nbits, &bits); + return std::min(token, kPermutationContexts - 1); +} + +namespace { +Status ReadPermutation(size_t skip, size_t size, coeff_order_t* order, + BitReader* br, ANSSymbolReader* reader, + const std::vector& context_map) { + std::vector lehmer(size); + // temp space needs to be as large as the next power of 2, so doubling the + // allocated size is enough. 
+ std::vector temp(size * 2); + uint32_t end = + reader->ReadHybridUint(CoeffOrderContext(size), br, context_map) + skip; + if (end > size) { + return JXL_FAILURE("Invalid permutation size"); + } + uint32_t last = 0; + for (size_t i = skip; i < end; ++i) { + lehmer[i] = + reader->ReadHybridUint(CoeffOrderContext(last), br, context_map); + last = lehmer[i]; + if (lehmer[i] + i >= size) { + return JXL_FAILURE("Invalid lehmer code"); + } + } + if (order == nullptr) return true; + DecodeLehmerCode(lehmer.data(), temp.data(), size, order); + return true; +} + +} // namespace + +Status DecodePermutation(size_t skip, size_t size, coeff_order_t* order, + BitReader* br) { + std::vector context_map; + ANSCode code; + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kPermutationContexts, &code, &context_map)); + ANSSymbolReader reader(&code, br); + JXL_RETURN_IF_ERROR( + ReadPermutation(skip, size, order, br, &reader, context_map)); + if (!reader.CheckANSFinalState()) { + return JXL_FAILURE("Invalid ANS stream"); + } + return true; +} + +namespace { + +Status DecodeCoeffOrder(AcStrategy acs, coeff_order_t* order, BitReader* br, + ANSSymbolReader* reader, + const std::vector& context_map) { + PROFILER_FUNC; + const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); + const size_t size = kDCTBlockSize * llf; + + JXL_RETURN_IF_ERROR( + ReadPermutation(llf, size, order, br, reader, context_map)); + if (order == nullptr) return true; + const coeff_order_t* natural_coeff_order = acs.NaturalCoeffOrder(); + for (size_t k = 0; k < size; ++k) { + order[k] = natural_coeff_order[order[k]]; + } + return true; +} + +} // namespace + +Status DecodeCoeffOrders(uint16_t used_orders, uint32_t used_acs, + coeff_order_t* order, BitReader* br) { + uint16_t computed = 0; + std::vector context_map; + ANSCode code; + std::unique_ptr reader; + // Bitstream does not have histograms if no coefficient order is used. 
+ if (used_orders != 0) { + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kPermutationContexts, &code, &context_map)); + reader = make_unique(&code, br); + } + uint32_t acs_mask = 0; + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + if ((used_acs & (1 << o)) == 0) continue; + acs_mask |= 1 << kStrategyOrder[o]; + } + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + bool used = (acs_mask & (1 << ord)) != 0; + if ((used_orders & (1 << ord)) == 0) { + // No need to set the default order if no ACS uses this order. + if (used) { + for (size_t c = 0; c < 3; c++) { + SetDefaultOrder(acs, &order[CoeffOrderOffset(ord, c)]); + } + } + } else { + for (size_t c = 0; c < 3; c++) { + coeff_order_t* dest = used ? &order[CoeffOrderOffset(ord, c)] : nullptr; + JXL_RETURN_IF_ERROR( + DecodeCoeffOrder(acs, dest, br, reader.get(), context_map)); + } + } + } + if (used_orders && !reader->CheckANSFinalState()) { + return JXL_FAILURE("Invalid ANS stream"); + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.h new file mode 100644 index 0000000000..c600b7b3bf --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order.h @@ -0,0 +1,66 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_COEFF_ORDER_H_ +#define LIB_JXL_COEFF_ORDER_H_ + +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_bit_reader.h" + +namespace jxl { + +// Those offsets get multiplied by kDCTBlockSize. +static constexpr size_t kCoeffOrderOffset[] = { + 0, 1, 2, 3, 4, 5, 6, 10, 14, 18, + 34, 50, 66, 68, 70, 72, 76, 80, 84, 92, + 100, 108, 172, 236, 300, 332, 364, 396, 652, 908, + 1164, 1292, 1420, 1548, 2572, 3596, 4620, 5132, 5644, 6156, +}; +static_assert(3 * kNumOrders + 1 == + sizeof(kCoeffOrderOffset) / sizeof(*kCoeffOrderOffset), + "Update this array when adding or removing order types."); + +static constexpr size_t CoeffOrderOffset(size_t order, size_t c) { + return kCoeffOrderOffset[3 * order + c] * kDCTBlockSize; +} + +static constexpr size_t kCoeffOrderMaxSize = + kCoeffOrderOffset[3 * kNumOrders] * kDCTBlockSize; + +// Mapping from AC strategy to order bucket. Strategies with different natural +// orders must have different buckets. 
+constexpr uint8_t kStrategyOrder[] = { + 0, 1, 1, 1, 2, 3, 4, 4, 5, 5, 6, 6, 1, 1, + 1, 1, 1, 1, 7, 8, 8, 9, 10, 10, 11, 12, 12, +}; + +static_assert(AcStrategy::kNumValidStrategies == + sizeof(kStrategyOrder) / sizeof(*kStrategyOrder), + "Update this array when adding or removing AC strategies."); + +constexpr uint32_t kPermutationContexts = 8; + +uint32_t CoeffOrderContext(uint32_t val); + +void SetDefaultOrder(AcStrategy acs, coeff_order_t* JXL_RESTRICT order); + +Status DecodeCoeffOrders(uint16_t used_orders, uint32_t used_acs, + coeff_order_t* order, BitReader* br); + +Status DecodePermutation(size_t skip, size_t size, coeff_order_t* order, + BitReader* br); + +} // namespace jxl + +#endif // LIB_JXL_COEFF_ORDER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order_fwd.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order_fwd.h new file mode 100644 index 0000000000..700e9a83d4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order_fwd.h @@ -0,0 +1,47 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COEFF_ORDER_FWD_H_ +#define LIB_JXL_COEFF_ORDER_FWD_H_ + +// Breaks circular dependency between ac_strategy and coeff_order. + +#include +#include + +#include "base/compiler_specific.h" + +namespace jxl { + +// Needs at least 16 bits. A 32-bit type speeds up DecodeAC by 2% at the cost of +// more memory. +using coeff_order_t = uint32_t; + +// Maximum number of orders to be used. Note that this needs to be multiplied by +// the number of channels. One per "size class" (plus one extra for DCT8), +// shared between transforms of size XxY and of size YxX. +constexpr uint8_t kNumOrders = 13; + +// DCT coefficients are laid out in such a way that the number of rows of +// coefficients is always the smaller coordinate. 
+JXL_INLINE constexpr size_t CoefficientRows(size_t rows, size_t columns) { + return rows < columns ? rows : columns; +} + +JXL_INLINE constexpr size_t CoefficientColumns(size_t rows, size_t columns) { + return rows < columns ? columns : rows; +} + +JXL_INLINE void CoefficientLayout(size_t* JXL_RESTRICT rows, + size_t* JXL_RESTRICT columns) { + size_t r = *rows; + size_t c = *columns; + *rows = CoefficientRows(r, c); + *columns = CoefficientColumns(r, c); +} + +} // namespace jxl + +#endif // LIB_JXL_COEFF_ORDER_FWD_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order_test.cc new file mode 100644 index 0000000000..2408905001 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/coeff_order_test.cc @@ -0,0 +1,101 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/coeff_order.h" + +#include + +#include +#include // iota +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_coeff_order.h" + +namespace jxl { +namespace { + +void RoundtripPermutation(coeff_order_t* perm, coeff_order_t* out, size_t len, + size_t* size) { + BitWriter writer; + EncodePermutation(perm, 0, len, &writer, 0, nullptr); + writer.ZeroPadToByte(); + Status status = true; + { + BitReader reader(writer.GetSpan()); + BitReaderScopedCloser closer(&reader, &status); + ASSERT_TRUE(DecodePermutation(0, len, out, &reader)); + } + ASSERT_TRUE(status); + *size = writer.GetSpan().size(); +} + +enum Permutation { kIdentity, kFewSwaps, kFewSlides, kRandom }; + +constexpr size_t kNumReps = 128; +constexpr size_t kSwaps = 32; + +void TestPermutation(Permutation kind, size_t len) { + std::vector perm(len); + std::iota(perm.begin(), perm.end(), 
0); + std::mt19937 rng; + if (kind == kFewSwaps) { + std::uniform_int_distribution dist(0, len - 1); + for (size_t i = 0; i < kSwaps; i++) { + size_t a = dist(rng); + size_t b = dist(rng); + std::swap(perm[a], perm[b]); + } + } + if (kind == kFewSlides) { + std::uniform_int_distribution dist(0, len - 1); + for (size_t i = 0; i < kSwaps; i++) { + size_t a = dist(rng); + size_t b = dist(rng); + size_t from = std::min(a, b); + size_t to = std::max(a, b); + size_t start = perm[from]; + for (size_t j = from; j < to; j++) { + perm[j] = perm[j + 1]; + } + perm[to] = start; + } + } + if (kind == kRandom) { + std::shuffle(perm.begin(), perm.end(), rng); + } + std::vector out(len); + size_t size = 0; + for (size_t i = 0; i < kNumReps; i++) { + RoundtripPermutation(perm.data(), out.data(), len, &size); + for (size_t idx = 0; idx < len; idx++) { + EXPECT_EQ(perm[idx], out[idx]); + } + } + printf("Encoded size: %zu\n", size); +} + +TEST(CoeffOrderTest, IdentitySmall) { TestPermutation(kIdentity, 256); } +TEST(CoeffOrderTest, FewSlidesSmall) { TestPermutation(kFewSlides, 256); } +TEST(CoeffOrderTest, FewSwapsSmall) { TestPermutation(kFewSwaps, 256); } +TEST(CoeffOrderTest, RandomSmall) { TestPermutation(kRandom, 256); } + +TEST(CoeffOrderTest, IdentityMedium) { TestPermutation(kIdentity, 1 << 12); } +TEST(CoeffOrderTest, FewSlidesMedium) { TestPermutation(kFewSlides, 1 << 12); } +TEST(CoeffOrderTest, FewSwapsMedium) { TestPermutation(kFewSwaps, 1 << 12); } +TEST(CoeffOrderTest, RandomMedium) { TestPermutation(kRandom, 1 << 12); } + +TEST(CoeffOrderTest, IdentityBig) { TestPermutation(kIdentity, 1 << 16); } +TEST(CoeffOrderTest, FewSlidesBig) { TestPermutation(kFewSlides, 1 << 16); } +TEST(CoeffOrderTest, FewSwapsBig) { TestPermutation(kFewSwaps, 1 << 16); } +TEST(CoeffOrderTest, RandomBig) { TestPermutation(kRandom, 1 << 16); } + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc 
b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc new file mode 100644 index 0000000000..0a3899839b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.cc @@ -0,0 +1,782 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/color_encoding_internal.h" + +#include + +#include +#include + +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/linalg.h" + +namespace jxl { +namespace { + +// Highest reasonable value for the gamma of a transfer curve. +constexpr uint32_t kMaxGamma = 8192; + +// These strings are baked into Description - do not change. + +std::string ToString(ColorSpace color_space) { + switch (color_space) { + case ColorSpace::kRGB: + return "RGB"; + case ColorSpace::kGray: + return "Gra"; + case ColorSpace::kXYB: + return "XYB"; + case ColorSpace::kUnknown: + return "CS?"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid ColorSpace %u", static_cast(color_space)); +} + +std::string ToString(WhitePoint white_point) { + switch (white_point) { + case WhitePoint::kD65: + return "D65"; + case WhitePoint::kCustom: + return "Cst"; + case WhitePoint::kE: + return "EER"; + case WhitePoint::kDCI: + return "DCI"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid WhitePoint %u", static_cast(white_point)); +} + +std::string ToString(Primaries primaries) { + switch (primaries) { + case Primaries::kSRGB: + return "SRG"; + case Primaries::k2100: + return "202"; + case Primaries::kP3: + return "DCI"; + case Primaries::kCustom: + return "Cst"; + } + // Should not happen - visitor fails if enum is invalid. 
+ JXL_ABORT("Invalid Primaries %u", static_cast(primaries)); +} + +std::string ToString(TransferFunction transfer_function) { + switch (transfer_function) { + case TransferFunction::kSRGB: + return "SRG"; + case TransferFunction::kLinear: + return "Lin"; + case TransferFunction::k709: + return "709"; + case TransferFunction::kPQ: + return "PeQ"; + case TransferFunction::kHLG: + return "HLG"; + case TransferFunction::kDCI: + return "DCI"; + case TransferFunction::kUnknown: + return "TF?"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid TransferFunction %u", + static_cast(transfer_function)); +} + +std::string ToString(RenderingIntent rendering_intent) { + switch (rendering_intent) { + case RenderingIntent::kPerceptual: + return "Per"; + case RenderingIntent::kRelative: + return "Rel"; + case RenderingIntent::kSaturation: + return "Sat"; + case RenderingIntent::kAbsolute: + return "Abs"; + } + // Should not happen - visitor fails if enum is invalid. + JXL_ABORT("Invalid RenderingIntent %u", + static_cast(rendering_intent)); +} + +template +Status ParseEnum(const std::string& token, Enum* value) { + std::string str; + for (Enum e : Values()) { + if (ToString(e) == token) { + *value = e; + return true; + } + } + return false; +} + +class Tokenizer { + public: + Tokenizer(const std::string* input, char separator) + : input_(input), separator_(separator) {} + + Status Next(std::string* JXL_RESTRICT next) { + const size_t end = input_->find(separator_, start_); + if (end == std::string::npos) { + *next = input_->substr(start_); // rest of string + } else { + *next = input_->substr(start_, end - start_); + } + if (next->empty()) return JXL_FAILURE("Missing token"); + start_ = end + 1; + return true; + } + + private: + const std::string* const input_; // not owned + const char separator_; + size_t start_ = 0; // of next token +}; + +Status ParseDouble(const std::string& num, double* JXL_RESTRICT d) { + char* end; + errno = 0; + *d = 
strtod(num.c_str(), &end); + if (*d == 0.0 && end == num.c_str()) { + return JXL_FAILURE("Invalid double: %s", num.c_str()); + } + if (std::isnan(*d)) { + return JXL_FAILURE("Invalid double: %s", num.c_str()); + } + if (errno == ERANGE) { + return JXL_FAILURE("Double out of range: %s", num.c_str()); + } + return true; +} + +Status ParseDouble(Tokenizer* tokenizer, double* JXL_RESTRICT d) { + std::string num; + JXL_RETURN_IF_ERROR(tokenizer->Next(&num)); + return ParseDouble(num, d); +} + +Status ParseColorSpace(Tokenizer* JXL_RESTRICT tokenizer, + ColorEncoding* JXL_RESTRICT c) { + std::string str; + JXL_RETURN_IF_ERROR(tokenizer->Next(&str)); + ColorSpace cs; + if (ParseEnum(str, &cs)) { + c->SetColorSpace(cs); + return true; + } + + return JXL_FAILURE("Unknown ColorSpace %s", str.c_str()); +} + +Status ParseWhitePoint(Tokenizer* JXL_RESTRICT tokenizer, + ColorEncoding* JXL_RESTRICT c) { + if (c->ImplicitWhitePoint()) return true; + + std::string str; + JXL_RETURN_IF_ERROR(tokenizer->Next(&str)); + if (ParseEnum(str, &c->white_point)) return true; + + CIExy xy; + Tokenizer xy_tokenizer(&str, ';'); + JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, &xy.x)); + JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, &xy.y)); + if (c->SetWhitePoint(xy)) return true; + + return JXL_FAILURE("Invalid white point %s", str.c_str()); +} + +Status ParsePrimaries(Tokenizer* JXL_RESTRICT tokenizer, + ColorEncoding* JXL_RESTRICT c) { + if (!c->HasPrimaries()) return true; + + std::string str; + JXL_RETURN_IF_ERROR(tokenizer->Next(&str)); + if (ParseEnum(str, &c->primaries)) return true; + + PrimariesCIExy xy; + Tokenizer xy_tokenizer(&str, ';'); + JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, &xy.r.x)); + JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, &xy.r.y)); + JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, &xy.g.x)); + JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, &xy.g.y)); + JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, &xy.b.x)); + JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, 
&xy.b.y)); + if (c->SetPrimaries(xy)) return true; + + return JXL_FAILURE("Invalid primaries %s", str.c_str()); +} + +Status ParseRenderingIntent(Tokenizer* JXL_RESTRICT tokenizer, + ColorEncoding* JXL_RESTRICT c) { + std::string str; + JXL_RETURN_IF_ERROR(tokenizer->Next(&str)); + if (ParseEnum(str, &c->rendering_intent)) return true; + + return JXL_FAILURE("Invalid RenderingIntent %s\n", str.c_str()); +} + +Status ParseTransferFunction(Tokenizer* JXL_RESTRICT tokenizer, + ColorEncoding* JXL_RESTRICT c) { + if (c->tf.SetImplicit()) return true; + + std::string str; + JXL_RETURN_IF_ERROR(tokenizer->Next(&str)); + TransferFunction transfer_function; + if (ParseEnum(str, &transfer_function)) { + c->tf.SetTransferFunction(transfer_function); + return true; + } + + if (str[0] == 'g') { + double gamma; + JXL_RETURN_IF_ERROR(ParseDouble(str.substr(1), &gamma)); + if (c->tf.SetGamma(gamma)) return true; + } + + return JXL_FAILURE("Invalid gamma %s", str.c_str()); +} + +static double F64FromCustomxyI32(const int32_t i) { return i * 1E-6; } +static Status F64ToCustomxyI32(const double f, int32_t* JXL_RESTRICT i) { + if (!(-4 <= f && f <= 4)) { + return JXL_FAILURE("F64 out of bounds for CustomxyI32"); + } + *i = static_cast(roundf(f * 1E6)); + return true; +} + +} // namespace + +CIExy Customxy::Get() const { + CIExy xy; + xy.x = F64FromCustomxyI32(x); + xy.y = F64FromCustomxyI32(y); + return xy; +} + +Status Customxy::Set(const CIExy& xy) { + JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.x, &x)); + JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.y, &y)); + size_t extension_bits, total_bits; + if (!Bundle::CanEncode(*this, &extension_bits, &total_bits)) { + return JXL_FAILURE("Unable to encode XY %f %f", xy.x, xy.y); + } + return true; +} + +bool CustomTransferFunction::SetImplicit() { + if (nonserialized_color_space == ColorSpace::kXYB) { + if (!SetGamma(1.0 / 3)) JXL_ASSERT(false); + return true; + } + return false; +} + +Status CustomTransferFunction::SetGamma(double gamma) { + if 
(gamma < (1.0f / kMaxGamma) || gamma > 1.0) { + return JXL_FAILURE("Invalid gamma %f", gamma); + } + + have_gamma_ = false; + if (ApproxEq(gamma, 1.0)) { + transfer_function_ = TransferFunction::kLinear; + return true; + } + if (ApproxEq(gamma, 1.0 / 2.6)) { + transfer_function_ = TransferFunction::kDCI; + return true; + } + // Don't translate 0.45.. to kSRGB nor k709 - that might change pixel + // values because those curves also have a linear part. + + have_gamma_ = true; + gamma_ = roundf(gamma * kGammaMul); + transfer_function_ = TransferFunction::kUnknown; + return true; +} + +namespace { + +std::array CreateC2(const Primaries pr, + const TransferFunction tf) { + std::array c2; + + { + ColorEncoding* c_rgb = c2.data() + 0; + c_rgb->SetColorSpace(ColorSpace::kRGB); + c_rgb->white_point = WhitePoint::kD65; + c_rgb->primaries = pr; + c_rgb->tf.SetTransferFunction(tf); + JXL_CHECK(c_rgb->CreateICC()); + } + + { + ColorEncoding* c_gray = c2.data() + 1; + c_gray->SetColorSpace(ColorSpace::kGray); + c_gray->white_point = WhitePoint::kD65; + c_gray->primaries = pr; + c_gray->tf.SetTransferFunction(tf); + JXL_CHECK(c_gray->CreateICC()); + } + + return c2; +} + +} // namespace + +const ColorEncoding& ColorEncoding::SRGB(bool is_gray) { + static std::array c2 = + CreateC2(Primaries::kSRGB, TransferFunction::kSRGB); + return c2[is_gray]; +} +const ColorEncoding& ColorEncoding::LinearSRGB(bool is_gray) { + static std::array c2 = + CreateC2(Primaries::kSRGB, TransferFunction::kLinear); + return c2[is_gray]; +} + +CIExy ColorEncoding::GetWhitePoint() const { + JXL_DASSERT(have_fields_); + CIExy xy; + switch (white_point) { + case WhitePoint::kCustom: + return white_.Get(); + + case WhitePoint::kD65: + xy.x = 0.3127; + xy.y = 0.3290; + return xy; + + case WhitePoint::kDCI: + // From https://ieeexplore.ieee.org/document/7290729 C.2 page 11 + xy.x = 0.314; + xy.y = 0.351; + return xy; + + case WhitePoint::kE: + xy.x = xy.y = 1.0 / 3; + return xy; + } + JXL_ABORT("Invalid 
WhitePoint %u", static_cast(white_point)); +} + +Status ColorEncoding::SetWhitePoint(const CIExy& xy) { + JXL_DASSERT(have_fields_); + if (xy.x == 0.0 || xy.y == 0.0) { + return JXL_FAILURE("Invalid white point %f %f", xy.x, xy.y); + } + if (ApproxEq(xy.x, 0.3127) && ApproxEq(xy.y, 0.3290)) { + white_point = WhitePoint::kD65; + return true; + } + if (ApproxEq(xy.x, 1.0 / 3) && ApproxEq(xy.y, 1.0 / 3)) { + white_point = WhitePoint::kE; + return true; + } + if (ApproxEq(xy.x, 0.314) && ApproxEq(xy.y, 0.351)) { + white_point = WhitePoint::kDCI; + return true; + } + white_point = WhitePoint::kCustom; + return white_.Set(xy); +} + +PrimariesCIExy ColorEncoding::GetPrimaries() const { + JXL_DASSERT(have_fields_); + JXL_ASSERT(HasPrimaries()); + PrimariesCIExy xy; + switch (primaries) { + case Primaries::kCustom: + xy.r = red_.Get(); + xy.g = green_.Get(); + xy.b = blue_.Get(); + return xy; + + case Primaries::kSRGB: + xy.r.x = 0.639998686; + xy.r.y = 0.330010138; + xy.g.x = 0.300003784; + xy.g.y = 0.600003357; + xy.b.x = 0.150002046; + xy.b.y = 0.059997204; + return xy; + + case Primaries::k2100: + xy.r.x = 0.708; + xy.r.y = 0.292; + xy.g.x = 0.170; + xy.g.y = 0.797; + xy.b.x = 0.131; + xy.b.y = 0.046; + return xy; + + case Primaries::kP3: + xy.r.x = 0.680; + xy.r.y = 0.320; + xy.g.x = 0.265; + xy.g.y = 0.690; + xy.b.x = 0.150; + xy.b.y = 0.060; + return xy; + } + JXL_ABORT("Invalid Primaries %u", static_cast(primaries)); +} + +Status ColorEncoding::SetPrimaries(const PrimariesCIExy& xy) { + JXL_DASSERT(have_fields_); + JXL_ASSERT(HasPrimaries()); + if (xy.r.x == 0.0 || xy.r.y == 0.0 || xy.g.x == 0.0 || xy.g.y == 0.0 || + xy.b.x == 0.0 || xy.b.y == 0.0) { + return JXL_FAILURE("Invalid primaries %f %f %f %f %f %f", xy.r.x, xy.r.y, + xy.g.x, xy.g.y, xy.b.x, xy.b.y); + } + + if (ApproxEq(xy.r.x, 0.64) && ApproxEq(xy.r.y, 0.33) && + ApproxEq(xy.g.x, 0.30) && ApproxEq(xy.g.y, 0.60) && + ApproxEq(xy.b.x, 0.15) && ApproxEq(xy.b.y, 0.06)) { + primaries = Primaries::kSRGB; + 
return true; + } + + if (ApproxEq(xy.r.x, 0.708) && ApproxEq(xy.r.y, 0.292) && + ApproxEq(xy.g.x, 0.170) && ApproxEq(xy.g.y, 0.797) && + ApproxEq(xy.b.x, 0.131) && ApproxEq(xy.b.y, 0.046)) { + primaries = Primaries::k2100; + return true; + } + if (ApproxEq(xy.r.x, 0.680) && ApproxEq(xy.r.y, 0.320) && + ApproxEq(xy.g.x, 0.265) && ApproxEq(xy.g.y, 0.690) && + ApproxEq(xy.b.x, 0.150) && ApproxEq(xy.b.y, 0.060)) { + primaries = Primaries::kP3; + return true; + } + + primaries = Primaries::kCustom; + JXL_RETURN_IF_ERROR(red_.Set(xy.r)); + JXL_RETURN_IF_ERROR(green_.Set(xy.g)); + JXL_RETURN_IF_ERROR(blue_.Set(xy.b)); + return true; +} + +Status ColorEncoding::CreateICC() { + InternalRemoveICC(); + if (!MaybeCreateProfile(*this, &icc_)) { + return JXL_FAILURE("Failed to create profile from fields"); + } + return true; +} + +std::string Description(const ColorEncoding& c_in) { + // Copy required for Implicit* + ColorEncoding c = c_in; + + std::string d = ToString(c.GetColorSpace()); + + if (!c.ImplicitWhitePoint()) { + d += '_'; + if (c.white_point == WhitePoint::kCustom) { + const CIExy wp = c.GetWhitePoint(); + d += ToString(wp.x) + ';'; + d += ToString(wp.y); + } else { + d += ToString(c.white_point); + } + } + + if (c.HasPrimaries()) { + d += '_'; + if (c.primaries == Primaries::kCustom) { + const PrimariesCIExy pr = c.GetPrimaries(); + d += ToString(pr.r.x) + ';'; + d += ToString(pr.r.y) + ';'; + d += ToString(pr.g.x) + ';'; + d += ToString(pr.g.y) + ';'; + d += ToString(pr.b.x) + ';'; + d += ToString(pr.b.y); + } else { + d += ToString(c.primaries); + } + } + + d += '_'; + d += ToString(c.rendering_intent); + + if (!c.tf.SetImplicit()) { + d += '_'; + if (c.tf.IsGamma()) { + d += 'g'; + d += ToString(c.tf.GetGamma()); + } else { + d += ToString(c.tf.GetTransferFunction()); + } + } + + return d; +} + +Status ParseDescription(const std::string& description, + ColorEncoding* JXL_RESTRICT c) { + Tokenizer tokenizer(&description, '_'); + 
JXL_RETURN_IF_ERROR(ParseColorSpace(&tokenizer, c)); + JXL_RETURN_IF_ERROR(ParseWhitePoint(&tokenizer, c)); + JXL_RETURN_IF_ERROR(ParsePrimaries(&tokenizer, c)); + JXL_RETURN_IF_ERROR(ParseRenderingIntent(&tokenizer, c)); + JXL_RETURN_IF_ERROR(ParseTransferFunction(&tokenizer, c)); + return true; +} + +Customxy::Customxy() { Bundle::Init(this); } +Status Customxy::VisitFields(Visitor* JXL_RESTRICT visitor) { + uint32_t ux = PackSigned(x); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288), + BitsOffset(20, 1048576), + BitsOffset(21, 2097152), 0, &ux)); + x = UnpackSigned(ux); + uint32_t uy = PackSigned(y); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288), + BitsOffset(20, 1048576), + BitsOffset(21, 2097152), 0, &uy)); + y = UnpackSigned(uy); + return true; +} + +CustomTransferFunction::CustomTransferFunction() { Bundle::Init(this); } +Status CustomTransferFunction::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->Conditional(!SetImplicit())) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_gamma_)); + + if (visitor->Conditional(have_gamma_)) { + // Gamma is represented as a 24-bit int, the exponent used is + // gamma_ / 1e7. Valid values are (0, 1]. On the low end side, we also + // limit it to kMaxGamma/1e7. + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(24, kGammaMul, &gamma_)); + if (gamma_ > kGammaMul || + static_cast(gamma_) * kMaxGamma < kGammaMul) { + return JXL_FAILURE("Invalid gamma %u", gamma_); + } + } + + if (visitor->Conditional(!have_gamma_)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->Enum(TransferFunction::kSRGB, &transfer_function_)); + } + } + + return true; +} + +ColorEncoding::ColorEncoding() { Bundle::Init(this); } +Status ColorEncoding::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. 
+ visitor->SetDefault(this); + return true; + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &want_icc_)); + + // Always send even if want_icc_ because this affects decoding. + // We can skip the white point/primaries because they do not. + JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ColorSpace::kRGB, &color_space_)); + + if (visitor->Conditional(!WantICC())) { + // Serialize enums. NOTE: we set the defaults to the most common values so + // ImageMetadata.all_default is true in the common case. + + if (visitor->Conditional(!ImplicitWhitePoint())) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(WhitePoint::kD65, &white_point)); + if (visitor->Conditional(white_point == WhitePoint::kCustom)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&white_)); + } + } + + if (visitor->Conditional(HasPrimaries())) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(Primaries::kSRGB, &primaries)); + if (visitor->Conditional(primaries == Primaries::kCustom)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&red_)); + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&green_)); + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blue_)); + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tf)); + + JXL_QUIET_RETURN_IF_ERROR( + visitor->Enum(RenderingIntent::kRelative, &rendering_intent)); + + // We didn't have ICC, so all fields should be known. + if (color_space_ == ColorSpace::kUnknown || tf.IsUnknown()) { + return JXL_FAILURE( + "No ICC but cs %u and tf %u%s", + static_cast(color_space_), + tf.IsGamma() ? 0 + : static_cast(tf.GetTransferFunction()), + tf.IsGamma() ? "(gamma)" : ""); + } + + JXL_RETURN_IF_ERROR(CreateICC()); + } + + if (WantICC() && visitor->IsReading()) { + // Haven't called SetICC() yet, do nothing. 
+ } else { + if (ICC().empty()) return JXL_FAILURE("Empty ICC"); + } + + return true; +} + +void ConvertInternalToExternalColorEncoding(const ColorEncoding& internal, + JxlColorEncoding* external) { + external->color_space = static_cast(internal.GetColorSpace()); + + external->white_point = static_cast(internal.white_point); + + jxl::CIExy whitepoint = internal.GetWhitePoint(); + external->white_point_xy[0] = whitepoint.x; + external->white_point_xy[1] = whitepoint.y; + + if (external->color_space == JXL_COLOR_SPACE_RGB || + external->color_space == JXL_COLOR_SPACE_UNKNOWN) { + external->primaries = static_cast(internal.primaries); + jxl::PrimariesCIExy primaries = internal.GetPrimaries(); + external->primaries_red_xy[0] = primaries.r.x; + external->primaries_red_xy[1] = primaries.r.y; + external->primaries_green_xy[0] = primaries.g.x; + external->primaries_green_xy[1] = primaries.g.y; + external->primaries_blue_xy[0] = primaries.b.x; + external->primaries_blue_xy[1] = primaries.b.y; + } + + if (internal.tf.IsGamma()) { + external->transfer_function = JXL_TRANSFER_FUNCTION_GAMMA; + external->gamma = internal.tf.GetGamma(); + } else { + external->transfer_function = + static_cast(internal.tf.GetTransferFunction()); + external->gamma = 0; + } + + external->rendering_intent = + static_cast(internal.rendering_intent); +} + +Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external, + ColorEncoding* internal) { + internal->SetColorSpace(static_cast(external.color_space)); + + CIExy wp; + wp.x = external.white_point_xy[0]; + wp.y = external.white_point_xy[1]; + JXL_RETURN_IF_ERROR(internal->SetWhitePoint(wp)); + + if (external.color_space == JXL_COLOR_SPACE_RGB || + external.color_space == JXL_COLOR_SPACE_UNKNOWN) { + internal->primaries = static_cast(external.primaries); + PrimariesCIExy primaries; + primaries.r.x = external.primaries_red_xy[0]; + primaries.r.y = external.primaries_red_xy[1]; + primaries.g.x = external.primaries_green_xy[0]; + 
primaries.g.y = external.primaries_green_xy[1]; + primaries.b.x = external.primaries_blue_xy[0]; + primaries.b.y = external.primaries_blue_xy[1]; + JXL_RETURN_IF_ERROR(internal->SetPrimaries(primaries)); + } + CustomTransferFunction tf; + if (external.transfer_function == JXL_TRANSFER_FUNCTION_GAMMA) { + JXL_RETURN_IF_ERROR(tf.SetGamma(external.gamma)); + } else { + tf.SetTransferFunction( + static_cast(external.transfer_function)); + } + internal->tf = tf; + + internal->rendering_intent = + static_cast(external.rendering_intent); + + return true; +} + +/* Chromatic adaptation matrices*/ +static const float kBradford[9] = { + 0.8951f, 0.2664f, -0.1614f, -0.7502f, 1.7135f, + 0.0367f, 0.0389f, -0.0685f, 1.0296f, +}; + +static const float kBradfordInv[9] = { + 0.9869929f, -0.1470543f, 0.1599627f, 0.4323053f, 0.5183603f, + 0.0492912f, -0.0085287f, 0.0400428f, 0.9684867f, +}; + +// Adapts whitepoint x, y to D50 +Status AdaptToXYZD50(float wx, float wy, float matrix[9]) { + if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) { + // Out of range values can cause division through zero + // further down with the bradford adaptation too. 
+ return JXL_FAILURE("Invalid white point"); + } + float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy}; + // 1 / tiny float can still overflow + JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2])); + float w50[3] = {0.96422f, 1.0f, 0.82521f}; + + float lms[3]; + float lms50[3]; + + MatMul(kBradford, w, 3, 3, 1, lms); + MatMul(kBradford, w50, 3, 3, 1, lms50); + + float a[9] = { + lms50[0] / lms[0], 0, 0, 0, lms50[1] / lms[1], 0, 0, 0, lms50[2] / lms[2], + }; + + float b[9]; + MatMul(a, kBradford, 3, 3, 3, b); + MatMul(kBradfordInv, b, 3, 3, 3, matrix); + + return true; +} + +Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx, + float by, float wx, float wy, float matrix[9]) { + if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) { + return JXL_FAILURE("Invalid white point"); + } + // TODO(lode): also require rx, ry, gx, gy, bx, to be in range 0-1? ICC + // profiles in theory forbid negative XYZ values, but in practice the ACES P0 + // color space uses a negative y for the blue primary. 
+ float primaries[9] = { + rx, gx, bx, ry, gy, by, 1.0f - rx - ry, 1.0f - gx - gy, 1.0f - bx - by}; + float primaries_inv[9]; + memcpy(primaries_inv, primaries, sizeof(float) * 9); + JXL_RETURN_IF_ERROR(Inv3x3Matrix(primaries_inv)); + + float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy}; + // 1 / tiny float can still overflow + JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2])); + float xyz[3]; + MatMul(primaries_inv, w, 3, 3, 1, xyz); + + float a[9] = { + xyz[0], 0, 0, 0, xyz[1], 0, 0, 0, xyz[2], + }; + + float toXYZ[9]; + MatMul(primaries, a, 3, 3, 3, toXYZ); + + float d50[9]; + JXL_RETURN_IF_ERROR(AdaptToXYZD50(wx, wy, d50)); + + MatMul(d50, toXYZ, 3, 3, 3, matrix); + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.h new file mode 100644 index 0000000000..13ee3b433f --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal.h @@ -0,0 +1,462 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COLOR_ENCODING_INTERNAL_H_ +#define LIB_JXL_COLOR_ENCODING_INTERNAL_H_ + +// Metadata for color space conversions. + +#include +#include +#include + +#include // std::abs +#include +#include +#include + +#include "jxl/color_encoding.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +// (All CIE units are for the standard 1931 2 degree observer) + +// Color space the color pixel data is encoded in. The color pixel data is +// 3-channel in all cases except in case of kGray, where it uses only 1 channel. +// This also determines the amount of channels used in modular encoding. 
+enum class ColorSpace : uint32_t { + // Trichromatic color data. This also includes CMYK if a kBlack + // ExtraChannelInfo is present. This implies, if there is an ICC profile, that + // the ICC profile uses a 3-channel color space if no kBlack extra channel is + // present, or uses color space 'CMYK' if a kBlack extra channel is present. + kRGB, + // Single-channel data. This implies, if there is an ICC profile, that the ICC + // profile also represents single-channel data and has the appropriate color + // space ('GRAY'). + kGray, + // Like kRGB, but implies fixed values for primaries etc. + kXYB, + // For non-RGB/gray data, e.g. from non-electro-optical sensors. Otherwise + // the same conditions as kRGB apply. + kUnknown +}; + +static inline const char* EnumName(ColorSpace /*unused*/) { + return "ColorSpace"; +} +static inline constexpr uint64_t EnumBits(ColorSpace /*unused*/) { + using CS = ColorSpace; + return MakeBit(CS::kRGB) | MakeBit(CS::kGray) | MakeBit(CS::kXYB) | + MakeBit(CS::kUnknown); +} + +// Values from CICP ColourPrimaries. 
+enum class WhitePoint : uint32_t { + kD65 = 1, // sRGB/BT.709/Display P3/BT.2020 + kCustom = 2, // Actual values encoded in separate fields + kE = 10, // XYZ + kDCI = 11, // DCI-P3 +}; + +static inline const char* EnumName(WhitePoint /*unused*/) { + return "WhitePoint"; +} +static inline constexpr uint64_t EnumBits(WhitePoint /*unused*/) { + return MakeBit(WhitePoint::kD65) | MakeBit(WhitePoint::kCustom) | + MakeBit(WhitePoint::kE) | MakeBit(WhitePoint::kDCI); +} + +// Values from CICP ColourPrimaries +enum class Primaries : uint32_t { + kSRGB = 1, // Same as BT.709 + kCustom = 2, // Actual values encoded in separate fields + k2100 = 9, // Same as BT.2020 + kP3 = 11, +}; + +static inline const char* EnumName(Primaries /*unused*/) { return "Primaries"; } +static inline constexpr uint64_t EnumBits(Primaries /*unused*/) { + using Pr = Primaries; + return MakeBit(Pr::kSRGB) | MakeBit(Pr::kCustom) | MakeBit(Pr::k2100) | + MakeBit(Pr::kP3); +} + +// Values from CICP TransferCharacteristics +enum TransferFunction : uint32_t { + k709 = 1, + kUnknown = 2, + kLinear = 8, + kSRGB = 13, + kPQ = 16, // from BT.2100 + kDCI = 17, // from SMPTE RP 431-2 reference projector + kHLG = 18, // from BT.2100 +}; + +static inline const char* EnumName(TransferFunction /*unused*/) { + return "TransferFunction"; +} +static inline constexpr uint64_t EnumBits(TransferFunction /*unused*/) { + using TF = TransferFunction; + return MakeBit(TF::k709) | MakeBit(TF::kLinear) | MakeBit(TF::kSRGB) | + MakeBit(TF::kPQ) | MakeBit(TF::kDCI) | MakeBit(TF::kHLG) | + MakeBit(TF::kUnknown); +} + +enum class RenderingIntent : uint32_t { + // Values match ICC sRGB encodings. + kPerceptual = 0, // good for photos, requires a profile with LUT. + kRelative, // good for logos. + kSaturation, // perhaps useful for CG with fully saturated colors. + kAbsolute, // leaves white point unchanged; good for proofing. 
+}; + +static inline const char* EnumName(RenderingIntent /*unused*/) { + return "RenderingIntent"; +} +static inline constexpr uint64_t EnumBits(RenderingIntent /*unused*/) { + using RI = RenderingIntent; + return MakeBit(RI::kPerceptual) | MakeBit(RI::kRelative) | + MakeBit(RI::kSaturation) | MakeBit(RI::kAbsolute); +} + +// Chromaticity (Y is omitted because it is 1 for primaries/white points) +struct CIExy { + double x = 0.0; + double y = 0.0; +}; + +struct PrimariesCIExy { + CIExy r; + CIExy g; + CIExy b; +}; + +// Serializable form of CIExy. +struct Customxy : public Fields { + Customxy(); + const char* Name() const override { return "Customxy"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + CIExy Get() const; + // Returns false if x or y do not fit in the encoding. + Status Set(const CIExy& xy); + + int32_t x; + int32_t y; +}; + +struct CustomTransferFunction : public Fields { + CustomTransferFunction(); + const char* Name() const override { return "CustomTransferFunction"; } + + // Sets fields and returns true if nonserialized_color_space has an implicit + // transfer function, otherwise leaves fields unchanged and returns false. 
+ bool SetImplicit(); + + // Gamma: only used for PNG inputs + bool IsGamma() const { return have_gamma_; } + double GetGamma() const { + JXL_ASSERT(IsGamma()); + return gamma_ * 1E-7; // (0, 1) + } + Status SetGamma(double gamma); + + TransferFunction GetTransferFunction() const { + JXL_ASSERT(!IsGamma()); + return transfer_function_; + } + void SetTransferFunction(const TransferFunction tf) { + have_gamma_ = false; + transfer_function_ = tf; + } + + bool IsUnknown() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kUnknown); + } + bool IsSRGB() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kSRGB); + } + bool IsLinear() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kLinear); + } + bool IsPQ() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kPQ); + } + bool IsHLG() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kHLG); + } + bool Is709() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::k709); + } + bool IsDCI() const { + return !have_gamma_ && (transfer_function_ == TransferFunction::kDCI); + } + bool IsSame(const CustomTransferFunction& other) const { + if (have_gamma_ != other.have_gamma_) return false; + if (have_gamma_) { + if (gamma_ != other.gamma_) return false; + } else { + if (transfer_function_ != other.transfer_function_) return false; + } + return true; + } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Must be set before calling VisitFields! + ColorSpace nonserialized_color_space = ColorSpace::kRGB; + + private: + static constexpr uint32_t kGammaMul = 10000000; + + bool have_gamma_; + + // OETF exponent to go from linear to gamma-compressed. + uint32_t gamma_; // Only used if have_gamma_. + + // Can be kUnknown. + TransferFunction transfer_function_; // Only used if !have_gamma_. 
+}; + +// Compact encoding of data required to interpret and translate pixels to a +// known color space. Stored in Metadata. Thread-compatible. +struct ColorEncoding : public Fields { + ColorEncoding(); + const char* Name() const override { return "ColorEncoding"; } + + // Returns ready-to-use color encodings (initialized on-demand). + static const ColorEncoding& SRGB(bool is_gray = false); + static const ColorEncoding& LinearSRGB(bool is_gray = false); + + // Returns true if an ICC profile was successfully created from fields. + // Must be called after modifying fields. Defined in color_management.cc. + Status CreateICC(); + + // Returns non-empty and valid ICC profile, unless: + // - between calling InternalRemoveICC() and CreateICC() in tests; + // - WantICC() == true and SetICC() was not yet called; + // - after a failed call to SetSRGB(), SetICC(), or CreateICC(). + const PaddedBytes& ICC() const { return icc_; } + + // Internal only, do not call except from tests. + void InternalRemoveICC() { icc_.clear(); } + + // Returns true if `icc` is assigned and decoded successfully. If so, + // subsequent WantICC() will return true until DecideIfWantICC() changes it. + // Returning false indicates data has been lost. + Status SetICC(PaddedBytes&& icc) { + if (icc.empty()) return false; + icc_ = std::move(icc); + + if (!SetFieldsFromICC()) { + InternalRemoveICC(); + return false; + } + + want_icc_ = true; + return true; + } + + // Sets the raw ICC profile bytes, without parsing the ICC, and without + // updating the direct fields such as whitepoint, primaries and color + // space. Functions to get and set fields, such as SetWhitePoint, cannot be + // used anymore after this and functions such as IsSRGB return false no matter + // what the contents of the icc profile. 
+ Status SetICCRaw(PaddedBytes&& icc) { + if (icc.empty()) return false; + icc_ = std::move(icc); + + want_icc_ = true; + have_fields_ = false; + return true; + } + + // Returns whether to send the ICC profile in the codestream. + bool WantICC() const { return want_icc_; } + + // Return whether the direct fields are set, if false but ICC is set, only + // raw ICC bytes are known. + bool HaveFields() const { return have_fields_; } + + // Causes WantICC() to return false if ICC() can be reconstructed from fields. + // Defined in color_management.cc. + void DecideIfWantICC(); + + bool IsGray() const { return color_space_ == ColorSpace::kGray; } + size_t Channels() const { return IsGray() ? 1 : 3; } + + // Returns false if the field is invalid and unusable. + bool HasPrimaries() const { + return !IsGray() && color_space_ != ColorSpace::kXYB; + } + + // Returns true after setting the field to a value defined by color_space, + // otherwise false and leaves the field unchanged. + bool ImplicitWhitePoint() { + if (color_space_ == ColorSpace::kXYB) { + white_point = WhitePoint::kD65; + return true; + } + return false; + } + + // Returns whether the color space is known to be sRGB. If a raw unparsed ICC + // profile is set without the fields being set, this returns false, even if + // the content of the ICC profile would match sRGB. + bool IsSRGB() const { + if (!have_fields_) return false; + if (!IsGray() && color_space_ != ColorSpace::kRGB) return false; + if (white_point != WhitePoint::kD65) return false; + if (primaries != Primaries::kSRGB) return false; + if (!tf.IsSRGB()) return false; + return true; + } + + // Returns whether the color space is known to be linear sRGB. If a raw + // unparsed ICC profile is set without the fields being set, this returns + // false, even if the content of the ICC profile would match linear sRGB. 
+ bool IsLinearSRGB() const { + if (!have_fields_) return false; + if (!IsGray() && color_space_ != ColorSpace::kRGB) return false; + if (white_point != WhitePoint::kD65) return false; + if (primaries != Primaries::kSRGB) return false; + if (!tf.IsLinear()) return false; + return true; + } + + Status SetSRGB(const ColorSpace cs, + const RenderingIntent ri = RenderingIntent::kRelative) { + InternalRemoveICC(); + JXL_ASSERT(cs == ColorSpace::kGray || cs == ColorSpace::kRGB); + color_space_ = cs; + white_point = WhitePoint::kD65; + primaries = Primaries::kSRGB; + tf.SetTransferFunction(TransferFunction::kSRGB); + rendering_intent = ri; + return CreateICC(); + } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Accessors ensure tf.nonserialized_color_space is updated at the same time. + ColorSpace GetColorSpace() const { return color_space_; } + void SetColorSpace(const ColorSpace cs) { + color_space_ = cs; + tf.nonserialized_color_space = cs; + } + + CIExy GetWhitePoint() const; + Status SetWhitePoint(const CIExy& xy); + + PrimariesCIExy GetPrimaries() const; + Status SetPrimaries(const PrimariesCIExy& xy); + + // Checks if the color spaces (including white point / primaries) are the + // same, but ignores the transfer function, rendering intent and ICC bytes. 
+ bool SameColorSpace(const ColorEncoding& other) const { + if (color_space_ != other.color_space_) return false; + + if (white_point != other.white_point) return false; + if (white_point == WhitePoint::kCustom) { + if (white_.x != other.white_.x || white_.y != other.white_.y) + return false; + } + + if (HasPrimaries() != other.HasPrimaries()) return false; + if (HasPrimaries()) { + if (primaries != other.primaries) return false; + if (primaries == Primaries::kCustom) { + if (red_.x != other.red_.x || red_.y != other.red_.y) return false; + if (green_.x != other.green_.x || green_.y != other.green_.y) + return false; + if (blue_.x != other.blue_.x || blue_.y != other.blue_.y) return false; + } + } + return true; + } + + // Checks if the color space and transfer function are the same, ignoring + // rendering intent and ICC bytes + bool SameColorEncoding(const ColorEncoding& other) const { + return SameColorSpace(other) && tf.IsSame(other.tf); + } + + mutable bool all_default; + + // Only valid if HaveFields() + WhitePoint white_point; + Primaries primaries; // Only valid if HasPrimaries() + CustomTransferFunction tf; + RenderingIntent rendering_intent; + + private: + // Returns true if all fields have been initialized (possibly to kUnknown). + // Returns false if the ICC profile is invalid or decoding it fails. + // Defined in color_management.cc. + Status SetFieldsFromICC(); + + // If true, the codestream contains an ICC profile and we do not serialize + // fields. Otherwise, fields are serialized and we create an ICC profile. + bool want_icc_; + + // When false, fields such as white_point and tf are invalid and must not be + // used. This occurs after setting a raw bytes-only ICC profile, only the + // ICC bytes may be used. The color_space_ field is still valid. + bool have_fields_ = true; + + PaddedBytes icc_; // Valid ICC profile + + ColorSpace color_space_; // Can be kUnknown + + // Only used if white_point == kCustom. 
+ Customxy white_; + + // Only used if primaries == kCustom. + Customxy red_; + Customxy green_; + Customxy blue_; +}; + +// Returns whether the two inputs are approximately equal. +static inline bool ApproxEq(const double a, const double b, +#if JPEGXL_ENABLE_SKCMS + double max_l1 = 1E-3) { +#else + double max_l1 = 8E-5) { +#endif + // Threshold should be sufficient for ICC's 15-bit fixed-point numbers. + // We have seen differences of 7.1E-5 with lcms2 and 1E-3 with skcms. + return std::abs(a - b) <= max_l1; +} + +// Returns a representation of the ColorEncoding fields (not icc). +// Example description: "RGB_D65_SRG_Rel_Lin" +std::string Description(const ColorEncoding& c); +Status ParseDescription(const std::string& description, + ColorEncoding* JXL_RESTRICT c); + +static inline std::ostream& operator<<(std::ostream& os, + const ColorEncoding& c) { + return os << Description(c); +} + +void ConvertInternalToExternalColorEncoding(const jxl::ColorEncoding& internal, + JxlColorEncoding* external); + +Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external, + jxl::ColorEncoding* internal); + +Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx, + float by, float wx, float wy, float matrix[9]); +Status AdaptToXYZD50(float wx, float wy, float matrix[9]); + +} // namespace jxl + +#endif // LIB_JXL_COLOR_ENCODING_INTERNAL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal_test.cc new file mode 100644 index 0000000000..16393813aa --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_encoding_internal_test.cc @@ -0,0 +1,174 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/color_encoding_internal.h" + +#include + +#include "gtest/gtest.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/test_utils.h" + +namespace jxl { +namespace { + +TEST(ColorEncodingTest, RoundTripAll) { + for (const test::ColorEncodingDescriptor& cdesc : test::AllEncodings()) { + const ColorEncoding c_original = test::ColorEncodingFromDescriptor(cdesc); + // Verify Set(Get) yields the same white point/primaries/gamma. + { + ColorEncoding c; + EXPECT_TRUE(c.SetWhitePoint(c_original.GetWhitePoint())); + EXPECT_EQ(c_original.white_point, c.white_point); + } + { + ColorEncoding c; + EXPECT_TRUE(c.SetPrimaries(c_original.GetPrimaries())); + EXPECT_EQ(c_original.primaries, c.primaries); + } + if (c_original.tf.IsGamma()) { + ColorEncoding c; + EXPECT_TRUE(c.tf.SetGamma(c_original.tf.GetGamma())); + EXPECT_TRUE(c_original.tf.IsSame(c.tf)); + } + + // Verify ParseDescription(Description) yields the same ColorEncoding + { + const std::string description = Description(c_original); + printf("%s\n", description.c_str()); + ColorEncoding c; + EXPECT_TRUE(ParseDescription(description, &c)); + EXPECT_TRUE(c_original.SameColorEncoding(c)); + } + } +} + +// Verify Set(Get) for specific custom values + +TEST(ColorEncodingTest, NanGamma) { + const std::string description = "Gra_2_Per_gnan"; + ColorEncoding c; + EXPECT_FALSE(ParseDescription(description, &c)); +} + +TEST(ColorEncodingTest, CustomWhitePoint) { + ColorEncoding c; + // Nonsensical values + CIExy xy_in; + xy_in.x = 0.8; + xy_in.y = 0.01; + EXPECT_TRUE(c.SetWhitePoint(xy_in)); + const CIExy xy = c.GetWhitePoint(); + + ColorEncoding c2; + EXPECT_TRUE(c2.SetWhitePoint(xy)); + EXPECT_TRUE(c.SameColorSpace(c2)); +} + +TEST(ColorEncodingTest, CustomPrimaries) { + ColorEncoding c; + PrimariesCIExy xy_in; + // Nonsensical values + xy_in.r.x = -0.01; + xy_in.r.y = 0.2; + xy_in.g.x = 0.4; + xy_in.g.y = 0.401; + xy_in.b.x = 1.1; + xy_in.b.y = -1.2; + EXPECT_TRUE(c.SetPrimaries(xy_in)); + const 
PrimariesCIExy xy = c.GetPrimaries(); + + ColorEncoding c2; + EXPECT_TRUE(c2.SetPrimaries(xy)); + EXPECT_TRUE(c.SameColorSpace(c2)); +} + +TEST(ColorEncodingTest, CustomGamma) { + ColorEncoding c; +#ifndef JXL_CRASH_ON_ERROR + EXPECT_FALSE(c.tf.SetGamma(0.0)); + EXPECT_FALSE(c.tf.SetGamma(-1E-6)); + EXPECT_FALSE(c.tf.SetGamma(1.001)); +#endif + EXPECT_TRUE(c.tf.SetGamma(1.0)); + EXPECT_FALSE(c.tf.IsGamma()); + EXPECT_TRUE(c.tf.IsLinear()); + + EXPECT_TRUE(c.tf.SetGamma(0.123)); + EXPECT_TRUE(c.tf.IsGamma()); + const double gamma = c.tf.GetGamma(); + + ColorEncoding c2; + EXPECT_TRUE(c2.tf.SetGamma(gamma)); + EXPECT_TRUE(c.SameColorEncoding(c2)); + EXPECT_TRUE(c2.tf.IsGamma()); +} + +TEST(ColorEncodingTest, InternalExternalConversion) { + ColorEncoding source_internal; + JxlColorEncoding external; + ColorEncoding destination_internal; + + for (int i = 0; i < 100; i++) { + source_internal.SetColorSpace(static_cast(rand() % 4)); + CIExy wp; + wp.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + wp.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + EXPECT_TRUE(source_internal.SetWhitePoint(wp)); + if (source_internal.HasPrimaries()) { + PrimariesCIExy primaries; + primaries.r.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.r.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.g.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.g.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.b.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + primaries.b.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25; + EXPECT_TRUE(source_internal.SetPrimaries(primaries)); + } + CustomTransferFunction tf; + EXPECT_TRUE(tf.SetGamma((float(rand()) / float((RAND_MAX)) * 0.5) + 0.25)); + source_internal.tf = tf; + source_internal.rendering_intent = static_cast(rand() % 4); + + ConvertInternalToExternalColorEncoding(source_internal, &external); + EXPECT_TRUE(ConvertExternalToInternalColorEncoding(external, + 
&destination_internal)); + + EXPECT_EQ(source_internal.GetColorSpace(), + destination_internal.GetColorSpace()); + EXPECT_EQ(source_internal.white_point, destination_internal.white_point); + EXPECT_EQ(source_internal.GetWhitePoint().x, + destination_internal.GetWhitePoint().x); + EXPECT_EQ(source_internal.GetWhitePoint().y, + destination_internal.GetWhitePoint().y); + if (source_internal.HasPrimaries()) { + EXPECT_EQ(source_internal.GetPrimaries().r.x, + destination_internal.GetPrimaries().r.x); + EXPECT_EQ(source_internal.GetPrimaries().r.y, + destination_internal.GetPrimaries().r.y); + EXPECT_EQ(source_internal.GetPrimaries().g.x, + destination_internal.GetPrimaries().g.x); + EXPECT_EQ(source_internal.GetPrimaries().g.y, + destination_internal.GetPrimaries().g.y); + EXPECT_EQ(source_internal.GetPrimaries().b.x, + destination_internal.GetPrimaries().b.x); + EXPECT_EQ(source_internal.GetPrimaries().b.y, + destination_internal.GetPrimaries().b.y); + } + EXPECT_EQ(source_internal.tf.IsGamma(), destination_internal.tf.IsGamma()); + if (source_internal.tf.IsGamma()) { + EXPECT_EQ(source_internal.tf.GetGamma(), + destination_internal.tf.GetGamma()); + } else { + EXPECT_EQ(source_internal.tf.GetTransferFunction(), + destination_internal.tf.GetTransferFunction()); + } + EXPECT_EQ(source_internal.rendering_intent, + destination_internal.rendering_intent); + } +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc new file mode 100644 index 0000000000..feb5140d9a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.cc @@ -0,0 +1,433 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Defined by build system; this avoids IDE warnings. 
Must come before +// color_management.h (affects header definitions). +#ifndef JPEGXL_ENABLE_SKCMS +#define JPEGXL_ENABLE_SKCMS 0 +#endif + +#include "lib/jxl/color_management.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/color_management.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/linalg.h" // MatMul, Inv3x3Matrix +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// NOTE: this is only used to provide a reasonable ICC profile that other +// software can read. Our own transforms use ExtraTF instead because that is +// more precise and supports unbounded mode. +std::vector CreateTableCurve(uint32_t N, const ExtraTF tf) { + JXL_ASSERT(N <= 4096); // ICC MFT2 only allows 4K entries + JXL_ASSERT(tf == ExtraTF::kPQ || tf == ExtraTF::kHLG); + // No point using float - LCMS converts to 16-bit for A2B/MFT. + std::vector table(N); + for (uint32_t i = 0; i < N; ++i) { + const float x = static_cast(i) / (N - 1); // 1.0 at index N - 1. + const double dx = static_cast(x); + // LCMS requires EOTF (e.g. 2.4 exponent). + double y = (tf == ExtraTF::kHLG) ? TF_HLG().DisplayFromEncoded(dx) + : TF_PQ().DisplayFromEncoded(dx); + JXL_ASSERT(y >= 0.0); + // Clamp to table range - necessary for HLG. + if (y > 1.0) y = 1.0; + // 1.0 corresponds to table value 0xFFFF. + table[i] = static_cast(roundf(y * 65535.0)); + } + return table; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(CreateTableCurve); // Local function. + +Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]) { + // Target Y = 1. 
+ if (std::abs(xy.y) < 1e-12) return JXL_FAILURE("Y value is too small"); + const float factor = 1 / xy.y; + XYZ[0] = xy.x * factor; + XYZ[1] = 1; + XYZ[2] = (1 - xy.x - xy.y) * factor; + return true; +} + +namespace { + +// NOTE: this is only used to provide a reasonable ICC profile that other +// software can read. Our own transforms use ExtraTF instead because that is +// more precise and supports unbounded mode. +template +std::vector CreateTableCurve(uint32_t N, const Func& func) { + JXL_ASSERT(N <= 4096); // ICC MFT2 only allows 4K entries + // No point using float - LCMS converts to 16-bit for A2B/MFT. + std::vector table(N); + for (uint32_t i = 0; i < N; ++i) { + const float x = static_cast(i) / (N - 1); // 1.0 at index N - 1. + // LCMS requires EOTF (e.g. 2.4 exponent). + double y = func.DisplayFromEncoded(static_cast(x)); + JXL_ASSERT(y >= 0.0); + // Clamp to table range - necessary for HLG. + if (y > 1.0) y = 1.0; + // 1.0 corresponds to table value 0xFFFF. + table[i] = static_cast(roundf(y * 65535.0)); + } + return table; +} + +Status CreateICCChadMatrix(CIExy w, float result[9]) { + float m[9]; + if (w.y == 0) { // WhitePoint can not be pitch-black. + return JXL_FAILURE("Invalid WhitePoint"); + } + JXL_RETURN_IF_ERROR(AdaptToXYZD50(w.x, w.y, m)); + memcpy(result, m, sizeof(float) * 9); + return true; +} + +// Creates RGB to XYZ matrix given RGB primaries and whitepoint in xy. 
+Status CreateICCRGBMatrix(CIExy r, CIExy g, CIExy b, CIExy w, float result[9]) { + float m[9]; + JXL_RETURN_IF_ERROR( + PrimariesToXYZD50(r.x, r.y, g.x, g.y, b.x, b.y, w.x, w.y, m)); + memcpy(result, m, sizeof(float) * 9); + return true; +} + +void WriteICCUint32(uint32_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) { + if (icc->size() < pos + 4) icc->resize(pos + 4); + (*icc)[pos + 0] = (value >> 24u) & 255; + (*icc)[pos + 1] = (value >> 16u) & 255; + (*icc)[pos + 2] = (value >> 8u) & 255; + (*icc)[pos + 3] = value & 255; +} + +void WriteICCUint16(uint16_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) { + if (icc->size() < pos + 2) icc->resize(pos + 2); + (*icc)[pos + 0] = (value >> 8u) & 255; + (*icc)[pos + 1] = value & 255; +} + +// Writes a 4-character tag +void WriteICCTag(const char* value, size_t pos, PaddedBytes* JXL_RESTRICT icc) { + if (icc->size() < pos + 4) icc->resize(pos + 4); + memcpy(icc->data() + pos, value, 4); +} + +Status WriteICCS15Fixed16(float value, size_t pos, + PaddedBytes* JXL_RESTRICT icc) { + // "nextafterf" for 32768.0f towards zero are: + // 32767.998046875, 32767.99609375, 32767.994140625 + // Even the first value works well,... + bool ok = (-32767.995f <= value) && (value <= 32767.995f); + if (!ok) return JXL_FAILURE("ICC value is out of range / NaN"); + int32_t i = value * 65536.0f + 0.5f; + // Use two's complement + uint32_t u = static_cast(i); + WriteICCUint32(u, pos, icc); + return true; +} + +Status CreateICCHeader(const ColorEncoding& c, + PaddedBytes* JXL_RESTRICT header) { + // TODO(lode): choose color management engine name, e.g. "skia" if + // integrated in skia. + static const char* kCmm = "jxl "; + + header->resize(128, 0); + + WriteICCUint32(0, 0, header); // size, correct value filled in at end + WriteICCTag(kCmm, 4, header); + WriteICCUint32(0x04300000u, 8, header); + WriteICCTag("mntr", 12, header); + WriteICCTag(c.IsGray() ? 
"GRAY" : "RGB ", 16, header); + WriteICCTag("XYZ ", 20, header); + + // Three uint32_t's date/time encoding. + // TODO(lode): encode actual date and time, this is a placeholder + uint32_t year = 2019, month = 12, day = 1; + uint32_t hour = 0, minute = 0, second = 0; + WriteICCUint16(year, 24, header); + WriteICCUint16(month, 26, header); + WriteICCUint16(day, 28, header); + WriteICCUint16(hour, 30, header); + WriteICCUint16(minute, 32, header); + WriteICCUint16(second, 34, header); + + WriteICCTag("acsp", 36, header); + WriteICCTag("APPL", 40, header); + WriteICCUint32(0, 44, header); // flags + WriteICCUint32(0, 48, header); // device manufacturer + WriteICCUint32(0, 52, header); // device model + WriteICCUint32(0, 56, header); // device attributes + WriteICCUint32(0, 60, header); // device attributes + WriteICCUint32(static_cast(c.rendering_intent), 64, header); + + // Mandatory D50 white point of profile connection space + WriteICCUint32(0x0000f6d6, 68, header); + WriteICCUint32(0x00010000, 72, header); + WriteICCUint32(0x0000d32d, 76, header); + + WriteICCTag(kCmm, 80, header); + + return true; +} + +void AddToICCTagTable(const char* tag, size_t offset, size_t size, + PaddedBytes* JXL_RESTRICT tagtable, + std::vector* offsets) { + WriteICCTag(tag, tagtable->size(), tagtable); + // writing true offset deferred to later + WriteICCUint32(0, tagtable->size(), tagtable); + offsets->push_back(offset); + WriteICCUint32(size, tagtable->size(), tagtable); +} + +void FinalizeICCTag(PaddedBytes* JXL_RESTRICT tags, size_t* offset, + size_t* size) { + while ((tags->size() & 3) != 0) { + tags->push_back(0); + } + *offset += *size; + *size = tags->size() - *offset; +} + +// The input text must be ASCII, writing other characters to UTF-16 is not +// implemented. 
+void CreateICCMlucTag(const std::string& text, PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("mluc", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + WriteICCUint32(1, tags->size(), tags); + WriteICCUint32(12, tags->size(), tags); + WriteICCTag("enUS", tags->size(), tags); + WriteICCUint32(text.size() * 2, tags->size(), tags); + WriteICCUint32(28, tags->size(), tags); + for (size_t i = 0; i < text.size(); i++) { + tags->push_back(0); // prepend 0 for UTF-16 + tags->push_back(text[i]); + } +} + +Status CreateICCXYZTag(float xyz[3], PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("XYZ ", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + for (size_t i = 0; i < 3; ++i) { + JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(xyz[i], tags->size(), tags)); + } + return true; +} + +Status CreateICCChadTag(float chad[9], PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("sf32", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + for (size_t i = 0; i < 9; i++) { + JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(chad[i], tags->size(), tags)); + } + return true; +} + +void CreateICCCurvCurvTag(const std::vector& curve, + PaddedBytes* JXL_RESTRICT tags) { + size_t pos = tags->size(); + tags->resize(tags->size() + 12 + curve.size() * 2, 0); + WriteICCTag("curv", pos, tags); + WriteICCUint32(0, pos + 4, tags); + WriteICCUint32(curve.size(), pos + 8, tags); + for (size_t i = 0; i < curve.size(); i++) { + WriteICCUint16(curve[i], pos + 12 + i * 2, tags); + } +} + +Status CreateICCCurvParaTag(std::vector params, size_t curve_type, + PaddedBytes* JXL_RESTRICT tags) { + WriteICCTag("para", tags->size(), tags); + WriteICCUint32(0, tags->size(), tags); + WriteICCUint16(curve_type, tags->size(), tags); + WriteICCUint16(0, tags->size(), tags); + for (size_t i = 0; i < params.size(); i++) { + JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(params[i], tags->size(), tags)); + } + return true; +} +} // namespace + +Status MaybeCreateProfile(const ColorEncoding& c, + PaddedBytes* 
JXL_RESTRICT icc) { + PaddedBytes header, tagtable, tags; + + if (c.GetColorSpace() == ColorSpace::kUnknown || c.tf.IsUnknown()) { + return false; // Not an error + } + + switch (c.GetColorSpace()) { + case ColorSpace::kRGB: + case ColorSpace::kGray: + break; // OK + case ColorSpace::kXYB: + return JXL_FAILURE("XYB ICC not yet implemented"); + default: + return JXL_FAILURE("Invalid CS %u", + static_cast(c.GetColorSpace())); + } + + JXL_RETURN_IF_ERROR(CreateICCHeader(c, &header)); + + std::vector offsets; + // tag count, deferred to later + WriteICCUint32(0, tagtable.size(), &tagtable); + + size_t tag_offset = 0, tag_size = 0; + + CreateICCMlucTag(Description(c), &tags); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("desc", tag_offset, tag_size, &tagtable, &offsets); + + const std::string copyright = + "Copyright 2019 Google LLC, CC-BY-SA 3.0 Unported " + "license(https://creativecommons.org/licenses/by-sa/3.0/legalcode)"; + CreateICCMlucTag(copyright, &tags); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("cprt", tag_offset, tag_size, &tagtable, &offsets); + + // TODO(eustas): isn't it the other way round: gray image has d50 WhitePoint? 
+ if (c.IsGray()) { + float wtpt[3]; + JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(c.GetWhitePoint(), wtpt)); + JXL_RETURN_IF_ERROR(CreateICCXYZTag(wtpt, &tags)); + } else { + float d50[3] = {0.964203, 1.0, 0.824905}; + JXL_RETURN_IF_ERROR(CreateICCXYZTag(d50, &tags)); + } + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("wtpt", tag_offset, tag_size, &tagtable, &offsets); + + if (!c.IsGray()) { + // Chromatic adaptation matrix + float chad[9]; + JXL_RETURN_IF_ERROR(CreateICCChadMatrix(c.GetWhitePoint(), chad)); + + const PrimariesCIExy primaries = c.GetPrimaries(); + float m[9]; + JXL_RETURN_IF_ERROR(CreateICCRGBMatrix(primaries.r, primaries.g, + primaries.b, c.GetWhitePoint(), m)); + float r[3] = {m[0], m[3], m[6]}; + float g[3] = {m[1], m[4], m[7]}; + float b[3] = {m[2], m[5], m[8]}; + + JXL_RETURN_IF_ERROR(CreateICCChadTag(chad, &tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("chad", tag_offset, tag_size, &tagtable, &offsets); + + JXL_RETURN_IF_ERROR(CreateICCXYZTag(r, &tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("rXYZ", tag_offset, tag_size, &tagtable, &offsets); + + JXL_RETURN_IF_ERROR(CreateICCXYZTag(g, &tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("gXYZ", tag_offset, tag_size, &tagtable, &offsets); + + JXL_RETURN_IF_ERROR(CreateICCXYZTag(b, &tags)); + FinalizeICCTag(&tags, &tag_offset, &tag_size); + AddToICCTagTable("bXYZ", tag_offset, tag_size, &tagtable, &offsets); + } + + if (c.tf.IsGamma()) { + float gamma = 1.0 / c.tf.GetGamma(); + JXL_RETURN_IF_ERROR( + CreateICCCurvParaTag({gamma, 1.0, 0.0, 1.0, 0.0}, 3, &tags)); + } else { + switch (c.tf.GetTransferFunction()) { + case TransferFunction::kHLG: + CreateICCCurvCurvTag( + HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kHLG), &tags); + break; + case TransferFunction::kPQ: + CreateICCCurvCurvTag( + HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kPQ), &tags); + break; + case 
TransferFunction::kSRGB: + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag( + {2.4, 1.0 / 1.055, 0.055 / 1.055, 1.0 / 12.92, 0.04045}, 3, &tags)); + break; + case TransferFunction::k709: + JXL_RETURN_IF_ERROR(CreateICCCurvParaTag( + {1.0 / 0.45, 1.0 / 1.099, 0.099 / 1.099, 1.0 / 4.5, 0.081}, 3, + &tags)); + break; + case TransferFunction::kLinear: + JXL_RETURN_IF_ERROR( + CreateICCCurvParaTag({1.0, 1.0, 0.0, 1.0, 0.0}, 3, &tags)); + break; + case TransferFunction::kDCI: + JXL_RETURN_IF_ERROR( + CreateICCCurvParaTag({2.6, 1.0, 0.0, 1.0, 0.0}, 3, &tags)); + break; + default: + JXL_ABORT("Unknown TF %d", c.tf.GetTransferFunction()); + } + } + FinalizeICCTag(&tags, &tag_offset, &tag_size); + if (c.IsGray()) { + AddToICCTagTable("kTRC", tag_offset, tag_size, &tagtable, &offsets); + } else { + AddToICCTagTable("rTRC", tag_offset, tag_size, &tagtable, &offsets); + AddToICCTagTable("gTRC", tag_offset, tag_size, &tagtable, &offsets); + AddToICCTagTable("bTRC", tag_offset, tag_size, &tagtable, &offsets); + } + + // Tag count + WriteICCUint32(offsets.size(), 0, &tagtable); + for (size_t i = 0; i < offsets.size(); i++) { + WriteICCUint32(offsets[i] + header.size() + tagtable.size(), 4 + 12 * i + 4, + &tagtable); + } + + // ICC profile size + WriteICCUint32(header.size() + tagtable.size() + tags.size(), 0, &header); + + *icc = header; + icc->append(tagtable); + icc->append(tags); + + // rendering intent, and region of the checksum itself, set to 0. + // TODO(lode): manually verify with a reliable tool that this creates correct + // signature (profile id) for ICC profiles. 
+ PaddedBytes icc_sum = *icc; + memset(icc_sum.data() + 44, 0, 4); + memset(icc_sum.data() + 64, 0, 4); + + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.h new file mode 100644 index 0000000000..f728fe589a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management.h @@ -0,0 +1,38 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COLOR_MANAGEMENT_H_ +#define LIB_JXL_COLOR_MANAGEMENT_H_ + +// ICC profiles and color space conversions. + +#include +#include + +#include + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +enum class ExtraTF { + kNone, + kPQ, + kHLG, + kSRGB, +}; + +Status MaybeCreateProfile(const ColorEncoding& c, + PaddedBytes* JXL_RESTRICT icc); + +Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]); + +} // namespace jxl + +#endif // LIB_JXL_COLOR_MANAGEMENT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management_test.cc new file mode 100644 index 0000000000..0747e5c9e3 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/color_management_test.cc @@ -0,0 +1,237 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/color_management.h" + +#include +#include + +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" + +namespace jxl { + +std::ostream& operator<<(std::ostream& os, const CIExy& xy) { + return os << "{x=" << xy.x << ", y=" << xy.y << "}"; +} + +std::ostream& operator<<(std::ostream& os, const PrimariesCIExy& primaries) { + return os << "{r=" << primaries.r << ", g=" << primaries.g + << ", b=" << primaries.b << "}"; +} + +namespace { + +using ::testing::ElementsAre; +using ::testing::FloatNear; + +// Small enough to be fast. If changed, must update Generate*. +static constexpr size_t kWidth = 16; + +struct Globals { + // TODO(deymo): Make this a const. + static Globals* GetInstance() { + static Globals ret; + return &ret; + } + + private: + static constexpr size_t kNumThreads = 0; // only have a single row. 
+ + Globals() : pool(kNumThreads) { + in_gray = GenerateGray(); + in_color = GenerateColor(); + out_gray = ImageF(kWidth, 1); + out_color = ImageF(kWidth * 3, 1); + + c_native = ColorEncoding::LinearSRGB(/*is_gray=*/false); + c_gray = ColorEncoding::LinearSRGB(/*is_gray=*/true); + } + + static ImageF GenerateGray() { + ImageF gray(kWidth, 1); + float* JXL_RESTRICT row = gray.Row(0); + // Increasing left to right + for (uint32_t x = 0; x < kWidth; ++x) { + row[x] = x * 1.0f / (kWidth - 1); // [0, 1] + } + return gray; + } + + static ImageF GenerateColor() { + ImageF image(kWidth * 3, 1); + float* JXL_RESTRICT interleaved = image.Row(0); + std::fill(interleaved, interleaved + kWidth * 3, 0.0f); + + // [0, 4): neutral + for (int32_t x = 0; x < 4; ++x) { + interleaved[3 * x + 0] = x * 1.0f / 3; // [0, 1] + interleaved[3 * x + 2] = interleaved[3 * x + 1] = interleaved[3 * x + 0]; + } + + // [4, 13): pure RGB with low/medium/high saturation + for (int32_t c = 0; c < 3; ++c) { + interleaved[3 * (4 + c) + c] = 0.08f + c * 0.01f; + interleaved[3 * (7 + c) + c] = 0.75f + c * 0.01f; + interleaved[3 * (10 + c) + c] = 1.0f; + } + + // [13, 16): impure, not quite saturated RGB + interleaved[3 * 13 + 0] = 0.86f; + interleaved[3 * 13 + 2] = interleaved[3 * 13 + 1] = 0.16f; + interleaved[3 * 14 + 1] = 0.87f; + interleaved[3 * 14 + 2] = interleaved[3 * 14 + 0] = 0.16f; + interleaved[3 * 15 + 2] = 0.88f; + interleaved[3 * 15 + 1] = interleaved[3 * 15 + 0] = 0.16f; + + return image; + } + + public: + ThreadPoolInternal pool; + + // ImageF so we can use VerifyRelativeError; all are interleaved RGB. 
+ ImageF in_gray; + ImageF in_color; + ImageF out_gray; + ImageF out_color; + ColorEncoding c_native; + ColorEncoding c_gray; +}; + +class ColorManagementTest + : public ::testing::TestWithParam { + public: + static void VerifySameFields(const ColorEncoding& c, + const ColorEncoding& c2) { + ASSERT_EQ(c.rendering_intent, c2.rendering_intent); + ASSERT_EQ(c.GetColorSpace(), c2.GetColorSpace()); + ASSERT_EQ(c.white_point, c2.white_point); + if (c.HasPrimaries()) { + ASSERT_EQ(c.primaries, c2.primaries); + } + ASSERT_TRUE(c.tf.IsSame(c2.tf)); + } + + // "Same" pixels after converting g->c_native -> c -> g->c_native. + static void VerifyPixelRoundTrip(const ColorEncoding& c) { + Globals* g = Globals::GetInstance(); + const ColorEncoding& c_native = c.IsGray() ? g->c_gray : g->c_native; + ColorSpaceTransform xform_fwd; + ColorSpaceTransform xform_rev; + ASSERT_TRUE(xform_fwd.Init(c_native, c, kDefaultIntensityTarget, kWidth, + g->pool.NumThreads())); + ASSERT_TRUE(xform_rev.Init(c, c_native, kDefaultIntensityTarget, kWidth, + g->pool.NumThreads())); + + const size_t thread = 0; + const ImageF& in = c.IsGray() ? g->in_gray : g->in_color; + ImageF* JXL_RESTRICT out = c.IsGray() ? &g->out_gray : &g->out_color; + DoColorSpaceTransform(&xform_fwd, thread, in.Row(0), + xform_fwd.BufDst(thread)); + DoColorSpaceTransform(&xform_rev, thread, xform_fwd.BufDst(thread), + out->Row(0)); + +#if JPEGXL_ENABLE_SKCMS + double max_l1 = 7E-4; + double max_rel = 4E-7; +#else + double max_l1 = 5E-5; + // Most are lower; reached 3E-7 with D60 AP0. + double max_rel = 4E-7; +#endif + if (c.IsGray()) max_rel = 2E-5; + VerifyRelativeError(in, *out, max_l1, max_rel); + } +}; +JXL_GTEST_INSTANTIATE_TEST_SUITE_P(ColorManagementTestInstantiation, + ColorManagementTest, + ::testing::ValuesIn(test::AllEncodings())); + +// Exercises the ColorManagement interface for ALL ColorEncoding synthesizable +// via enums. 
+TEST_P(ColorManagementTest, VerifyAllProfiles) { + ColorEncoding c = ColorEncodingFromDescriptor(GetParam()); + printf("%s\n", Description(c).c_str()); + + // Can create profile. + ASSERT_TRUE(c.CreateICC()); + + // Can set an equivalent ColorEncoding from the generated ICC profile. + ColorEncoding c3; + ASSERT_TRUE(c3.SetICC(PaddedBytes(c.ICC()))); + VerifySameFields(c, c3); + + VerifyPixelRoundTrip(c); +} + +testing::Matcher CIExyIs(const double x, const double y) { + static constexpr double kMaxError = 1e-4; + return testing::AllOf( + testing::Field(&CIExy::x, testing::DoubleNear(x, kMaxError)), + testing::Field(&CIExy::y, testing::DoubleNear(y, kMaxError))); +} + +testing::Matcher PrimariesAre( + const testing::Matcher& r, const testing::Matcher& g, + const testing::Matcher& b) { + return testing::AllOf(testing::Field(&PrimariesCIExy::r, r), + testing::Field(&PrimariesCIExy::g, g), + testing::Field(&PrimariesCIExy::b, b)); +} + +TEST_F(ColorManagementTest, sRGBChromaticity) { + const ColorEncoding sRGB = ColorEncoding::SRGB(); + EXPECT_THAT(sRGB.GetWhitePoint(), CIExyIs(0.3127, 0.3290)); + EXPECT_THAT(sRGB.GetPrimaries(), + PrimariesAre(CIExyIs(0.64, 0.33), CIExyIs(0.30, 0.60), + CIExyIs(0.15, 0.06))); +} + +TEST_F(ColorManagementTest, D2700Chromaticity) { + PaddedBytes icc = ReadTestData("jxl/color_management/sRGB-D2700.icc"); + ColorEncoding sRGB_D2700; + ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc))); + + EXPECT_THAT(sRGB_D2700.GetWhitePoint(), CIExyIs(0.45986, 0.41060)); + // The illuminant-relative chromaticities of this profile's primaries are the + // same as for sRGB. It is the PCS-relative chromaticities that would be + // different. 
+ EXPECT_THAT(sRGB_D2700.GetPrimaries(), + PrimariesAre(CIExyIs(0.64, 0.33), CIExyIs(0.30, 0.60), + CIExyIs(0.15, 0.06))); +} + +TEST_F(ColorManagementTest, D2700ToSRGB) { + PaddedBytes icc = ReadTestData("jxl/color_management/sRGB-D2700.icc"); + ColorEncoding sRGB_D2700; + ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc))); + + ColorSpaceTransform transform; + ASSERT_TRUE(transform.Init(sRGB_D2700, ColorEncoding::SRGB(), + kDefaultIntensityTarget, 1, 1)); + const float sRGB_D2700_values[3] = {0.863, 0.737, 0.490}; + float sRGB_values[3]; + DoColorSpaceTransform(&transform, 0, sRGB_D2700_values, sRGB_values); + EXPECT_THAT(sRGB_values, + ElementsAre(FloatNear(0.914, 1e-3), FloatNear(0.745, 1e-3), + FloatNear(0.601, 1e-3))); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/common.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/common.h new file mode 100644 index 0000000000..a71216ecca --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/common.h @@ -0,0 +1,194 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COMMON_H_ +#define LIB_JXL_COMMON_H_ + +// Shared constants and helper functions. + +#include +#include +#include + +#include // numeric_limits +#include // unique_ptr +#include + +#include "lib/jxl/base/compiler_specific.h" + +#ifndef JXL_HIGH_PRECISION +#define JXL_HIGH_PRECISION 1 +#endif + +// Macro that defines whether support for decoding JXL files to JPEG is enabled. +#ifndef JPEGXL_ENABLE_TRANSCODE_JPEG +#define JPEGXL_ENABLE_TRANSCODE_JPEG 1 +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +namespace jxl { +// Some enums and typedefs used by more than one header file. 
+ +constexpr size_t kBitsPerByte = 8; // more clear than CHAR_BIT + +constexpr inline size_t RoundUpBitsToByteMultiple(size_t bits) { + return (bits + 7) & ~size_t(7); +} + +constexpr inline size_t RoundUpToBlockDim(size_t dim) { + return (dim + 7) & ~size_t(7); +} + +static inline bool JXL_MAYBE_UNUSED SafeAdd(const uint64_t a, const uint64_t b, + uint64_t& sum) { + sum = a + b; + return sum >= a; // no need to check b - either sum >= both or < both. +} + +template +constexpr inline T1 DivCeil(T1 a, T2 b) { + return (a + b - 1) / b; +} + +// Works for any `align`; if a power of two, compiler emits ADD+AND. +constexpr inline size_t RoundUpTo(size_t what, size_t align) { + return DivCeil(what, align) * align; +} + +constexpr double kPi = 3.14159265358979323846264338327950288; + +// Reasonable default for sRGB, matches common monitors. We map white to this +// many nits (cd/m^2) by default. Butteraugli was tuned for 250 nits, which is +// very close. +static constexpr float kDefaultIntensityTarget = 255; + +template +constexpr T Pi(T multiplier) { + return static_cast(multiplier * kPi); +} + +// Block is the square grid of pixels to which an "energy compaction" +// transformation (e.g. DCT) is applied. Each block has its own AC quantizer. +constexpr size_t kBlockDim = 8; + +constexpr size_t kDCTBlockSize = kBlockDim * kBlockDim; + +constexpr size_t kGroupDim = 256; +static_assert(kGroupDim % kBlockDim == 0, + "Group dim should be divisible by block dim"); +constexpr size_t kGroupDimInBlocks = kGroupDim / kBlockDim; + +// Maximum number of passes in an image. +constexpr size_t kMaxNumPasses = 11; + +// Maximum number of reference frames. +constexpr size_t kMaxNumReferenceFrames = 4; + +// Dimensions of a frame, in pixels, and other derived dimensions. +// Computed from FrameHeader. +// TODO(veluca): add extra channels. 
+struct FrameDimensions { + void Set(size_t xsize, size_t ysize, size_t group_size_shift, + size_t max_hshift, size_t max_vshift, bool modular_mode, + size_t upsampling) { + group_dim = (kGroupDim >> 1) << group_size_shift; + dc_group_dim = group_dim * kBlockDim; + xsize_upsampled = xsize; + ysize_upsampled = ysize; + this->xsize = DivCeil(xsize, upsampling); + this->ysize = DivCeil(ysize, upsampling); + xsize_blocks = DivCeil(this->xsize, kBlockDim << max_hshift) << max_hshift; + ysize_blocks = DivCeil(this->ysize, kBlockDim << max_vshift) << max_vshift; + xsize_padded = xsize_blocks * kBlockDim; + ysize_padded = ysize_blocks * kBlockDim; + if (modular_mode) { + // Modular mode doesn't have any padding. + xsize_padded = this->xsize; + ysize_padded = this->ysize; + } + xsize_upsampled_padded = xsize_padded * upsampling; + ysize_upsampled_padded = ysize_padded * upsampling; + xsize_groups = DivCeil(this->xsize, group_dim); + ysize_groups = DivCeil(this->ysize, group_dim); + xsize_dc_groups = DivCeil(xsize_blocks, group_dim); + ysize_dc_groups = DivCeil(ysize_blocks, group_dim); + num_groups = xsize_groups * ysize_groups; + num_dc_groups = xsize_dc_groups * ysize_dc_groups; + } + + // Image size without any upsampling, i.e. original_size / upsampling. + size_t xsize; + size_t ysize; + // Original image size. + size_t xsize_upsampled; + size_t ysize_upsampled; + // Image size after upsampling the padded image. + size_t xsize_upsampled_padded; + size_t ysize_upsampled_padded; + // Image size after padding to a multiple of kBlockDim (if VarDCT mode). + size_t xsize_padded; + size_t ysize_padded; + // Image size in kBlockDim blocks. + size_t xsize_blocks; + size_t ysize_blocks; + // Image size in number of groups. + size_t xsize_groups; + size_t ysize_groups; + // Image size in number of DC groups. + size_t xsize_dc_groups; + size_t ysize_dc_groups; + // Number of AC or DC groups. + size_t num_groups; + size_t num_dc_groups; + // Size of a group. 
+ size_t group_dim; + size_t dc_group_dim; +}; + +// Prior to C++14 (i.e. C++11): provide our own make_unique +#if __cplusplus < 201402L +template +std::unique_ptr make_unique(Args&&... args) { + return std::unique_ptr(new T(std::forward(args)...)); +} +#else +using std::make_unique; +#endif + +template +JXL_INLINE T Clamp1(T val, T low, T hi) { + return val < low ? low : val > hi ? hi : val; +} + +// Encodes non-negative (X) into (2 * X), negative (-X) into (2 * X - 1) +constexpr uint32_t PackSigned(int32_t value) + JXL_NO_SANITIZE("unsigned-integer-overflow") { + return (static_cast(value) << 1) ^ + ((static_cast(~value) >> 31) - 1); +} + +// Reverse to PackSigned, i.e. UnpackSigned(PackSigned(X)) == X. +constexpr intptr_t UnpackSigned(size_t value) { + return static_cast((value >> 1) ^ (((~value) & 1) - 1)); +} + +// conversion from integer to string. +template +std::string ToString(T n) { + char data[32] = {}; + if (T(0.1) != T(0)) { + // float + snprintf(data, sizeof(data), "%g", static_cast(n)); + } else if (T(-1) > T(0)) { + // unsigned + snprintf(data, sizeof(data), "%llu", static_cast(n)); + } else { + // signed + snprintf(data, sizeof(data), "%lld", static_cast(n)); + } + return data; +} +} // namespace jxl + +#endif // LIB_JXL_COMMON_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc new file mode 100644 index 0000000000..bac580acaa --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.cc @@ -0,0 +1,312 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/compressed_dc.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc" +#include +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +using D = HWY_FULL(float); +using DScalar = HWY_CAPPED(float, 1); + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::Vec; + +// TODO(veluca): optimize constants. 
+const float w1 = 0.20345139757231578f; +const float w2 = 0.0334829185968739f; +const float w0 = 1.0f - 4.0f * (w1 + w2); + +template +V MaxWorkaround(V a, V b) { +#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800 + // Prevents "Do not know how to split the result of this operator" error + return IfThenElse(a > b, a, b); +#else + return Max(a, b); +#endif +} + +template +JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor, + const float* JXL_RESTRICT row_top, + const float* JXL_RESTRICT row, + const float* JXL_RESTRICT row_bottom, + Vec* JXL_RESTRICT mc, + Vec* JXL_RESTRICT sm, + Vec* JXL_RESTRICT gap, size_t x) { + const auto tl = LoadU(d, row_top + x - 1); + const auto tc = Load(d, row_top + x); + const auto tr = LoadU(d, row_top + x + 1); + + const auto ml = LoadU(d, row + x - 1); + *mc = Load(d, row + x); + const auto mr = LoadU(d, row + x + 1); + + const auto bl = LoadU(d, row_bottom + x - 1); + const auto bc = Load(d, row_bottom + x); + const auto br = LoadU(d, row_bottom + x + 1); + + const auto w_center = Set(d, w0); + const auto w_side = Set(d, w1); + const auto w_corner = Set(d, w2); + + const auto corner = tl + tr + bl + br; + const auto side = ml + mr + tc + bc; + *sm = corner * w_corner + side * w_side + *mc * w_center; + + const auto dc_quant = Set(d, dc_factor); + *gap = MaxWorkaround(*gap, Abs((*mc - *sm) / dc_quant)); +} + +template +JXL_INLINE void ComputePixel( + const float* JXL_RESTRICT dc_factors, + const float* JXL_RESTRICT* JXL_RESTRICT rows_top, + const float* JXL_RESTRICT* JXL_RESTRICT rows, + const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom, + float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) { + const D d; + auto mc_x = Undefined(d); + auto mc_y = Undefined(d); + auto mc_b = Undefined(d); + auto sm_x = Undefined(d); + auto sm_y = Undefined(d); + auto sm_b = Undefined(d); + auto gap = Set(d, 0.5f); + ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0], + &mc_x, &sm_x, &gap, x); + 
ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1], + &mc_y, &sm_y, &gap, x); + ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2], + &mc_b, &sm_b, &gap, x); + auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f)); + factor = ZeroIfNegative(factor); + + auto out = MulAdd(sm_x - mc_x, factor, mc_x); + Store(out, d, out_rows[0] + x); + out = MulAdd(sm_y - mc_y, factor, mc_y); + Store(out, d, out_rows[1] + x); + out = MulAdd(sm_b - mc_b, factor, mc_b); + Store(out, d, out_rows[2] + x); +} + +void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc, + ThreadPool* pool) { + const size_t xsize = dc->xsize(); + const size_t ysize = dc->ysize(); + if (ysize <= 2 || xsize <= 2) return; + + // TODO(veluca): use tile-based processing? + // TODO(veluca): decide if changes to the y channel should be propagated to + // the x and b channels through color correlation. + JXL_ASSERT(w1 + w2 < 0.25f); + + PROFILER_FUNC; + + Image3F smoothed(xsize, ysize); + // Fill in borders that the loop below will not. First and last are unused. 
+ for (size_t c = 0; c < 3; c++) { + for (size_t y : {size_t(0), ysize - 1}) { + memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y), + xsize * sizeof(float)); + } + } + auto process_row = [&](int y, int /*thread*/) { + const float* JXL_RESTRICT rows_top[3]{ + dc->ConstPlaneRow(0, y - 1), + dc->ConstPlaneRow(1, y - 1), + dc->ConstPlaneRow(2, y - 1), + }; + const float* JXL_RESTRICT rows[3] = { + dc->ConstPlaneRow(0, y), + dc->ConstPlaneRow(1, y), + dc->ConstPlaneRow(2, y), + }; + const float* JXL_RESTRICT rows_bottom[3] = { + dc->ConstPlaneRow(0, y + 1), + dc->ConstPlaneRow(1, y + 1), + dc->ConstPlaneRow(2, y + 1), + }; + float* JXL_RESTRICT rows_out[3] = { + smoothed.PlaneRow(0, y), + smoothed.PlaneRow(1, y), + smoothed.PlaneRow(2, y), + }; + for (size_t x : {size_t(0), xsize - 1}) { + for (size_t c = 0; c < 3; c++) { + rows_out[c][x] = rows[c][x]; + } + } + + size_t x = 1; + // First pixels + const size_t N = Lanes(D()); + for (; x < std::min(N, xsize - 1); x++) { + ComputePixel(dc_factors, rows_top, rows, rows_bottom, rows_out, + x); + } + // Full vectors. + for (; x + N <= xsize - 1; x += N) { + ComputePixel(dc_factors, rows_top, rows, rows_bottom, rows_out, x); + } + // Last pixels. + for (; x < xsize - 1; x++) { + ComputePixel(dc_factors, rows_top, rows, rows_bottom, rows_out, + x); + } + }; + RunOnPool(pool, 1, ysize - 1, ThreadPool::SkipInit(), process_row, + "DCSmoothingRow"); + dc->Swap(smoothed); +} + +// DC dequantization. 
+void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in, + const float* dc_factors, float mul, const float* cfl_factors, + YCbCrChromaSubsampling chroma_subsampling, + const BlockCtxMap& bctx) { + const HWY_FULL(float) df; + const Rebind di; // assumes pixel_type <= float + if (chroma_subsampling.Is444()) { + const auto fac_x = Set(df, dc_factors[0] * mul); + const auto fac_y = Set(df, dc_factors[1] * mul); + const auto fac_b = Set(df, dc_factors[2] * mul); + const auto cfl_fac_x = Set(df, cfl_factors[0]); + const auto cfl_fac_b = Set(df, cfl_factors[2]); + for (size_t y = 0; y < r.ysize(); y++) { + float* dec_row_x = r.PlaneRow(dc, 0, y); + float* dec_row_y = r.PlaneRow(dc, 1, y); + float* dec_row_b = r.PlaneRow(dc, 2, y); + const int32_t* quant_row_x = in.channel[1].plane.Row(y); + const int32_t* quant_row_y = in.channel[0].plane.Row(y); + const int32_t* quant_row_b = in.channel[2].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x += Lanes(di)) { + const auto in_q_x = Load(di, quant_row_x + x); + const auto in_q_y = Load(di, quant_row_y + x); + const auto in_q_b = Load(di, quant_row_b + x); + const auto in_x = ConvertTo(df, in_q_x) * fac_x; + const auto in_y = ConvertTo(df, in_q_y) * fac_y; + const auto in_b = ConvertTo(df, in_q_b) * fac_b; + Store(in_y, df, dec_row_y + x); + Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x); + Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x); + } + } + } else { + for (size_t c : {1, 0, 2}) { + Rect rect(r.x0() >> chroma_subsampling.HShift(c), + r.y0() >> chroma_subsampling.VShift(c), + r.xsize() >> chroma_subsampling.HShift(c), + r.ysize() >> chroma_subsampling.VShift(c)); + const auto fac = Set(df, dc_factors[c] * mul); + const Channel& ch = in.channel[c < 2 ? 
c ^ 1 : c]; + for (size_t y = 0; y < rect.ysize(); y++) { + const int32_t* quant_row = ch.plane.Row(y); + float* row = rect.PlaneRow(dc, c, y); + for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) { + const auto in_q = Load(di, quant_row + x); + const auto in = ConvertTo(df, in_q) * fac; + Store(in, df, row + x); + } + } + } + } + if (bctx.num_dc_ctxs <= 1) { + for (size_t y = 0; y < r.ysize(); y++) { + uint8_t* qdc_row = r.Row(quant_dc, y); + memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize()); + } + } else { + for (size_t y = 0; y < r.ysize(); y++) { + uint8_t* qdc_row_val = r.Row(quant_dc, y); + const int32_t* quant_row_x = + in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0)); + const int32_t* quant_row_y = + in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1)); + const int32_t* quant_row_b = + in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2)); + for (size_t x = 0; x < r.xsize(); x++) { + int bucket_x = 0, bucket_y = 0, bucket_b = 0; + for (int t : bctx.dc_thresholds[0]) { + if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++; + } + for (int t : bctx.dc_thresholds[1]) { + if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++; + } + for (int t : bctx.dc_thresholds[2]) { + if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++; + } + int bucket = bucket_x; + bucket *= bctx.dc_thresholds[2].size() + 1; + bucket += bucket_b; + bucket *= bctx.dc_thresholds[1].size() + 1; + bucket += bucket_y; + qdc_row_val[x] = bucket; + } + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(DequantDC); +HWY_EXPORT(AdaptiveDCSmoothing); +void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc, + ThreadPool* pool) { + return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(dc_factors, dc, pool); +} + +void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in, + const 
float* dc_factors, float mul, const float* cfl_factors, + YCbCrChromaSubsampling chroma_subsampling, + const BlockCtxMap& bctx) { + return HWY_DYNAMIC_DISPATCH(DequantDC)(r, dc, quant_dc, in, dc_factors, mul, + cfl_factors, chroma_subsampling, bctx); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.h new file mode 100644 index 0000000000..b06e5931f0 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_dc.h @@ -0,0 +1,34 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_COMPRESSED_DC_H_ +#define LIB_JXL_COMPRESSED_DC_H_ + +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/modular_image.h" + +// DC handling functions: encoding and decoding of DC to and from bitstream, and +// related function to initialize the per-group decoder cache. + +namespace jxl { + +// Smooth DC in already-smooth areas, to counteract banding. +void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc, + ThreadPool* pool); + +void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in, + const float* dc_factors, float mul, const float* cfl_factors, + YCbCrChromaSubsampling chroma_subsampling, + const BlockCtxMap& bctx); + +} // namespace jxl + +#endif // LIB_JXL_COMPRESSED_DC_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_image_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_image_test.cc new file mode 100644 index 0000000000..7546127616 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/compressed_image_test.cc @@ -0,0 +1,102 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/testdata.h" + +namespace jxl { +namespace { + +// Verifies ReconOpsinImage reconstructs with low butteraugli distance. +void RunRGBRoundTrip(float distance, bool fast) { + ThreadPoolInternal pool(4); + + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + JXL_CHECK(SetFromBytes(Span(orig), &io, &pool)); + // This test can only handle a single group. 
+ io.ShrinkTo(std::min(io.xsize(), kGroupDim), std::min(io.ysize(), kGroupDim)); + + Image3F opsin(io.xsize(), io.ysize()); + (void)ToXYB(io.Main(), &pool, &opsin); + opsin = PadImageToMultiple(opsin, kBlockDim); + GaborishInverse(&opsin, 1.0f, &pool); + + CompressParams cparams; + cparams.butteraugli_distance = distance; + if (fast) { + cparams.speed_tier = SpeedTier::kWombat; + } + + JXL_CHECK(io.metadata.size.Set(opsin.xsize(), opsin.ysize())); + FrameHeader frame_header(&io.metadata); + frame_header.color_transform = ColorTransform::kXYB; + frame_header.loop_filter.epf_iters = 0; + + // Use custom weights for Gaborish. + frame_header.loop_filter.gab_custom = true; + frame_header.loop_filter.gab_x_weight1 = 0.11501538179658321f; + frame_header.loop_filter.gab_x_weight2 = 0.089979079587015454f; + frame_header.loop_filter.gab_y_weight1 = 0.11501538179658321f; + frame_header.loop_filter.gab_y_weight2 = 0.089979079587015454f; + frame_header.loop_filter.gab_b_weight1 = 0.11501538179658321f; + frame_header.loop_filter.gab_b_weight2 = 0.089979079587015454f; + + PassesEncoderState enc_state; + JXL_CHECK(InitializePassesSharedState(frame_header, &enc_state.shared)); + + enc_state.shared.quantizer.SetQuant(4.0f, 4.0f, + &enc_state.shared.raw_quant_field); + enc_state.shared.ac_strategy.FillDCT8(); + enc_state.cparams = cparams; + ZeroFillImage(&enc_state.shared.epf_sharpness); + CodecInOut io1; + io1.Main() = RoundtripImage(opsin, &enc_state, &pool); + io1.metadata.m.color_encoding = io1.Main().c_current(); + + EXPECT_LE(ButteraugliDistance(io, io1, cparams.ba_params, + /*distmap=*/nullptr, &pool), + 1.2); +} + +TEST(CompressedImageTest, RGBRoundTrip_1) { RunRGBRoundTrip(1.0, false); } + +TEST(CompressedImageTest, RGBRoundTrip_1_fast) { RunRGBRoundTrip(1.0, true); } + +TEST(CompressedImageTest, RGBRoundTrip_2) { RunRGBRoundTrip(2.0, false); } + +TEST(CompressedImageTest, RGBRoundTrip_2_fast) { RunRGBRoundTrip(2.0, true); } + +} // namespace +} // namespace jxl diff --git 
a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve-inl.h new file mode 100644 index 0000000000..255bb9d051 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve-inl.h @@ -0,0 +1,119 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_CONVOLVE_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_CONVOLVE_INL_H_ +#undef LIB_JXL_CONVOLVE_INL_H_ +#else +#define LIB_JXL_CONVOLVE_INL_H_ +#endif + +#include + +#include "lib/jxl/base/status.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Broadcast; +#if HWY_TARGET != HWY_SCALAR +using hwy::HWY_NAMESPACE::CombineShiftRightBytes; +#endif +using hwy::HWY_NAMESPACE::Vec; + +// Synthesizes left/right neighbors from a vector of center pixels. +class Neighbors { + public: + using D = HWY_CAPPED(float, 16); + using V = Vec; + + // Returns l[i] == c[Mirror(i - 1)]. + HWY_INLINE HWY_MAYBE_UNUSED static V FirstL1(const V c) { +#if HWY_CAP_GE256 + const D d; + HWY_ALIGN constexpr int32_t lanes[16] = {0, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14}; + const auto indices = SetTableIndices(d, lanes); + // c = PONM'LKJI + return TableLookupLanes(c, indices); // ONML'KJII +#elif HWY_TARGET == HWY_SCALAR + return c; // Same (the first mirrored value is the last valid one) +#else // 128 bit + // c = LKJI +#if HWY_ARCH_X86 + return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(2, 1, 0, 0))}; // KJII +#else + const D d; + // TODO(deymo): Figure out if this can be optimized using a single vsri + // instruction to convert LKJI to KJII. 
+ HWY_ALIGN constexpr int lanes[4] = {0, 0, 1, 2}; // KJII + const auto indices = SetTableIndices(d, lanes); + return TableLookupLanes(c, indices); +#endif +#endif + } + + // Returns l[i] == c[Mirror(i - 2)]. + HWY_INLINE HWY_MAYBE_UNUSED static V FirstL2(const V c) { +#if HWY_CAP_GE256 + const D d; + HWY_ALIGN constexpr int32_t lanes[16] = {1, 0, 0, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13}; + const auto indices = SetTableIndices(d, lanes); + // c = PONM'LKJI + return TableLookupLanes(c, indices); // NMLK'JIIJ +#elif HWY_TARGET == HWY_SCALAR + const D d; + JXL_ASSERT(false); // unsupported, avoid calling this. + return Zero(d); +#else // 128 bit + // c = LKJI +#if HWY_ARCH_X86 + return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(1, 0, 0, 1))}; // JIIJ +#else + const D d; + HWY_ALIGN constexpr int lanes[4] = {1, 0, 0, 1}; // JIIJ + const auto indices = SetTableIndices(d, lanes); + return TableLookupLanes(c, indices); +#endif +#endif + } + + // Returns l[i] == c[Mirror(i - 3)]. + HWY_INLINE HWY_MAYBE_UNUSED static V FirstL3(const V c) { +#if HWY_CAP_GE256 + const D d; + HWY_ALIGN constexpr int32_t lanes[16] = {2, 1, 0, 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12}; + const auto indices = SetTableIndices(d, lanes); + // c = PONM'LKJI + return TableLookupLanes(c, indices); // MLKJ'IIJK +#elif HWY_TARGET == HWY_SCALAR + const D d; + JXL_ASSERT(false); // unsupported, avoid calling this. 
+ return Zero(d); +#else // 128 bit + // c = LKJI +#if HWY_ARCH_X86 + return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(0, 0, 1, 2))}; // IIJK +#else + const D d; + HWY_ALIGN constexpr int lanes[4] = {2, 1, 0, 0}; // IIJK + const auto indices = SetTableIndices(d, lanes); + return TableLookupLanes(c, indices); +#endif +#endif + } +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_CONVOLVE_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc new file mode 100644 index 0000000000..cc7fc3f90e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.cc @@ -0,0 +1,1332 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/convolve.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/convolve.cc" +#include +#include + +#include "lib/jxl/common.h" // RoundUpTo +#include "lib/jxl/convolve-inl.h" +#include "lib/jxl/image_ops.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Vec; + +// Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2]. 
+template +static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y, + const int64_t ix, const int64_t iy, + const size_t xsize, const size_t ysize, + const float wx0, const float wx1, + const float wx2) { + const WrapMirror wrap_x; + const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize)); + const float in_m2 = row[wrap_x(ix - 2, xsize)]; + const float in_p2 = row[wrap_x(ix + 2, xsize)]; + const float in_m1 = row[wrap_x(ix - 1, xsize)]; + const float in_p1 = row[wrap_x(ix + 1, xsize)]; + const float in_00 = row[ix]; + const float sum_2 = wx2 * (in_m2 + in_p2); + const float sum_1 = wx1 * (in_m1 + in_p1); + const float sum_0 = wx0 * in_00; + return sum_2 + sum_1 + sum_0; +} + +template +static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix, + const int64_t iy, const size_t ysize, const V wx0, + const V wx1, const V wx2) { + const HWY_FULL(float) d; + const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; + const auto in_m2 = LoadU(d, center - 2); + const auto in_p2 = LoadU(d, center + 2); + const auto in_m1 = LoadU(d, center - 1); + const auto in_p1 = LoadU(d, center + 1); + const auto in_00 = Load(d, center); + const auto sum_2 = wx2 * (in_m2 + in_p2); + const auto sum_1 = wx1 * (in_m1 + in_p1); + const auto sum_0 = wx0 * in_00; + return sum_2 + sum_1 + sum_0; +} + +// Produces result for one pixel +template +float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix, + const int64_t iy, const WeightsSymmetric5& weights) { + const float w0 = weights.c[0]; + const float w1 = weights.r[0]; + const float w2 = weights.R[0]; + const float w4 = weights.d[0]; + const float w5 = weights.L[0]; + const float w8 = weights.D[0]; + + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + const WrapY wrap_y; + // Unrolled loop over all 5 rows of the kernel. 
+ float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2); + + sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8); + float sum1 = + WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8); + + sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5); + sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5); + + return sum0 + sum1; +} + +// Produces result for one vector's worth of pixels +template +static void Symmetric5Interior(const ImageF& in, const Rect& rect, + const int64_t ix, const int64_t iy, + const WeightsSymmetric5& weights, + float* JXL_RESTRICT row_out) { + const HWY_FULL(float) d; + + const auto w0 = LoadDup128(d, weights.c); + const auto w1 = LoadDup128(d, weights.r); + const auto w2 = LoadDup128(d, weights.R); + const auto w4 = LoadDup128(d, weights.d); + const auto w5 = LoadDup128(d, weights.L); + const auto w8 = LoadDup128(d, weights.D); + + const size_t ysize = rect.ysize(); + const WrapY wrap_y; + // Unrolled loop over all 5 rows of the kernel. 
+ auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2); + + sum0 += WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8); + auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8); + + sum0 += WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5); + sum1 += WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5); + + Store(sum0 + sum1, d, row_out + ix); +} + +template +static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy, + const WeightsSymmetric5& weights, + float* JXL_RESTRICT row_out) { + const int64_t kRadius = 2; + const size_t xsize = rect.xsize(); + + size_t ix = 0; + const HWY_FULL(float) d; + const size_t N = Lanes(d); + const size_t aligned_x = RoundUpTo(kRadius, N); + for (; ix < std::min(aligned_x, xsize); ++ix) { + row_out[ix] = Symmetric5Border(in, rect, ix, iy, weights); + } + for (; ix + N + kRadius <= xsize; ix += N) { + Symmetric5Interior(in, rect, ix, iy, weights, row_out); + } + for (; ix < xsize; ++ix) { + row_out[ix] = Symmetric5Border(in, rect, ix, iy, weights); + } +} + +static JXL_NOINLINE void Symmetric5BorderRow(const ImageF& in, const Rect& rect, + const int64_t iy, + const WeightsSymmetric5& weights, + float* JXL_RESTRICT row_out) { + return Symmetric5Row(in, rect, iy, weights, row_out); +} + +#if HWY_TARGET != HWY_SCALAR + +// Returns indices for SetTableIndices such that TableLookupLanes on the +// rightmost unaligned vector (rightmost sample in its most-significant lane) +// returns the mirrored values, with the mirror outside the last valid sample. 
+static inline const int32_t* MirrorLanes(const size_t mod) { + const HWY_CAPPED(float, 16) d; + constexpr size_t kN = MaxLanes(d); + + // For mod = `image width mod 16` 0..15: + // last full vec mirrored (mem order) loadedVec mirrorVec idxVec + // 0123456789abcdef| fedcba9876543210 fed..210 012..def 012..def + // 0123456789abcdef|0 0fedcba98765432 0fe..321 234..f00 123..eff + // 0123456789abcdef|01 10fedcba987654 10f..432 456..110 234..ffe + // 0123456789abcdef|012 210fedcba9876 210..543 67..2210 34..ffed + // 0123456789abcdef|0123 3210fedcba98 321..654 8..33210 4..ffedc + // 0123456789abcdef|01234 43210fedcba + // 0123456789abcdef|012345 543210fedc + // 0123456789abcdef|0123456 6543210fe + // 0123456789abcdef|01234567 76543210 + // 0123456789abcdef|012345678 8765432 + // 0123456789abcdef|0123456789 987654 + // 0123456789abcdef|0123456789A A9876 + // 0123456789abcdef|0123456789AB BA98 + // 0123456789abcdef|0123456789ABC CBA + // 0123456789abcdef|0123456789ABCD DC + // 0123456789abcdef|0123456789ABCDE E EDC..10f EED..210 ffe..321 +#if HWY_CAP_GE512 + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, // + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; +#elif HWY_CAP_GE256 + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = { + 1, 2, 3, 4, 5, 6, 7, 7, // + 6, 5, 4, 3, 2, 1, 0}; +#else // 128-bit + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {1, 2, 3, 3, // + 2, 1, 0}; +#endif + return idx_lanes + kN - 1 - mod; +} + +#endif // HWY_TARGET != HWY_SCALAR + +namespace strategy { + +struct StrategyBase { + using D = HWY_CAPPED(float, 16); + using V = Vec; +}; + +// 3x3 convolution by symmetric kernel with a single scan through the input. +class Symmetric3 : public StrategyBase { + public: + static constexpr int64_t kRadius = 1; + + // Only accesses pixels in [0, xsize). 
+ template + static JXL_INLINE void ConvolveRow(const float* const JXL_RESTRICT row_m, + const size_t xsize, const int64_t stride, + const WrapRow& wrap_row, + const WeightsSymmetric3& weights, + float* const JXL_RESTRICT row_out) { + const D d; + // t, m, b = top, middle, bottom row; + const float* const JXL_RESTRICT row_t = wrap_row(row_m - stride, stride); + const float* const JXL_RESTRICT row_b = wrap_row(row_m + stride, stride); + + // Must load in advance - compiler doesn't understand LoadDup128 and + // schedules them too late. + const V w0 = LoadDup128(d, weights.c); + const V w1 = LoadDup128(d, weights.r); + const V w2 = LoadDup128(d, weights.d); + + // l, c, r = left, center, right. Leftmost vector: need FirstL1. + { + const V tc = LoadU(d, row_t + 0); + const V mc = LoadU(d, row_m + 0); + const V bc = LoadU(d, row_b + 0); + const V tl = Neighbors::FirstL1(tc); + const V tr = LoadU(d, row_t + 0 + 1); + const V ml = Neighbors::FirstL1(mc); + const V mr = LoadU(d, row_m + 0 + 1); + const V bl = Neighbors::FirstL1(bc); + const V br = LoadU(d, row_b + 0 + 1); + const V conv = + WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); + Store(conv, d, row_out + 0); + } + + // Loop as long as we can load enough new values: + const size_t N = Lanes(d); + size_t x = N; + for (; x + N + kRadius <= xsize; x += N) { + const auto conv = ConvolveValid(row_t, row_m, row_b, x, w0, w1, w2); + Store(conv, d, row_out + x); + } + + // For final (partial) vector: + const V tc = LoadU(d, row_t + x); + const V mc = LoadU(d, row_m + x); + const V bc = LoadU(d, row_b + x); + + V tr, mr, br; +#if HWY_TARGET == HWY_SCALAR + tr = tc; // Single-lane => mirrored right neighbor = center value. + mr = mc; + br = bc; +#else + if (kSizeModN == 0) { + // The above loop didn't handle the last vector because it needs an + // additional right neighbor (generated via mirroring). 
+ auto mirror = SetTableIndices(d, MirrorLanes(N - 1)); + tr = TableLookupLanes(tc, mirror); + mr = TableLookupLanes(mc, mirror); + br = TableLookupLanes(bc, mirror); + } else { + auto mirror = SetTableIndices(d, MirrorLanes((xsize % N) - 1)); + // Loads last valid value into uppermost lane and mirrors. + tr = TableLookupLanes(LoadU(d, row_t + xsize - N), mirror); + mr = TableLookupLanes(LoadU(d, row_m + xsize - N), mirror); + br = TableLookupLanes(LoadU(d, row_b + xsize - N), mirror); + } +#endif + + const V tl = LoadU(d, row_t + x - 1); + const V ml = LoadU(d, row_m + x - 1); + const V bl = LoadU(d, row_b + x - 1); + const V conv = WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); + Store(conv, d, row_out + x); + } + + private: + // Returns sum{x_i * w_i}. + template + static JXL_INLINE V WeightedSum(const V tl, const V tc, const V tr, + const V ml, const V mc, const V mr, + const V bl, const V bc, const V br, + const V w0, const V w1, const V w2) { + const V sum_tb = tc + bc; + + // Faster than 5 mul + 4 FMA. 
+ const V mul0 = mc * w0; + const V sum_lr = ml + mr; + + const V x1 = sum_tb + sum_lr; + const V mul1 = MulAdd(x1, w1, mul0); + + const V sum_t2 = tl + tr; + const V sum_b2 = bl + br; + const V x2 = sum_t2 + sum_b2; + const V mul2 = MulAdd(x2, w2, mul1); + return mul2; + } + + static JXL_INLINE V ConvolveValid(const float* JXL_RESTRICT row_t, + const float* JXL_RESTRICT row_m, + const float* JXL_RESTRICT row_b, + const int64_t x, const V w0, const V w1, + const V w2) { + const D d; + const V tc = LoadU(d, row_t + x); + const V mc = LoadU(d, row_m + x); + const V bc = LoadU(d, row_b + x); + const V tl = LoadU(d, row_t + x - 1); + const V tr = LoadU(d, row_t + x + 1); + const V ml = LoadU(d, row_m + x - 1); + const V mr = LoadU(d, row_m + x + 1); + const V bl = LoadU(d, row_b + x - 1); + const V br = LoadU(d, row_b + x + 1); + return WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); + } +}; + +// 5x5 convolution by separable kernel with a single scan through the input. +// This is more cache-efficient than separate horizontal/vertical passes, and +// possibly faster (given enough registers) than tiling and/or transposing. +// +// Overview: imagine a 5x5 window around a central pixel. First convolve the +// rows by multiplying the pixels with the corresponding weights from +// WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these +// intermediate results by the corresponding vertical weight, i.e. +// vert[abs(y_offset) * 4]. Finally, store the sum of these values as the +// convolution result at the position of the central pixel in the output. +// +// Each of these operations uses SIMD vectors. The central pixel and most +// importantly the output are aligned, so neighnoring pixels (e.g. x_offset=1) +// require unaligned loads. Because weights are supplied in identical groups of +// 4, we can use LoadDup128 to load them (slightly faster). +// +// Uses mirrored boundary handling. 
Until x >= kRadius, the horizontal +// convolution uses Neighbors class to shuffle vectors as if each of its lanes +// had been loaded from the mirrored offset. Similarly, the last full vector to +// write uses mirroring. In the case of scalar vectors, Neighbors is not usable +// and the value is loaded directly. Otherwise, the number of valid pixels +// modulo the vector size enables a small optimization: for smaller offsets, +// a non-mirrored load is sufficient. +class Separable5 : public StrategyBase { + public: + static constexpr int64_t kRadius = 2; + + template + static JXL_INLINE void ConvolveRow(const float* const JXL_RESTRICT row_m, + const size_t xsize, const int64_t stride, + const WrapRow& wrap_row, + const WeightsSeparable5& weights, + float* const JXL_RESTRICT row_out) { + const D d; + const int64_t neg_stride = -stride; // allows LEA addressing. + const float* const JXL_RESTRICT row_t2 = + wrap_row(row_m + 2 * neg_stride, stride); + const float* const JXL_RESTRICT row_t1 = + wrap_row(row_m + 1 * neg_stride, stride); + const float* const JXL_RESTRICT row_b1 = + wrap_row(row_m + 1 * stride, stride); + const float* const JXL_RESTRICT row_b2 = + wrap_row(row_m + 2 * stride, stride); + + const V wh0 = LoadDup128(d, weights.horz + 0 * 4); + const V wh1 = LoadDup128(d, weights.horz + 1 * 4); + const V wh2 = LoadDup128(d, weights.horz + 2 * 4); + const V wv0 = LoadDup128(d, weights.vert + 0 * 4); + const V wv1 = LoadDup128(d, weights.vert + 1 * 4); + const V wv2 = LoadDup128(d, weights.vert + 2 * 4); + + size_t x = 0; + + // More than one iteration for scalars. 
+ for (; x < kRadius; x += Lanes(d)) { + const V conv0 = HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2) * wv0; + + const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2); + const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2); + const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0); + + const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2); + const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2); + const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1); + Store(conv2, d, row_out + x); + } + + // Main loop: load inputs without padding + for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) { + const V conv0 = HorzConvolve(row_m + x, wh0, wh1, wh2) * wv0; + + const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2); + const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2); + const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0); + + const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2); + const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2); + const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1); + Store(conv2, d, row_out + x); + } + + // Last full vector to write (the above loop handled mod >= kRadius) +#if HWY_TARGET == HWY_SCALAR + while (x < xsize) { +#else + if (kSizeModN < kRadius) { +#endif + const V conv0 = + HorzConvolveLast(row_m, x, xsize, wh0, wh1, wh2) * wv0; + + const V conv1t = + HorzConvolveLast(row_t1, x, xsize, wh0, wh1, wh2); + const V conv1b = + HorzConvolveLast(row_b1, x, xsize, wh0, wh1, wh2); + const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0); + + const V conv2t = + HorzConvolveLast(row_t2, x, xsize, wh0, wh1, wh2); + const V conv2b = + HorzConvolveLast(row_b2, x, xsize, wh0, wh1, wh2); + const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1); + Store(conv2, d, row_out + x); + x += Lanes(d); + } + + // If mod = 0, the above vector was the last. 
+ if (kSizeModN != 0) { + for (; x < xsize; ++x) { + float mul = 0.0f; + for (int64_t dy = -kRadius; dy <= kRadius; ++dy) { + const float wy = weights.vert[std::abs(dy) * 4]; + const float* clamped_row = wrap_row(row_m + dy * stride, stride); + for (int64_t dx = -kRadius; dx <= kRadius; ++dx) { + const float wx = weights.horz[std::abs(dx) * 4]; + const int64_t clamped_x = Mirror(x + dx, xsize); + mul += clamped_row[clamped_x] * wx * wy; + } + } + row_out[x] = mul; + } + } + } + + private: + // Same as HorzConvolve for the first/last vector in a row. + static JXL_INLINE V HorzConvolveFirst(const float* const JXL_RESTRICT row, + const int64_t x, const int64_t xsize, + const V wh0, const V wh1, const V wh2) { + const D d; + const V c = LoadU(d, row + x); + const V mul0 = c * wh0; + +#if HWY_TARGET == HWY_SCALAR + const V l1 = LoadU(d, row + Mirror(x - 1, xsize)); + const V l2 = LoadU(d, row + Mirror(x - 2, xsize)); +#else + (void)xsize; + const V l1 = Neighbors::FirstL1(c); + const V l2 = Neighbors::FirstL2(c); +#endif + + const V r1 = LoadU(d, row + x + 1); + const V r2 = LoadU(d, row + x + 2); + + const V mul1 = MulAdd(l1 + r1, wh1, mul0); + const V mul2 = MulAdd(l2 + r2, wh2, mul1); + return mul2; + } + + template + static JXL_INLINE V HorzConvolveLast(const float* const JXL_RESTRICT row, + const int64_t x, const int64_t xsize, + const V wh0, const V wh1, const V wh2) { + const D d; + const V c = LoadU(d, row + x); + const V mul0 = c * wh0; + + const V l1 = LoadU(d, row + x - 1); + const V l2 = LoadU(d, row + x - 2); + + V r1, r2; +#if HWY_TARGET == HWY_SCALAR + r1 = LoadU(d, row + Mirror(x + 1, xsize)); + r2 = LoadU(d, row + Mirror(x + 2, xsize)); +#else + const size_t N = Lanes(d); + if (kSizeModN == 0) { + r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2))); + r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1))); + } else { // == 1 + const auto last = LoadU(d, row + xsize - N); + r2 = TableLookupLanes(last, SetTableIndices(d, 
MirrorLanes(N - 1))); + r1 = last; + } +#endif + + // Sum of pixels with Manhattan distance i, multiplied by weights[i]. + const V sum1 = l1 + r1; + const V mul1 = MulAdd(sum1, wh1, mul0); + const V sum2 = l2 + r2; + const V mul2 = MulAdd(sum2, wh2, mul1); + return mul2; + } + + // Requires kRadius valid pixels before/after pos. + static JXL_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos, + const V wh0, const V wh1, const V wh2) { + const D d; + const V c = LoadU(d, pos); + const V mul0 = c * wh0; + + // Loading anew is faster than combining vectors. + const V l1 = LoadU(d, pos - 1); + const V r1 = LoadU(d, pos + 1); + const V l2 = LoadU(d, pos - 2); + const V r2 = LoadU(d, pos + 2); + // Sum of pixels with Manhattan distance i, multiplied by weights[i]. + const V sum1 = l1 + r1; + const V mul1 = MulAdd(sum1, wh1, mul0); + const V sum2 = l2 + r2; + const V mul2 = MulAdd(sum2, wh2, mul1); + return mul2; + } +}; // namespace strategy + +// 7x7 convolution by separable kernel with a single scan through the input. +// Extended version of Separable5, see documentation there. +class Separable7 : public StrategyBase { + public: + static constexpr int64_t kRadius = 3; + + template + static JXL_INLINE void ConvolveRow(const float* const JXL_RESTRICT row_m, + const size_t xsize, const int64_t stride, + const WrapRow& wrap_row, + const WeightsSeparable7& weights, + float* const JXL_RESTRICT row_out) { + const D d; + const int64_t neg_stride = -stride; // allows LEA addressing. 
+ const float* const JXL_RESTRICT row_t3 = + wrap_row(row_m + 3 * neg_stride, stride); + const float* const JXL_RESTRICT row_t2 = + wrap_row(row_m + 2 * neg_stride, stride); + const float* const JXL_RESTRICT row_t1 = + wrap_row(row_m + 1 * neg_stride, stride); + const float* const JXL_RESTRICT row_b1 = + wrap_row(row_m + 1 * stride, stride); + const float* const JXL_RESTRICT row_b2 = + wrap_row(row_m + 2 * stride, stride); + const float* const JXL_RESTRICT row_b3 = + wrap_row(row_m + 3 * stride, stride); + + const V wh0 = LoadDup128(d, weights.horz + 0 * 4); + const V wh1 = LoadDup128(d, weights.horz + 1 * 4); + const V wh2 = LoadDup128(d, weights.horz + 2 * 4); + const V wh3 = LoadDup128(d, weights.horz + 3 * 4); + const V wv0 = LoadDup128(d, weights.vert + 0 * 4); + const V wv1 = LoadDup128(d, weights.vert + 1 * 4); + const V wv2 = LoadDup128(d, weights.vert + 2 * 4); + const V wv3 = LoadDup128(d, weights.vert + 3 * 4); + + size_t x = 0; + + // More than one iteration for scalars. + for (; x < kRadius; x += Lanes(d)) { + const V conv0 = + HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2, wh3) * wv0; + + const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2, wh3); + const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2, wh3); + const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0); + + const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2, wh3); + const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2, wh3); + const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1); + + const V conv3t = HorzConvolveFirst(row_t3, x, xsize, wh0, wh1, wh2, wh3); + const V conv3b = HorzConvolveFirst(row_b3, x, xsize, wh0, wh1, wh2, wh3); + const V conv3 = MulAdd(conv3t + conv3b, wv3, conv2); + + Store(conv3, d, row_out + x); + } + + // Main loop: load inputs without padding + for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) { + const V conv0 = HorzConvolve(row_m + x, wh0, wh1, wh2, wh3) * wv0; + + const V conv1t = HorzConvolve(row_t1 
+ x, wh0, wh1, wh2, wh3); + const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2, wh3); + const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0); + + const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2, wh3); + const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2, wh3); + const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1); + + const V conv3t = HorzConvolve(row_t3 + x, wh0, wh1, wh2, wh3); + const V conv3b = HorzConvolve(row_b3 + x, wh0, wh1, wh2, wh3); + const V conv3 = MulAdd(conv3t + conv3b, wv3, conv2); + + Store(conv3, d, row_out + x); + } + + // Last full vector to write (the above loop handled mod >= kRadius) +#if HWY_TARGET == HWY_SCALAR + while (x < xsize) { +#else + if (kSizeModN < kRadius) { +#endif + const V conv0 = + HorzConvolveLast(row_m, x, xsize, wh0, wh1, wh2, wh3) * + wv0; + + const V conv1t = + HorzConvolveLast(row_t1, x, xsize, wh0, wh1, wh2, wh3); + const V conv1b = + HorzConvolveLast(row_b1, x, xsize, wh0, wh1, wh2, wh3); + const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0); + + const V conv2t = + HorzConvolveLast(row_t2, x, xsize, wh0, wh1, wh2, wh3); + const V conv2b = + HorzConvolveLast(row_b2, x, xsize, wh0, wh1, wh2, wh3); + const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1); + + const V conv3t = + HorzConvolveLast(row_t3, x, xsize, wh0, wh1, wh2, wh3); + const V conv3b = + HorzConvolveLast(row_b3, x, xsize, wh0, wh1, wh2, wh3); + const V conv3 = MulAdd(conv3t + conv3b, wv3, conv2); + + Store(conv3, d, row_out + x); + x += Lanes(d); + } + + // If mod = 0, the above vector was the last. 
+ if (kSizeModN != 0) { + for (; x < xsize; ++x) { + float mul = 0.0f; + for (int64_t dy = -kRadius; dy <= kRadius; ++dy) { + const float wy = weights.vert[std::abs(dy) * 4]; + const float* clamped_row = wrap_row(row_m + dy * stride, stride); + for (int64_t dx = -kRadius; dx <= kRadius; ++dx) { + const float wx = weights.horz[std::abs(dx) * 4]; + const int64_t clamped_x = Mirror(x + dx, xsize); + mul += clamped_row[clamped_x] * wx * wy; + } + } + row_out[x] = mul; + } + } + } + + private: + // Same as HorzConvolve for the first/last vector in a row. + static JXL_INLINE V HorzConvolveFirst(const float* const JXL_RESTRICT row, + const int64_t x, const int64_t xsize, + const V wh0, const V wh1, const V wh2, + const V wh3) { + const D d; + const V c = LoadU(d, row + x); + const V mul0 = c * wh0; + +#if HWY_TARGET == HWY_SCALAR + const V l1 = LoadU(d, row + Mirror(x - 1, xsize)); + const V l2 = LoadU(d, row + Mirror(x - 2, xsize)); + const V l3 = LoadU(d, row + Mirror(x - 3, xsize)); +#else + (void)xsize; + const V l1 = Neighbors::FirstL1(c); + const V l2 = Neighbors::FirstL2(c); + const V l3 = Neighbors::FirstL3(c); +#endif + + const V r1 = LoadU(d, row + x + 1); + const V r2 = LoadU(d, row + x + 2); + const V r3 = LoadU(d, row + x + 3); + + const V mul1 = MulAdd(l1 + r1, wh1, mul0); + const V mul2 = MulAdd(l2 + r2, wh2, mul1); + const V mul3 = MulAdd(l3 + r3, wh3, mul2); + return mul3; + } + + template + static JXL_INLINE V HorzConvolveLast(const float* const JXL_RESTRICT row, + const int64_t x, const int64_t xsize, + const V wh0, const V wh1, const V wh2, + const V wh3) { + const D d; + const V c = LoadU(d, row + x); + const V mul0 = c * wh0; + + const V l1 = LoadU(d, row + x - 1); + const V l2 = LoadU(d, row + x - 2); + const V l3 = LoadU(d, row + x - 3); + + V r1, r2, r3; +#if HWY_TARGET == HWY_SCALAR + r1 = LoadU(d, row + Mirror(x + 1, xsize)); + r2 = LoadU(d, row + Mirror(x + 2, xsize)); + r3 = LoadU(d, row + Mirror(x + 3, xsize)); +#else + const size_t N = 
Lanes(d); + if (kSizeModN == 0) { + r3 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 3))); + r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2))); + r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1))); + } else if (kSizeModN == 1) { + const auto last = LoadU(d, row + xsize - N); + r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 2))); + r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1))); + r1 = last; + } else /* kSizeModN >= 2 */ { + const auto last = LoadU(d, row + xsize - N); + r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1))); + r2 = last; + r1 = LoadU(d, row + x + 1); + } +#endif + + // Sum of pixels with Manhattan distance i, multiplied by weights[i]. + const V sum1 = l1 + r1; + const V mul1 = MulAdd(sum1, wh1, mul0); + const V sum2 = l2 + r2; + const V mul2 = MulAdd(sum2, wh2, mul1); + const V sum3 = l3 + r3; + const V mul3 = MulAdd(sum3, wh3, mul2); + return mul3; + } + + // Returns one vector of horizontal convolution results; lane i is the result + // for pixel pos + i. This is the fast path for interior pixels, i.e. kRadius + // valid pixels before/after pos. + static JXL_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos, + const V wh0, const V wh1, const V wh2, + const V wh3) { + const D d; + const V c = LoadU(d, pos); + const V mul0 = c * wh0; + + // TODO(janwas): better to Combine + const V l1 = LoadU(d, pos - 1); + const V r1 = LoadU(d, pos + 1); + const V l2 = LoadU(d, pos - 2); + const V r2 = LoadU(d, pos + 2); + const V l3 = LoadU(d, pos - 3); + const V r3 = LoadU(d, pos + 3); + // Sum of pixels with Manhattan distance i, multiplied by weights[i]. 
+ const V sum1 = l1 + r1; + const V mul1 = MulAdd(sum1, wh1, mul0); + const V sum2 = l2 + r2; + const V mul2 = MulAdd(sum2, wh2, mul1); + const V sum3 = l3 + r3; + const V mul3 = MulAdd(sum3, wh3, mul2); + return mul3; + } +}; // namespace HWY_NAMESPACE + +} // namespace strategy + +// Single entry point for convolution. +// "Strategy" (Direct*/Separable*) decides kernel size and how to evaluate it. +template +class ConvolveT { + static constexpr int64_t kRadius = Strategy::kRadius; + using Simd = HWY_CAPPED(float, 16); + + public: + static size_t MinWidth() { +#if HWY_TARGET == HWY_SCALAR + // First/Last use mirrored loads of up to +/- kRadius. + return 2 * kRadius; +#else + return Lanes(Simd()) + kRadius; +#endif + } + + // "Image" is ImageF or Image3F. + template + static void Run(const Image& in, const Rect& rect, const Weights& weights, + ThreadPool* pool, Image* out) { + PROFILER_ZONE("ConvolveT::Run"); + JXL_CHECK(SameSize(rect, *out)); + JXL_CHECK(rect.xsize() >= MinWidth()); + + static_assert(int64_t(kRadius) <= 3, + "Must handle [0, kRadius) and >= kRadius"); + switch (rect.xsize() % Lanes(Simd())) { + case 0: + return RunRows<0>(in, rect, weights, pool, out); + case 1: + return RunRows<1>(in, rect, weights, pool, out); + case 2: + return RunRows<2>(in, rect, weights, pool, out); + default: + return RunRows<3>(in, rect, weights, pool, out); + } + } + + private: + template + static JXL_INLINE void RunRow(const float* JXL_RESTRICT in, + const size_t xsize, const int64_t stride, + const WrapRow& wrap_row, const Weights& weights, + float* JXL_RESTRICT out) { + Strategy::template ConvolveRow(in, xsize, stride, wrap_row, + weights, out); + } + + template + static JXL_INLINE void RunBorderRows(const ImageF& in, const Rect& rect, + const int64_t ybegin, const int64_t yend, + const Weights& weights, ImageF* out) { + const int64_t stride = in.PixelsPerRow(); + const WrapRowMirror wrap_row(in, rect.ysize()); + for (int64_t y = ybegin; y < yend; ++y) { + 
RunRow(rect.ConstRow(in, y), rect.xsize(), stride, wrap_row, + weights, out->Row(y)); + } + } + + // Image3F. + template + static JXL_INLINE void RunBorderRows(const Image3F& in, const Rect& rect, + const int64_t ybegin, const int64_t yend, + const Weights& weights, Image3F* out) { + const int64_t stride = in.PixelsPerRow(); + for (int64_t y = ybegin; y < yend; ++y) { + for (size_t c = 0; c < 3; ++c) { + const WrapRowMirror wrap_row(in.Plane(c), rect.ysize()); + RunRow(rect.ConstPlaneRow(in, c, y), rect.xsize(), stride, + wrap_row, weights, out->PlaneRow(c, y)); + } + } + } + + template + static JXL_INLINE void RunInteriorRows(const ImageF& in, const Rect& rect, + const int64_t ybegin, + const int64_t yend, + const Weights& weights, + ThreadPool* pool, ImageF* out) { + const int64_t stride = in.PixelsPerRow(); + RunOnPool( + pool, ybegin, yend, ThreadPool::SkipInit(), + [&](const int y, int /*thread*/) HWY_ATTR { + RunRow(rect.ConstRow(in, y), rect.xsize(), stride, + WrapRowUnchanged(), weights, out->Row(y)); + }, + "Convolve"); + } + + // Image3F. 
+ template + static JXL_INLINE void RunInteriorRows(const Image3F& in, const Rect& rect, + const int64_t ybegin, + const int64_t yend, + const Weights& weights, + ThreadPool* pool, Image3F* out) { + const int64_t stride = in.PixelsPerRow(); + RunOnPool( + pool, ybegin, yend, ThreadPool::SkipInit(), + [&](const int y, int /*thread*/) HWY_ATTR { + for (size_t c = 0; c < 3; ++c) { + RunRow(rect.ConstPlaneRow(in, c, y), rect.xsize(), + stride, WrapRowUnchanged(), weights, + out->PlaneRow(c, y)); + } + }, + "Convolve3"); + } + + template + static JXL_INLINE void RunRows(const Image& in, const Rect& rect, + const Weights& weights, ThreadPool* pool, + Image* out) { + const int64_t ysize = rect.ysize(); + RunBorderRows(in, rect, 0, std::min(int64_t(kRadius), ysize), + weights, out); + if (ysize > 2 * int64_t(kRadius)) { + RunInteriorRows(in, rect, int64_t(kRadius), + ysize - int64_t(kRadius), weights, pool, out); + } + if (ysize > int64_t(kRadius)) { + RunBorderRows(in, rect, ysize - int64_t(kRadius), ysize, + weights, out); + } + } +}; + +void Symmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* out) { + using Conv = ConvolveT; + if (rect.xsize() >= Conv::MinWidth()) { + return Conv::Run(in, rect, weights, pool, out); + } + + return SlowSymmetric3(in, rect, weights, pool, out); +} + +// Symmetric5 is implemented above without ConvolveT. 
+
+// 5x5 separable convolution of `rect` within the single-plane image `in`,
+// written to `out`. Uses the vectorized ConvolveT path when the rect is at
+// least Conv::MinWidth() pixels wide (Conv::Run JXL_CHECKs that requirement);
+// narrower rects fall back to SlowSeparable5.
+void Separable5(const ImageF& in, const Rect& rect,
+                const WeightsSeparable5& weights, ThreadPool* pool,
+                ImageF* out) {
+  using Conv = ConvolveT<strategy::Separable5>;
+  if (rect.xsize() >= Conv::MinWidth()) {
+    return Conv::Run(in, rect, weights, pool, out);
+  }
+
+  return SlowSeparable5(in, rect, weights, pool, out);
+}
+// Three-plane (Image3F) overload of the 5x5 separable convolution; same
+// width-based choice between ConvolveT and SlowSeparable5 as above.
+void Separable5_3(const Image3F& in, const Rect& rect,
+                  const WeightsSeparable5& weights, ThreadPool* pool,
+                  Image3F* out) {
+  using Conv = ConvolveT<strategy::Separable5>;
+  if (rect.xsize() >= Conv::MinWidth()) {
+    return Conv::Run(in, rect, weights, pool, out);
+  }
+
+  return SlowSeparable5(in, rect, weights, pool, out);
+}
+
+// 7x7 separable convolution of `rect` within the single-plane image `in`,
+// written to `out`. Uses the vectorized ConvolveT path when the rect is at
+// least Conv::MinWidth() pixels wide; narrower rects fall back to
+// SlowSeparable7.
+void Separable7(const ImageF& in, const Rect& rect,
+                const WeightsSeparable7& weights, ThreadPool* pool,
+                ImageF* out) {
+  using Conv = ConvolveT<strategy::Separable7>;
+  if (rect.xsize() >= Conv::MinWidth()) {
+    return Conv::Run(in, rect, weights, pool, out);
+  }
+
+  return SlowSeparable7(in, rect, weights, pool, out);
+}
+// Three-plane (Image3F) overload of the 7x7 separable convolution; same
+// width-based choice between ConvolveT and SlowSeparable7 as above.
+void Separable7_3(const Image3F& in, const Rect& rect,
+                  const WeightsSeparable7& weights, ThreadPool* pool,
+                  Image3F* out) {
+  using Conv = ConvolveT<strategy::Separable7>;
+  if (rect.xsize() >= Conv::MinWidth()) {
+    return Conv::Run(in, rect, weights, pool, out);
+  }
+
+  return SlowSeparable7(in, rect, weights, pool, out);
+}
+
+// Semi-vectorized (interior pixels only); called directly like slow::, unlike
+// the fully vectorized strategies below.
+void Symmetric5(const ImageF& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out) { + PROFILER_FUNC; + + const size_t ysize = rect.ysize(); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t iy = task; + + if (iy < 2 || iy >= static_cast(ysize) - 2) { + Symmetric5BorderRow(in, rect, iy, weights, out->Row(iy)); + } else { + Symmetric5Row(in, rect, iy, weights, out->Row(iy)); + } + }, + "Symmetric5x5Convolution"); +} + +void Symmetric5_3(const Image3F& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + Image3F* JXL_RESTRICT out) { + PROFILER_FUNC; + + const size_t ysize = rect.ysize(); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const size_t iy = task; + + if (iy < 2 || iy >= ysize - 2) { + for (size_t c = 0; c < 3; ++c) { + Symmetric5BorderRow(in.Plane(c), rect, iy, weights, + out->PlaneRow(c, iy)); + } + } else { + for (size_t c = 0; c < 3; ++c) { + Symmetric5Row(in.Plane(c), rect, iy, weights, + out->PlaneRow(c, iy)); + } + } + }, + "Symmetric5x5Convolution3"); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(Symmetric3); +void Symmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* out) { + return HWY_DYNAMIC_DISPATCH(Symmetric3)(in, rect, weights, pool, out); +} + +HWY_EXPORT(Symmetric5); +void Symmetric5(const ImageF& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out) { + return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, rect, weights, pool, out); +} + +HWY_EXPORT(Symmetric5_3); +void Symmetric5_3(const Image3F& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + Image3F* JXL_RESTRICT 
out) { + return HWY_DYNAMIC_DISPATCH(Symmetric5_3)(in, rect, weights, pool, out); +} + +HWY_EXPORT(Separable5); +void Separable5(const ImageF& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out) { + return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out); +} + +HWY_EXPORT(Separable5_3); +void Separable5_3(const Image3F& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + Image3F* out) { + return HWY_DYNAMIC_DISPATCH(Separable5_3)(in, rect, weights, pool, out); +} + +HWY_EXPORT(Separable7); +void Separable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out) { + return HWY_DYNAMIC_DISPATCH(Separable7)(in, rect, weights, pool, out); +} + +HWY_EXPORT(Separable7_3); +void Separable7_3(const Image3F& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + Image3F* out) { + return HWY_DYNAMIC_DISPATCH(Separable7_3)(in, rect, weights, pool, out); +} + +//------------------------------------------------------------------------------ +// Kernels + +// Concentrates energy in low-frequency components (e.g. for antialiasing). +const WeightsSymmetric3& WeightsSymmetric3Lowpass() { + // Computed by research/convolve_weights.py's cubic spline approximations of + // prolate spheroidal wave functions. 
+ constexpr float w0 = 0.36208932f; + constexpr float w1 = 0.12820096f; + constexpr float w2 = 0.03127668f; + static constexpr WeightsSymmetric3 weights = { + {HWY_REP4(w0)}, {HWY_REP4(w1)}, {HWY_REP4(w2)}}; + return weights; +} + +const WeightsSeparable5& WeightsSeparable5Lowpass() { + constexpr float w0 = 0.41714928f; + constexpr float w1 = 0.25539268f; + constexpr float w2 = 0.03603267f; + static constexpr WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}}; + return weights; +} + +const WeightsSymmetric5& WeightsSymmetric5Lowpass() { + static constexpr WeightsSymmetric5 weights = { + {HWY_REP4(0.1740135f)}, {HWY_REP4(0.1065369f)}, {HWY_REP4(0.0150310f)}, + {HWY_REP4(0.0652254f)}, {HWY_REP4(0.0012984f)}, {HWY_REP4(0.0092025f)}}; + return weights; +} + +const WeightsSeparable5& WeightsSeparable5Gaussian1() { + constexpr float w0 = 0.38774f; + constexpr float w1 = 0.24477f; + constexpr float w2 = 0.06136f; + static constexpr WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}}; + return weights; +} + +const WeightsSeparable5& WeightsSeparable5Gaussian2() { + constexpr float w0 = 0.250301f; + constexpr float w1 = 0.221461f; + constexpr float w2 = 0.153388f; + static constexpr WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}}; + return weights; +} + +//------------------------------------------------------------------------------ +// Slow + +namespace { + +template +float SlowSymmetric3Pixel(const ImageF& in, const int64_t ix, const int64_t iy, + const int64_t xsize, const int64_t ysize, + const WeightsSymmetric3& weights) { + float sum = 0.0f; + + // ix: image; kx: kernel + for (int64_t ky = -1; ky <= 1; ky++) { + const int64_t y = WrapY()(iy + ky, ysize); + const float* JXL_RESTRICT row_in = in.ConstRow(static_cast(y)); + + const float wc = ky == 0 ? 
weights.c[0] : weights.r[0]; + const float wlr = ky == 0 ? weights.r[0] : weights.d[0]; + + const int64_t xm1 = WrapX()(ix - 1, xsize); + const int64_t xp1 = WrapX()(ix + 1, xsize); + sum += row_in[ix] * wc + (row_in[xm1] + row_in[xp1]) * wlr; + } + return sum; +} + +template +void SlowSymmetric3Row(const ImageF& in, const int64_t iy, const int64_t xsize, + const int64_t ysize, const WeightsSymmetric3& weights, + float* JXL_RESTRICT row_out) { + row_out[0] = + SlowSymmetric3Pixel(in, 0, iy, xsize, ysize, weights); + for (int64_t ix = 1; ix < xsize - 1; ix++) { + row_out[ix] = SlowSymmetric3Pixel(in, ix, iy, xsize, + ysize, weights); + } + { + const int64_t ix = xsize - 1; + row_out[ix] = SlowSymmetric3Pixel(in, ix, iy, xsize, + ysize, weights); + } +} + +} // namespace + +void SlowSymmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out) { + PROFILER_FUNC; + + const int64_t xsize = static_cast(rect.xsize()); + const int64_t ysize = static_cast(rect.ysize()); + const int64_t kRadius = 1; + + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t iy = task; + float* JXL_RESTRICT out_row = out->Row(static_cast(iy)); + + if (iy < kRadius || iy >= ysize - kRadius) { + SlowSymmetric3Row(in, iy, xsize, ysize, weights, out_row); + } else { + SlowSymmetric3Row(in, iy, xsize, ysize, weights, + out_row); + } + }, + "SlowSymmetric3"); +} + +void SlowSymmetric3(const Image3F& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + Image3F* JXL_RESTRICT out) { + PROFILER_FUNC; + + const int64_t xsize = static_cast(rect.xsize()); + const int64_t ysize = static_cast(rect.ysize()); + const int64_t kRadius = 1; + + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t iy = task; + const size_t oy = static_cast(iy); + + if (iy < kRadius || iy >= ysize - 
kRadius) { + for (size_t c = 0; c < 3; ++c) { + SlowSymmetric3Row(in.Plane(c), iy, xsize, ysize, + weights, out->PlaneRow(c, oy)); + } + } else { + for (size_t c = 0; c < 3; ++c) { + SlowSymmetric3Row(in.Plane(c), iy, xsize, ysize, + weights, out->PlaneRow(c, oy)); + } + } + }, + "SlowSymmetric3"); +} + +namespace { + +// Separable kernels, any radius. +float SlowSeparablePixel(const ImageF& in, const Rect& rect, const int64_t x, + const int64_t y, const int64_t radius, + const float* JXL_RESTRICT horz_weights, + const float* JXL_RESTRICT vert_weights) { + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + const WrapMirror wrap; + + float mul = 0.0f; + for (int dy = -radius; dy <= radius; ++dy) { + const float wy = vert_weights[std::abs(dy) * 4]; + const size_t sy = wrap(y + dy, ysize); + JXL_CHECK(sy < ysize); + const float* const JXL_RESTRICT row = rect.ConstRow(in, sy); + for (int dx = -radius; dx <= radius; ++dx) { + const float wx = horz_weights[std::abs(dx) * 4]; + const size_t sx = wrap(x + dx, xsize); + JXL_CHECK(sx < xsize); + mul += row[sx] * wx * wy; + } + } + return mul; +} + +} // namespace + +void SlowSeparable5(const ImageF& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out) { + PROFILER_FUNC; + const float* horz_weights = &weights.horz[0]; + const float* vert_weights = &weights.vert[0]; + + const size_t ysize = rect.ysize(); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + + float* const JXL_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < rect.xsize(); ++x) { + row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/2, + horz_weights, vert_weights); + } + }, + "SlowSeparable5"); +} + +void SlowSeparable5(const Image3F& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + Image3F* out) { + for (size_t c = 0; c < 3; ++c) { + SlowSeparable5(in.Plane(c), rect, weights, 
pool, &out->Plane(c)); + } +} + +void SlowSeparable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out) { + PROFILER_FUNC; + const float* horz_weights = &weights.horz[0]; + const float* vert_weights = &weights.vert[0]; + + const size_t ysize = rect.ysize(); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + + float* const JXL_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < rect.xsize(); ++x) { + row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/3, + horz_weights, vert_weights); + } + }, + "SlowSeparable7"); +} + +void SlowSeparable7(const Image3F& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + Image3F* out) { + for (size_t c = 0; c < 3; ++c) { + SlowSeparable7(in.Plane(c), rect, weights, pool, &out->Plane(c)); + } +} + +void SlowLaplacian5(const ImageF& in, const Rect& rect, ThreadPool* pool, + ImageF* out) { + PROFILER_FUNC; + JXL_CHECK(SameSize(rect, *out)); + + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + const WrapMirror wrap; + + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + + const float* const JXL_RESTRICT row_t = + rect.ConstRow(in, wrap(y - 2, ysize)); + const float* const JXL_RESTRICT row_m = rect.ConstRow(in, y); + const float* const JXL_RESTRICT row_b = + rect.ConstRow(in, wrap(y + 2, ysize)); + float* const JXL_RESTRICT row_out = out->Row(y); + + for (int64_t x = 0; static_cast(x) < xsize; ++x) { + const int64_t xm2 = wrap(x - 2, xsize); + const int64_t xp2 = wrap(x + 2, xsize); + float r = 0.0f; + r += /* */ 1.0f * row_t[x]; + r += 1.0f * row_m[xm2] - 4.0f * row_m[x] + 1.0f * row_m[xp2]; + r += /* */ 1.0f * row_b[x]; + row_out[x] = r; + } + }, + "SlowLaplacian5"); +} + +void SlowLaplacian5(const Image3F& in, const Rect& rect, ThreadPool* pool, + 
Image3F* out) { + for (size_t c = 0; c < 3; ++c) { + SlowLaplacian5(in.Plane(c), rect, pool, &out->Plane(c)); + } +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.h new file mode 100644 index 0000000000..c2e2ae42fb --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve.h @@ -0,0 +1,131 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_CONVOLVE_H_ +#define LIB_JXL_CONVOLVE_H_ + +// 2D convolution. + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// No valid values outside [0, xsize), but the strategy may still safely load +// the preceding vector, and/or round xsize up to the vector lane count. This +// avoids needing PadImage. +// Requires xsize >= kConvolveLanes + kConvolveMaxRadius. +static constexpr size_t kConvolveMaxRadius = 3; + +// Weights must already be normalized. + +struct WeightsSymmetric3 { + // d r d (each replicated 4x) + // r c r + // d r d + float c[4]; + float r[4]; + float d[4]; +}; + +struct WeightsSymmetric5 { + // The lower-right quadrant is: c r R (each replicated 4x) + // r d L + // R L D + float c[4]; + float r[4]; + float R[4]; + float d[4]; + float D[4]; + float L[4]; +}; + +// Weights for separable 5x5 filters (typically but not necessarily the same +// values for horizontal and vertical directions). The kernel must already be +// normalized, but note that values for negative offsets are omitted, so the +// given values do not sum to 1. +struct WeightsSeparable5 { + // Horizontal 1D, distances 0..2 (each replicated 4x) + float horz[3 * 4]; + float vert[3 * 4]; +}; + +// Weights for separable 7x7 filters (typically but not necessarily the same +// values for horizontal and vertical directions). 
The kernel must already be +// normalized, but note that values for negative offsets are omitted, so the +// given values do not sum to 1. +// +// NOTE: for >= 7x7 Gaussian kernels, it is faster to use FastGaussian instead, +// at least when images exceed the L1 cache size. +struct WeightsSeparable7 { + // Horizontal 1D, distances 0..3 (each replicated 4x) + float horz[4 * 4]; + float vert[4 * 4]; +}; + +const WeightsSymmetric3& WeightsSymmetric3Lowpass(); +const WeightsSeparable5& WeightsSeparable5Lowpass(); +const WeightsSymmetric5& WeightsSymmetric5Lowpass(); + +void SlowSymmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out); +void SlowSymmetric3(const Image3F& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + Image3F* JXL_RESTRICT out); + +void SlowSeparable5(const ImageF& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out); +void SlowSeparable5(const Image3F& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + Image3F* out); + +void SlowSeparable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out); +void SlowSeparable7(const Image3F& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + Image3F* out); + +void SlowLaplacian5(const ImageF& in, const Rect& rect, ThreadPool* pool, + ImageF* out); +void SlowLaplacian5(const Image3F& in, const Rect& rect, ThreadPool* pool, + Image3F* out); + +void Symmetric3(const ImageF& in, const Rect& rect, + const WeightsSymmetric3& weights, ThreadPool* pool, + ImageF* out); + +void Symmetric5(const ImageF& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + ImageF* JXL_RESTRICT out); + +void Symmetric5_3(const Image3F& in, const Rect& rect, + const WeightsSymmetric5& weights, ThreadPool* pool, + Image3F* JXL_RESTRICT out); + +void Separable5(const ImageF& in, 
const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + ImageF* out); + +void Separable5_3(const Image3F& in, const Rect& rect, + const WeightsSeparable5& weights, ThreadPool* pool, + Image3F* out); + +void Separable7(const ImageF& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + ImageF* out); + +void Separable7_3(const Image3F& in, const Rect& rect, + const WeightsSeparable7& weights, ThreadPool* pool, + Image3F* out); + +} // namespace jxl + +#endif // LIB_JXL_CONVOLVE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve_test.cc new file mode 100644 index 0000000000..45e7e45f10 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/convolve_test.cc @@ -0,0 +1,250 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/convolve.h" + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/convolve_test.cc" +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" + +#ifndef JXL_DEBUG_CONVOLVE +#define JXL_DEBUG_CONVOLVE 0 +#endif + +#include "lib/jxl/convolve-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +void TestNeighbors() { + const Neighbors::D d; + const Neighbors::V v = Iota(d, 0); + HWY_ALIGN float actual[hwy::kTestMaxVectorSize / sizeof(float)] = {0}; + + HWY_ALIGN float first_l1[hwy::kTestMaxVectorSize / sizeof(float)] = { + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; + Store(Neighbors::FirstL1(v), d, actual); + const size_t N = Lanes(d); + EXPECT_EQ(std::vector(first_l1, first_l1 + N), + std::vector(actual, actual 
+ N)); + +#if HWY_TARGET != HWY_SCALAR + HWY_ALIGN float first_l2[hwy::kTestMaxVectorSize / sizeof(float)] = { + 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}; + Store(Neighbors::FirstL2(v), d, actual); + EXPECT_EQ(std::vector(first_l2, first_l2 + N), + std::vector(actual, actual + N)); + + HWY_ALIGN float first_l3[hwy::kTestMaxVectorSize / sizeof(float)] = { + 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + Store(Neighbors::FirstL3(v), d, actual); + EXPECT_EQ(std::vector(first_l3, first_l3 + N), + std::vector(actual, actual + N)); +#endif // HWY_TARGET != HWY_SCALAR +} + +template +void VerifySymmetric3(const size_t xsize, const size_t ysize, ThreadPool* pool, + Random* rng) { + const Rect rect(0, 0, xsize, ysize); + + ImageF in(xsize, ysize); + GenerateImage(GeneratorRandom(rng, 1.0f), &in); + + ImageF out_expected(xsize, ysize); + ImageF out_actual(xsize, ysize); + + const WeightsSymmetric3& weights = WeightsSymmetric3Lowpass(); + Symmetric3(in, rect, weights, pool, &out_expected); + SlowSymmetric3(in, rect, weights, pool, &out_actual); + + VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f); +} + +// Ensures Symmetric and Separable give the same result. 
+template +void VerifySymmetric5(const size_t xsize, const size_t ysize, ThreadPool* pool, + Random* rng) { + const Rect rect(0, 0, xsize, ysize); + + ImageF in(xsize, ysize); + GenerateImage(GeneratorRandom(rng, 1.0f), &in); + + ImageF out_expected(xsize, ysize); + ImageF out_actual(xsize, ysize); + + Separable5(in, Rect(in), WeightsSeparable5Lowpass(), pool, &out_expected); + Symmetric5(in, rect, WeightsSymmetric5Lowpass(), pool, &out_actual); + + VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f); +} + +template +void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool, + Random* rng) { + const Rect rect(0, 0, xsize, ysize); + + ImageF in(xsize, ysize); + GenerateImage(GeneratorRandom(rng, 1.0f), &in); + + ImageF out_expected(xsize, ysize); + ImageF out_actual(xsize, ysize); + + const WeightsSeparable5& weights = WeightsSeparable5Lowpass(); + Separable5(in, Rect(in), weights, pool, &out_expected); + SlowSeparable5(in, rect, weights, pool, &out_actual); + + VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f); +} + +template +void VerifySeparable7(const size_t xsize, const size_t ysize, ThreadPool* pool, + Random* rng) { + const Rect rect(0, 0, xsize, ysize); + + ImageF in(xsize, ysize); + GenerateImage(GeneratorRandom(rng, 1.0f), &in); + + ImageF out_expected(xsize, ysize); + ImageF out_actual(xsize, ysize); + + // Gaussian sigma 1.0 + const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f), + HWY_REP4(0.060626f), HWY_REP4(0.00598f)}, + {HWY_REP4(0.383103f), HWY_REP4(0.241843f), + HWY_REP4(0.060626f), HWY_REP4(0.00598f)}}; + + SlowSeparable7(in, rect, weights, pool, &out_expected); + Separable7(in, Rect(in), weights, pool, &out_actual); + + VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f); +} + +// For all xsize/ysize and kernels: +void TestConvolve() { + TestNeighbors(); + + ThreadPoolInternal pool(4); + pool.Run(kConvolveMaxRadius, 40, ThreadPool::SkipInit(), + [](const int task, int 
/*thread*/) { + const size_t xsize = task; + std::mt19937_64 rng(129 + 13 * xsize); + + ThreadPool* null_pool = nullptr; + ThreadPoolInternal pool3(3); + for (size_t ysize = kConvolveMaxRadius; ysize < 16; ++ysize) { + JXL_DEBUG(JXL_DEBUG_CONVOLVE, + "%zu x %zu (target %d)===============================", + xsize, ysize, HWY_TARGET); + + JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sym3------------------"); + VerifySymmetric3(xsize, ysize, null_pool, &rng); + VerifySymmetric3(xsize, ysize, &pool3, &rng); + + JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sym5------------------"); + VerifySymmetric5(xsize, ysize, null_pool, &rng); + VerifySymmetric5(xsize, ysize, &pool3, &rng); + + JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep5------------------"); + VerifySeparable5(xsize, ysize, null_pool, &rng); + VerifySeparable5(xsize, ysize, &pool3, &rng); + + JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep7------------------"); + VerifySeparable7(xsize, ysize, null_pool, &rng); + VerifySeparable7(xsize, ysize, &pool3, &rng); + } + }); +} + +// Measures durations, verifies results, prints timings. `unpredictable1` +// must have value 1 (unknown to the compiler to prevent elision). 
+template +void BenchmarkConv(const char* caption, const Conv& conv, + const hwy::FuncInput unpredictable1) { + const size_t kNumInputs = 1; + const hwy::FuncInput inputs[kNumInputs] = {unpredictable1}; + hwy::Result results[kNumInputs]; + + const size_t kDim = 160; // in+out fit in L2 + ImageF in(kDim, kDim); + ZeroFillImage(&in); + in.Row(kDim / 2)[kDim / 2] = unpredictable1; + ImageF out(kDim, kDim); + + hwy::Params p; + p.verbose = false; + p.max_evals = 7; + p.target_rel_mad = 0.002; + const size_t num_results = MeasureClosure( + [&in, &conv, &out](const hwy::FuncInput input) { + conv(in, &out); + return out.Row(input)[0]; + }, + inputs, kNumInputs, results, p); + if (num_results != kNumInputs) { + fprintf(stderr, "MeasureClosure failed.\n"); + } + for (size_t i = 0; i < num_results; ++i) { + const double seconds = static_cast(results[i].ticks) / + hwy::platform::InvariantTicksPerSecond(); + printf("%12s: %7.2f MP/s (MAD=%4.2f%%)\n", caption, + kDim * kDim * 1E-6 / seconds, + static_cast(results[i].variability) * 100.0); + } +} + +struct ConvSymmetric3 { + void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) const { + ThreadPool* null_pool = nullptr; + Symmetric3(in, Rect(in), WeightsSymmetric3Lowpass(), null_pool, out); + } +}; + +struct ConvSeparable5 { + void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) const { + ThreadPool* null_pool = nullptr; + Separable5(in, Rect(in), WeightsSeparable5Lowpass(), null_pool, out); + } +}; + +void BenchmarkAll() { +#if 0 // disabled to avoid test timeouts, run manually on demand + const hwy::FuncInput unpredictable1 = time(nullptr) != 1234; + BenchmarkConv("Symmetric3", ConvSymmetric3(), unpredictable1); + BenchmarkConv("Separable5", ConvSeparable5(), unpredictable1); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class ConvolveTest : public hwy::TestWithParamTarget {}; 
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(ConvolveTest); + +HWY_EXPORT_AND_TEST_P(ConvolveTest, TestConvolve); + +HWY_EXPORT_AND_TEST_P(ConvolveTest, BenchmarkAll); + +} // namespace jxl +#endif diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/data_parallel_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/data_parallel_test.cc new file mode 100644 index 0000000000..63db1f8ca0 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/data_parallel_test.cc @@ -0,0 +1,111 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/data_parallel.h" + +#include "gtest/gtest.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/test_utils.h" + +namespace jxl { +namespace { + +class DataParallelTest : public ::testing::Test { + protected: + // A fake class to verify that DataParallel is properly calling the + // client-provided runner functions. + static int FakeRunner(void* runner_opaque, void* jpegxl_opaque, + JxlParallelRunInit init, JxlParallelRunFunction func, + uint32_t start_range, uint32_t end_range) { + DataParallelTest* self = static_cast(runner_opaque); + self->runner_called_++; + self->jpegxl_opaque_ = jpegxl_opaque; + self->init_ = init; + self->func_ = func; + self->start_range_ = start_range; + self->end_range_ = end_range; + return self->runner_return_; + } + + ThreadPool pool_{&DataParallelTest::FakeRunner, this}; + + // Number of times FakeRunner() was called. + int runner_called_ = 0; + + // Parameters passed to FakeRunner. + void* jpegxl_opaque_ = nullptr; + JxlParallelRunInit init_ = nullptr; + JxlParallelRunFunction func_ = nullptr; + uint32_t start_range_ = -1; + uint32_t end_range_ = -1; + + // Return value that FakeRunner will return. + int runner_return_ = 0; +}; + +// JxlParallelRunInit interface. 
+typedef int (*JxlParallelRunInit)(); +int TestInit(void* jpegxl_opaque, size_t num_threads) { return 0; } + +} // namespace + +TEST_F(DataParallelTest, RunnerCalledParamenters) { + EXPECT_TRUE(pool_.Run( + 1234, 5678, [](const size_t num_threads) { return true; }, + [](const int task, const int thread) { return; })); + EXPECT_EQ(1, runner_called_); + EXPECT_NE(nullptr, init_); + EXPECT_NE(nullptr, func_); + EXPECT_NE(nullptr, jpegxl_opaque_); + EXPECT_EQ(1234u, start_range_); + EXPECT_EQ(5678u, end_range_); +} + +TEST_F(DataParallelTest, RunnerFailurePropagates) { + runner_return_ = -1; // FakeRunner return value. + EXPECT_FALSE(pool_.Run( + 1234, 5678, [](const size_t num_threads) { return false; }, + [](const int task, const int thread) { return; })); + EXPECT_FALSE(RunOnPool( + nullptr, 1234, 5678, [](const size_t num_threads) { return false; }, + [](const int task, const int thread) { return; }, "Test")); +} + +TEST_F(DataParallelTest, RunnerNotCalledOnEmptyRange) { + runner_return_ = -1; // FakeRunner return value. + EXPECT_TRUE(pool_.Run( + 123, 123, [](const size_t num_threads) { return false; }, + [](const int task, const int thread) { return; })); + EXPECT_TRUE(RunOnPool( + nullptr, 123, 123, [](const size_t num_threads) { return false; }, + [](const int task, const int thread) { return; }, "Test")); + // We don't call the external runner when the range is empty. We don't even + // need to call the init function. + EXPECT_EQ(0, runner_called_); +} + +// The TestDivider is slow when compiled in debug mode. +TEST_F(DataParallelTest, JXL_SLOW_TEST(TestDivider)) { + jxl::ThreadPoolInternal pool(8); + // 1, 2 are powers of two. + pool.Run(3, 2 * 1024, ThreadPool::SkipInit(), + [](const int d, const int thread) { + // powers of two are not supported. 
+ if ((d & (d - 1)) == 0) return; + + const Divider div(d); +#ifdef NDEBUG + const int max_dividend = 4 * 1024 * 1024; +#else + const int max_dividend = 2 * 1024 + 1; +#endif + for (int x = 0; x < max_dividend; ++x) { + const int q = div(x); + ASSERT_EQ(x / d, q) << x << "/" << d; + } + }); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct-inl.h new file mode 100644 index 0000000000..ecc3935a5d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct-inl.h @@ -0,0 +1,361 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fast SIMD floating-point (I)DCT, any power of two. + +#if defined(LIB_JXL_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DCT_INL_H_ +#undef LIB_JXL_DCT_INL_H_ +#else +#define LIB_JXL_DCT_INL_H_ +#endif + +#include + +#include + +#include "lib/jxl/dct_block-inl.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/transpose-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +template +struct FVImpl { + using type = HWY_CAPPED(float, SZ); +}; + +template <> +struct FVImpl<0> { + using type = HWY_FULL(float); +}; + +template +using FV = typename FVImpl::type; + +// Implementation of Lowest Complexity Self Recursive Radix-2 DCT II/III +// Algorithms, by Siriani M. Perera and Jianhua Liu. 
+ +template +struct CoeffBundle { + static void AddReverse(const float* JXL_RESTRICT ain1, + const float* JXL_RESTRICT ain2, + float* JXL_RESTRICT aout) { + for (size_t i = 0; i < N; i++) { + auto in1 = Load(FV(), ain1 + i * SZ); + auto in2 = Load(FV(), ain2 + (N - i - 1) * SZ); + Store(in1 + in2, FV(), aout + i * SZ); + } + } + static void SubReverse(const float* JXL_RESTRICT ain1, + const float* JXL_RESTRICT ain2, + float* JXL_RESTRICT aout) { + for (size_t i = 0; i < N; i++) { + auto in1 = Load(FV(), ain1 + i * SZ); + auto in2 = Load(FV(), ain2 + (N - i - 1) * SZ); + Store(in1 - in2, FV(), aout + i * SZ); + } + } + static void B(float* JXL_RESTRICT coeff) { + auto sqrt2 = Set(FV(), square_root<2>::value); + auto in1 = Load(FV(), coeff); + auto in2 = Load(FV(), coeff + SZ); + Store(MulAdd(in1, sqrt2, in2), FV(), coeff); + for (size_t i = 1; i + 1 < N; i++) { + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (i + 1) * SZ); + Store(in1 + in2, FV(), coeff + i * SZ); + } + } + static void BTranspose(float* JXL_RESTRICT coeff) { + for (size_t i = N - 1; i > 0; i--) { + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (i - 1) * SZ); + Store(in1 + in2, FV(), coeff + i * SZ); + } + auto sqrt2 = Set(FV(), square_root<2>::value); + auto in1 = Load(FV(), coeff); + Store(in1 * sqrt2, FV(), coeff); + } + // Ideally optimized away by compiler (except the multiply). + static void InverseEvenOdd(const float* JXL_RESTRICT ain, + float* JXL_RESTRICT aout) { + for (size_t i = 0; i < N / 2; i++) { + auto in1 = Load(FV(), ain + i * SZ); + Store(in1, FV(), aout + 2 * i * SZ); + } + for (size_t i = N / 2; i < N; i++) { + auto in1 = Load(FV(), ain + i * SZ); + Store(in1, FV(), aout + (2 * (i - N / 2) + 1) * SZ); + } + } + // Ideally optimized away by compiler. 
+ static void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride, + float* JXL_RESTRICT aout) { + for (size_t i = 0; i < N / 2; i++) { + auto in1 = LoadU(FV(), ain + 2 * i * ain_stride); + Store(in1, FV(), aout + i * SZ); + } + for (size_t i = N / 2; i < N; i++) { + auto in1 = LoadU(FV(), ain + (2 * (i - N / 2) + 1) * ain_stride); + Store(in1, FV(), aout + i * SZ); + } + } + // Invoked on full vector. + static void Multiply(float* JXL_RESTRICT coeff) { + for (size_t i = 0; i < N / 2; i++) { + auto in1 = Load(FV(), coeff + (N / 2 + i) * SZ); + auto mul = Set(FV(), WcMultipliers::kMultipliers[i]); + Store(in1 * mul, FV(), coeff + (N / 2 + i) * SZ); + } + } + static void MultiplyAndAdd(const float* JXL_RESTRICT coeff, + float* JXL_RESTRICT out, size_t out_stride) { + for (size_t i = 0; i < N / 2; i++) { + auto mul = Set(FV(), WcMultipliers::kMultipliers[i]); + auto in1 = Load(FV(), coeff + i * SZ); + auto in2 = Load(FV(), coeff + (N / 2 + i) * SZ); + auto out1 = MulAdd(mul, in2, in1); + auto out2 = NegMulAdd(mul, in2, in1); + StoreU(out1, FV(), out + i * out_stride); + StoreU(out2, FV(), out + (N - i - 1) * out_stride); + } + } + template + static void LoadFromBlock(const Block& in, size_t off, + float* JXL_RESTRICT coeff) { + for (size_t i = 0; i < N; i++) { + Store(in.LoadPart(FV(), i, off), FV(), coeff + i * SZ); + } + } + template + static void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, + const Block& out, size_t off) { + auto mul = Set(FV(), 1.0f / N); + for (size_t i = 0; i < N; i++) { + out.StorePart(FV(), mul * Load(FV(), coeff + i * SZ), i, off); + } + } +}; + +template +struct DCT1DImpl; + +template +struct DCT1DImpl<1, SZ> { + JXL_INLINE void operator()(float* JXL_RESTRICT mem) {} +}; + +template +struct DCT1DImpl<2, SZ> { + JXL_INLINE void operator()(float* JXL_RESTRICT mem) { + auto in1 = Load(FV(), mem); + auto in2 = Load(FV(), mem + SZ); + Store(in1 + in2, FV(), mem); + Store(in1 - in2, FV(), mem + SZ); + } +}; + +template 
+struct DCT1DImpl { + void operator()(float* JXL_RESTRICT mem) { + // This is relatively small (4kB with 64-DCT and AVX-512) + HWY_ALIGN float tmp[N * SZ]; + CoeffBundle::AddReverse(mem, mem + N / 2 * SZ, tmp); + DCT1DImpl()(tmp); + CoeffBundle::SubReverse(mem, mem + N / 2 * SZ, tmp + N / 2 * SZ); + CoeffBundle::Multiply(tmp); + DCT1DImpl()(tmp + N / 2 * SZ); + CoeffBundle::B(tmp + N / 2 * SZ); + CoeffBundle::InverseEvenOdd(tmp, mem); + } +}; + +template +struct IDCT1DImpl; + +template +struct IDCT1DImpl<1, SZ> { + JXL_INLINE void operator()(const float* from, size_t from_stride, float* to, + size_t to_stride) { + StoreU(LoadU(FV(), from), FV(), to); + } +}; + +template +struct IDCT1DImpl<2, SZ> { + JXL_INLINE void operator()(const float* from, size_t from_stride, float* to, + size_t to_stride) { + JXL_DASSERT(from_stride >= SZ); + JXL_DASSERT(to_stride >= SZ); + auto in1 = LoadU(FV(), from); + auto in2 = LoadU(FV(), from + from_stride); + StoreU(in1 + in2, FV(), to); + StoreU(in1 - in2, FV(), to + to_stride); + } +}; + +template +struct IDCT1DImpl { + void operator()(const float* from, size_t from_stride, float* to, + size_t to_stride) { + JXL_DASSERT(from_stride >= SZ); + JXL_DASSERT(to_stride >= SZ); + // This is relatively small (4kB with 64-DCT and AVX-512) + HWY_ALIGN float tmp[N * SZ]; + CoeffBundle::ForwardEvenOdd(from, from_stride, tmp); + IDCT1DImpl()(tmp, SZ, tmp, SZ); + CoeffBundle::BTranspose(tmp + N / 2 * SZ); + IDCT1DImpl()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ); + CoeffBundle::MultiplyAndAdd(tmp, to, to_stride); + } +}; + +template +void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) { + size_t M = M_or_0 != 0 ? 
M_or_0 : Mp; + constexpr size_t SZ = MaxLanes(FV()); + HWY_ALIGN float tmp[N * SZ]; + for (size_t i = 0; i < M; i += Lanes(FV())) { + // TODO(veluca): consider removing the temporary memory here (as is done in + // IDCT), if it turns out that some compilers don't optimize away the loads + // and this is performance-critical. + CoeffBundle::LoadFromBlock(from, i, tmp); + DCT1DImpl()(tmp); + CoeffBundle::StoreToBlockAndScale(tmp, to, i); + } +} + +template +void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) { + size_t M = M_or_0 != 0 ? M_or_0 : Mp; + constexpr size_t SZ = MaxLanes(FV()); + for (size_t i = 0; i < M; i += Lanes(FV())) { + IDCT1DImpl()(from.Address(0, i), from.Stride(), to.Address(0, i), + to.Stride()); + } +} + +template +struct DCT1D { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return DCT1DWrapper(from, to, M); + } +}; + +template +struct DCT1D MaxLanes(FV<0>()))>::type> { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return NoInlineWrapper(DCT1DWrapper, from, to, M); + } +}; + +template +struct IDCT1D { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return IDCT1DWrapper(from, to, M); + } +}; + +template +struct IDCT1D MaxLanes(FV<0>()))>::type> { + template + void operator()(const FromBlock& from, const ToBlock& to) { + return NoInlineWrapper(IDCT1DWrapper, from, to, + M); + } +}; + +// Computes the in-place NxN transposed-scaled-DCT (tsDCT) of block. +// Requires that block is HWY_ALIGN'ed. +// +// See also DCTSlow, ComputeDCT +template +struct ComputeTransposedScaledDCT { + // scratch_space must be aligned, and should have space for N*N floats. 
+ template + HWY_MAYBE_UNUSED void operator()(const From& from, float* JXL_RESTRICT to, + float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; + DCT1D()(from, DCTTo(to, N)); + Transpose::Run(DCTFrom(to, N), DCTTo(block, N)); + DCT1D()(DCTFrom(block, N), DCTTo(to, N)); + } +}; + +// Computes the in-place NxN transposed-scaled-iDCT (tsIDCT)of block. +// Requires that block is HWY_ALIGN'ed. +// +// See also IDCTSlow, ComputeIDCT. + +template +struct ComputeTransposedScaledIDCT { + // scratch_space must be aligned, and should have space for N*N floats. + template + HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to, + float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; + IDCT1D()(DCTFrom(from, N), DCTTo(block, N)); + Transpose::Run(DCTFrom(block, N), DCTTo(from, N)); + IDCT1D()(DCTFrom(from, N), to); + } +}; +// Computes the non-transposed, scaled DCT of a block, that needs to be +// HWY_ALIGN'ed. Used for rectangular blocks. +template +struct ComputeScaledDCT { + // scratch_space must be aligned, and should have space for ROWS*COLS + // floats. + template + HWY_MAYBE_UNUSED void operator()(const From& from, float* to, + float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; + if (ROWS < COLS) { + DCT1D()(from, DCTTo(block, COLS)); + Transpose::Run(DCTFrom(block, COLS), DCTTo(to, ROWS)); + DCT1D()(DCTFrom(to, ROWS), DCTTo(block, ROWS)); + Transpose::Run(DCTFrom(block, ROWS), DCTTo(to, COLS)); + } else { + DCT1D()(from, DCTTo(to, COLS)); + Transpose::Run(DCTFrom(to, COLS), DCTTo(block, ROWS)); + DCT1D()(DCTFrom(block, ROWS), DCTTo(to, ROWS)); + } + } +}; +// Computes the non-transposed, scaled DCT of a block, that needs to be +// HWY_ALIGN'ed. Used for rectangular blocks. +template +struct ComputeScaledIDCT { + // scratch_space must be aligned, and should have space for ROWS*COLS + // floats. 
+ template + HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to, + float* JXL_RESTRICT scratch_space) { + float* JXL_RESTRICT block = scratch_space; + // Reverse the steps done in ComputeScaledDCT. + if (ROWS < COLS) { + Transpose::Run(DCTFrom(from, COLS), DCTTo(block, ROWS)); + IDCT1D()(DCTFrom(block, ROWS), DCTTo(from, ROWS)); + Transpose::Run(DCTFrom(from, ROWS), DCTTo(block, COLS)); + IDCT1D()(DCTFrom(block, COLS), to); + } else { + IDCT1D()(DCTFrom(from, ROWS), DCTTo(block, ROWS)); + Transpose::Run(DCTFrom(block, ROWS), DCTTo(from, COLS)); + IDCT1D()(DCTFrom(from, COLS), to); + } + } +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); +#endif // LIB_JXL_DCT_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_block-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_block-inl.h new file mode 100644 index 0000000000..179647838d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_block-inl.h @@ -0,0 +1,108 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Adapters for DCT input/output: from/to contiguous blocks or image rows. + +#if defined(LIB_JXL_DCT_BLOCK_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DCT_BLOCK_INL_H_ +#undef LIB_JXL_DCT_BLOCK_INL_H_ +#else +#define LIB_JXL_DCT_BLOCK_INL_H_ +#endif + +#include + +#include + +#include "lib/jxl/base/status.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Vec; + +// Block: (x, y) <-> (N * y + x) +// Lines: (x, y) <-> (stride * y + x) +// +// I.e. Block is a specialization of Lines with fixed stride. +// +// FromXXX should implement Read and Load (Read vector). 
+// ToXXX should implement Write and Store (Write vector). + +template +using BlockDesc = HWY_CAPPED(float, N); + +// Here and in the following, the SZ template parameter specifies the number of +// values to load/store. Needed because we want to handle 4x4 sub-blocks of +// 16x16 blocks. +class DCTFrom { + public: + DCTFrom(const float* data, size_t stride) : stride_(stride), data_(data) {} + + template + HWY_INLINE Vec LoadPart(D, const size_t row, size_t i) const { + JXL_DASSERT(Lanes(D()) <= stride_); + // Since these functions are used also for DC, no alignment at all is + // guaranteed in the case of floating blocks. + // TODO(veluca): consider using a different class for DC-to-LF and + // DC-from-LF, or copying DC values to/from a temporary aligned location. + return LoadU(D(), Address(row, i)); + } + + HWY_INLINE float Read(const size_t row, const size_t i) const { + return *Address(row, i); + } + + HWY_INLINE const float* Address(const size_t row, + const size_t i) const { + return data_ + row * stride_ + i; + } + + size_t Stride() const { return stride_; } + + private: + size_t stride_; + const float* JXL_RESTRICT data_; +}; + +class DCTTo { + public: + DCTTo(float* data, size_t stride) : stride_(stride), data_(data) {} + + template + HWY_INLINE void StorePart(D, const Vec& v, const size_t row, + size_t i) const { + JXL_DASSERT(Lanes(D()) <= stride_); + // Since these functions are used also for DC, no alignment at all is + // guaranteed in the case of floating blocks. + // TODO(veluca): consider using a different class for DC-to-LF and + // DC-from-LF, or copying DC values to/from a temporary aligned location. 
+ StoreU(v, D(), Address(row, i)); + } + + HWY_INLINE void Write(float v, const size_t row, const size_t i) const { + *Address(row, i) = v; + } + + HWY_INLINE float* Address(const size_t row, const size_t i) const { + return data_ + row * stride_ + i; + } + + size_t Stride() const { return stride_; } + + private: + size_t stride_; + float* JXL_RESTRICT data_; +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_DCT_BLOCK_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_for_test.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_for_test.h new file mode 100644 index 0000000000..8e32aa7eff --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_for_test.h @@ -0,0 +1,99 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DCT_FOR_TEST_H_ +#define LIB_JXL_DCT_FOR_TEST_H_ + +// Unoptimized DCT only for use in tests. + +#include // memcpy + +#include +#include + +#include "lib/jxl/common.h" // Pi + +namespace jxl { + +namespace test { +static inline double alpha(int u) { return u == 0 ? 0.7071067811865475 : 1.0; } + +// N-DCT on M columns, divided by sqrt(N). Matches the definition in the spec. +template +void DCT1D(double block[N * M], double out[N * M]) { + std::vector matrix(N * N); + const double scale = std::sqrt(2.0) / N; + for (size_t y = 0; y < N; y++) { + for (size_t u = 0; u < N; u++) { + matrix[N * u + y] = alpha(u) * cos((y + 0.5) * u * Pi(1.0 / N)) * scale; + } + } + for (size_t x = 0; x < M; x++) { + for (size_t u = 0; u < N; u++) { + out[M * u + x] = 0; + for (size_t y = 0; y < N; y++) { + out[M * u + x] += matrix[N * u + y] * block[M * y + x]; + } + } + } +} + +// N-IDCT on M columns, multiplied by sqrt(N). Matches the definition in the +// spec. 
+template +void IDCT1D(double block[N * M], double out[N * M]) { + std::vector matrix(N * N); + const double scale = std::sqrt(2.0); + for (size_t y = 0; y < N; y++) { + for (size_t u = 0; u < N; u++) { + // Transpose of DCT matrix. + matrix[N * y + u] = alpha(u) * cos((y + 0.5) * u * Pi(1.0 / N)) * scale; + } + } + for (size_t x = 0; x < M; x++) { + for (size_t u = 0; u < N; u++) { + out[M * u + x] = 0; + for (size_t y = 0; y < N; y++) { + out[M * u + x] += matrix[N * u + y] * block[M * y + x]; + } + } + } +} + +template +void TransposeBlock(double in[N * M], double out[M * N]) { + for (size_t x = 0; x < N; x++) { + for (size_t y = 0; y < M; y++) { + out[y * N + x] = in[x * M + y]; + } + } +} +} // namespace test + +// Untransposed DCT. +template +void DCTSlow(double block[N * N]) { + constexpr size_t kBlockSize = N * N; + std::vector g(kBlockSize); + test::DCT1D(block, g.data()); + test::TransposeBlock(g.data(), block); + test::DCT1D(block, g.data()); + test::TransposeBlock(g.data(), block); +} + +// Untransposed IDCT. +template +void IDCTSlow(double block[N * N]) { + constexpr size_t kBlockSize = N * N; + std::vector g(kBlockSize); + test::IDCT1D(block, g.data()); + test::TransposeBlock(g.data(), block); + test::IDCT1D(block, g.data()); + test::TransposeBlock(g.data(), block); +} + +} // namespace jxl + +#endif // LIB_JXL_DCT_FOR_TEST_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc new file mode 100644 index 0000000000..f9e89a6014 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.cc @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dct_scales.h" + +namespace jxl { + +// Definition of constexpr arrays. 
+constexpr float DCTResampleScales<1, 8>::kScales[]; +constexpr float DCTResampleScales<2, 16>::kScales[]; +constexpr float DCTResampleScales<4, 32>::kScales[]; +constexpr float DCTResampleScales<8, 64>::kScales[]; +constexpr float DCTResampleScales<16, 128>::kScales[]; +constexpr float DCTResampleScales<32, 256>::kScales[]; +constexpr float DCTResampleScales<8, 1>::kScales[]; +constexpr float DCTResampleScales<16, 2>::kScales[]; +constexpr float DCTResampleScales<32, 4>::kScales[]; +constexpr float DCTResampleScales<64, 8>::kScales[]; +constexpr float DCTResampleScales<128, 16>::kScales[]; +constexpr float DCTResampleScales<256, 32>::kScales[]; +constexpr float WcMultipliers<4>::kMultipliers[]; +constexpr float WcMultipliers<8>::kMultipliers[]; +constexpr float WcMultipliers<16>::kMultipliers[]; +constexpr float WcMultipliers<32>::kMultipliers[]; +constexpr float WcMultipliers<64>::kMultipliers[]; +constexpr float WcMultipliers<128>::kMultipliers[]; +constexpr float WcMultipliers<256>::kMultipliers[]; + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.h new file mode 100644 index 0000000000..9ec670aedc --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_scales.h @@ -0,0 +1,390 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DCT_SCALES_H_ +#define LIB_JXL_DCT_SCALES_H_ + +// Scaling factors. + +#include + +namespace jxl { +template +struct square_root { + static constexpr float value = square_root::value * 2; +}; + +template <> +struct square_root<1> { + static constexpr float value = 1.0f; +}; + +template <> +struct square_root<2> { + static constexpr float value = 1.4142135623730951f; +}; + +// For n != 0, the n-th basis function of a N-DCT, evaluated in pixel k, has a +// value of cos((k+1/2) n/(2N) pi). 
When downsampling by 2x, we average +// the values for pixel k and k+1 to get the value for pixel (k/2), thus we get +// +// [cos((k+1/2) n/N pi) + cos((k+3/2) n/N pi)]/2 = +// cos(n/(2N) pi) cos((k+1) n/N pi) = +// cos(n/(2N) pi) cos(((k/2)+1/2) n/(N/2) pi) +// +// which is exactly the same as the value of pixel k/2 of a N/2-sized DCT, +// except for the cos(n/(2N) pi) scaling factor (which does *not* +// depend on the pixel). Thus, when using the lower-frequency coefficients of a +// DCT-N to compute a DCT-(N/2), they should be scaled by this constant. Scaling +// factors for a DCT-(N/4) etc can then be obtained by successive +// multiplications. The structs below contain the above-mentioned scaling +// factors. +// +// Python code for the tables below: +// +// for i in range(N // 8): +// v = math.cos(i / (2 * N) * math.pi) +// v *= math.cos(i / (N) * math.pi) +// v *= math.cos(i / (N / 2) * math.pi) +// print(v, end=", ") + +template +struct DCTResampleScales; + +template <> +struct DCTResampleScales<8, 1> { + static constexpr float kScales[] = { + 1.000000000000000000, + }; +}; + +template <> +struct DCTResampleScales<16, 2> { + static constexpr float kScales[] = { + 1.000000000000000000, + 0.901764195028874394, + }; +}; + +template <> +struct DCTResampleScales<32, 4> { + static constexpr float kScales[] = { + 1.000000000000000000, + 0.974886821136879522, + 0.901764195028874394, + 0.787054918159101335, + }; +}; + +template <> +struct DCTResampleScales<64, 8> { + static constexpr float kScales[] = { + 1.0000000000000000, 0.9936866130906366, 0.9748868211368796, + 0.9440180941651672, 0.9017641950288744, 0.8490574973847023, + 0.7870549181591013, 0.7171081282466044, + }; +}; + +template <> +struct DCTResampleScales<128, 16> { + static constexpr float kScales[] = { + 1.0, + 0.9984194528776054, + 0.9936866130906366, + 0.9858278282666936, + 0.9748868211368796, + 0.9609244059440204, + 0.9440180941651672, + 0.9242615922757944, + 0.9017641950288744, + 0.8766500784429904, 
+ 0.8490574973847023, + 0.8191378932865928, + 0.7870549181591013, + 0.7529833816270532, + 0.7171081282466044, + 0.6796228528314651, + }; +}; + +template <> +struct DCTResampleScales<256, 32> { + static constexpr float kScales[] = { + 1.0, + 0.9996047255830407, + 0.9984194528776054, + 0.9964458326264695, + 0.9936866130906366, + 0.9901456355893141, + 0.9858278282666936, + 0.9807391980963174, + 0.9748868211368796, + 0.9682788310563117, + 0.9609244059440204, + 0.9528337534340876, + 0.9440180941651672, + 0.9344896436056892, + 0.9242615922757944, + 0.913348084400198, + 0.9017641950288744, + 0.8895259056651056, + 0.8766500784429904, + 0.8631544288990163, + 0.8490574973847023, + 0.8343786191696513, + 0.8191378932865928, + 0.8033561501721485, + 0.7870549181591013, + 0.7702563888779096, + 0.7529833816270532, + 0.7352593067735488, + 0.7171081282466044, + 0.6985543251889097, + 0.6796228528314651, + 0.6603391026591464, + }; +}; + +// Inverses of the above. +template <> +struct DCTResampleScales<1, 8> { + static constexpr float kScales[] = { + 1.000000000000000000, + }; +}; + +template <> +struct DCTResampleScales<2, 16> { + static constexpr float kScales[] = { + 1.000000000000000000, + 1.108937353592731823, + }; +}; + +template <> +struct DCTResampleScales<4, 32> { + static constexpr float kScales[] = { + 1.000000000000000000, + 1.025760096781116015, + 1.108937353592731823, + 1.270559368765487251, + }; +}; + +template <> +struct DCTResampleScales<8, 64> { + static constexpr float kScales[] = { + 1.0000000000000000, 1.0063534990068217, 1.0257600967811158, + 1.0593017296817173, 1.1089373535927318, 1.1777765381970435, + 1.2705593687654873, 1.3944898413647777, + }; +}; + +template <> +struct DCTResampleScales<16, 128> { + static constexpr float kScales[] = { + 1.0, + 1.0015830492062623, + 1.0063534990068217, + 1.0143759095928793, + 1.0257600967811158, + 1.0406645869480142, + 1.0593017296817173, + 1.0819447744633812, + 1.1089373535927318, + 1.1407059950032632, + 1.1777765381970435, 
+ 1.2207956782315876, + 1.2705593687654873, + 1.3280505578213306, + 1.3944898413647777, + 1.4714043176061107, + }; +}; + +template <> +struct DCTResampleScales<32, 256> { + static constexpr float kScales[] = { + 1.0, + 1.0003954307206069, + 1.0015830492062623, + 1.0035668445360069, + 1.0063534990068217, + 1.009952439375063, + 1.0143759095928793, + 1.0196390660647288, + 1.0257600967811158, + 1.0327603660498115, + 1.0406645869480142, + 1.049501024072585, + 1.0593017296817173, + 1.0701028169146336, + 1.0819447744633812, + 1.0948728278734026, + 1.1089373535927318, + 1.124194353004584, + 1.1407059950032632, + 1.158541237256391, + 1.1777765381970435, + 1.1984966740820495, + 1.2207956782315876, + 1.244777922949508, + 1.2705593687654873, + 1.2982690107339132, + 1.3280505578213306, + 1.3600643892400104, + 1.3944898413647777, + 1.4315278911623237, + 1.4714043176061107, + 1.5143734423314616, + }; +}; + +// Constants for DCT implementation. Generated by the following snippet: +// for i in range(N // 2): +// print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ") +template +struct WcMultipliers; + +template <> +struct WcMultipliers<4> { + static constexpr float kMultipliers[] = { + 0.541196100146197, + 1.3065629648763764, + }; +}; + +template <> +struct WcMultipliers<8> { + static constexpr float kMultipliers[] = { + 0.5097955791041592, + 0.6013448869350453, + 0.8999762231364156, + 2.5629154477415055, + }; +}; + +template <> +struct WcMultipliers<16> { + static constexpr float kMultipliers[] = { + 0.5024192861881557, 0.5224986149396889, 0.5669440348163577, + 0.6468217833599901, 0.7881546234512502, 1.060677685990347, + 1.7224470982383342, 5.101148618689155, + }; +}; + +template <> +struct WcMultipliers<32> { + static constexpr float kMultipliers[] = { + 0.5006029982351963, 0.5054709598975436, 0.5154473099226246, + 0.5310425910897841, 0.5531038960344445, 0.5829349682061339, + 0.6225041230356648, 0.6748083414550057, 0.7445362710022986, + 0.8393496454155268, 
0.9725682378619608, 1.1694399334328847, + 1.4841646163141662, 2.057781009953411, 3.407608418468719, + 10.190008123548033, + }; +}; +template <> +struct WcMultipliers<64> { + static constexpr float kMultipliers[] = { + 0.500150636020651, 0.5013584524464084, 0.5037887256810443, + 0.5074711720725553, 0.5124514794082247, 0.5187927131053328, + 0.52657731515427, 0.535909816907992, 0.5469204379855088, + 0.5597698129470802, 0.57465518403266, 0.5918185358574165, + 0.6115573478825099, 0.6342389366884031, 0.6603198078137061, + 0.6903721282002123, 0.7251205223771985, 0.7654941649730891, + 0.8127020908144905, 0.8683447152233481, 0.9345835970364075, + 1.0144082649970547, 1.1120716205797176, 1.233832737976571, + 1.3892939586328277, 1.5939722833856311, 1.8746759800084078, + 2.282050068005162, 2.924628428158216, 4.084611078129248, + 6.796750711673633, 20.373878167231453, + }; +}; +template <> +struct WcMultipliers<128> { + static constexpr float kMultipliers[] = { + 0.5000376519155477, 0.5003390374428216, 0.5009427176380873, + 0.5018505174842379, 0.5030651913013697, 0.5045904432216454, + 0.5064309549285542, 0.5085924210498143, 0.5110815927066812, + 0.5139063298475396, 0.5170756631334912, 0.5205998663018917, + 0.524490540114724, 0.5287607092074876, 0.5334249333971333, + 0.538499435291984, 0.5440022463817783, 0.549953374183236, + 0.5563749934898856, 0.5632916653417023, 0.5707305880121454, + 0.5787218851348208, 0.5872989370937893, 0.5964987630244563, + 0.606362462272146, 0.6169357260050706, 0.6282694319707711, + 0.6404203382416639, 0.6534518953751283, 0.6674352009263413, + 0.6824501259764195, 0.6985866506472291, 0.7159464549705746, + 0.7346448236478627, 0.7548129391165311, 0.776600658233963, + 0.8001798956216941, 0.8257487738627852, 0.8535367510066064, + 0.8838110045596234, 0.9168844461846523, 0.9531258743921193, + 0.9929729612675466, 1.036949040910389, 1.0856850642580145, + 1.1399486751015042, 1.2006832557294167, 1.2690611716991191, + 1.346557628206286, 1.4350550884414341, 
1.5369941008524954, + 1.6555965242641195, 1.7952052190778898, 1.961817848571166, + 2.163957818751979, 2.4141600002500763, 2.7316450287739396, + 3.147462191781909, 3.7152427383269746, 4.5362909369693565, + 5.827688377844654, 8.153848602466814, 13.58429025728446, + 40.744688103351834, + }; +}; + +template <> +struct WcMultipliers<256> { + static constexpr float kMultipliers[128] = { + 0.5000094125358878, 0.500084723455784, 0.5002354020255269, + 0.5004615618093246, 0.5007633734146156, 0.5011410648064231, + 0.5015949217281668, 0.502125288230386, 0.5027325673091954, + 0.5034172216566842, 0.5041797745258774, 0.5050208107132756, + 0.5059409776624396, 0.5069409866925212, 0.5080216143561264, + 0.509183703931388, 0.5104281670536573, 0.5117559854927805, + 0.5131682130825206, 0.5146659778093218, 0.516250484068288, + 0.5179230150949777, 0.5196849355823947, 0.5215376944933958, + 0.5234828280796439, 0.52552196311921, 0.5276568203859896, + 0.5298892183652453, 0.5322210772308335, 0.5346544231010253, + 0.537191392591309, 0.5398342376841637, 0.5425853309375497, + 0.545447171055775, 0.5484223888484947, 0.551513753605893, + 0.554724179920619, 0.5580567349898085, 0.5615146464335654, + 0.5651013106696203, 0.5688203018875696, 0.5726753816701664, + 0.5766705093136241, 0.5808098529038624, 0.5850978012111273, + 0.58953897647151, 0.5941382481306648, 0.5989007476325463, + 0.6038318843443582, 0.6089373627182432, 0.614223200800649, + 0.6196957502119484, 0.6253617177319102, 0.6312281886412079, + 0.6373026519855411, 0.6435930279473415, 0.6501076975307724, + 0.6568555347890955, 0.6638459418498757, 0.6710888870233562, + 0.6785949463131795, 0.6863753486870501, 0.6944420255086364, + 0.7028076645818034, 0.7114857693151208, 0.7204907235796304, + 0.7298378629074134, 0.7395435527641373, 0.749625274727372, + 0.7601017215162176, 0.7709929019493761, 0.7823202570613161, + 0.7941067887834509, 0.8063772028037925, 0.8191580674598145, + 0.83247799080191, 0.8463678182968619, 0.860860854031955, + 
0.8759931087426972, 0.8918035785352535, 0.9083345588266809, + 0.9256319988042384, 0.9437459026371479, 0.962730784794803, + 0.9826461881778968, 1.0035572754078206, 1.0255355056139732, + 1.048659411496106, 1.0730154944316674, 1.0986992590905857, + 1.1258164135986009, 1.1544842669978943, 1.184833362908442, + 1.217009397314603, 1.2511754798461228, 1.287514812536712, + 1.326233878832723, 1.3675662599582539, 1.411777227500661, + 1.459169302866857, 1.5100890297227016, 1.5649352798258847, + 1.6241695131835794, 1.6883285509131505, 1.7580406092704062, + 1.8340456094306077, 1.9172211551275689, 2.0086161135167564, + 2.1094945286246385, 2.22139377701127, 2.346202662531156, + 2.486267909203593, 2.644541877144861, 2.824791402350551, + 3.0318994541759925, 3.2723115884254845, 3.5547153325075804, + 3.891107790700307, 4.298537526449054, 4.802076008665048, + 5.440166215091329, 6.274908408039339, 7.413566756422303, + 9.058751453879703, 11.644627325175037, 16.300023088031555, + 27.163977662448232, 81.48784219222516, + }; +}; + +// Apply the DCT algorithm-intrinsic constants to DCTResampleScale. +template +constexpr float DCTTotalResampleScale(size_t x) { + return DCTResampleScales::kScales[x]; +} + +} // namespace jxl + +#endif // LIB_JXL_DCT_SCALES_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_test.cc new file mode 100644 index 0000000000..a51a3178c9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_test.cc @@ -0,0 +1,390 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dct_test.cc" +#include +#include +#include + +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct-inl.h" +#include "lib/jxl/dct_for_test.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/image.h" +#include "lib/jxl/test_utils.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// Computes the in-place NxN DCT of block. +// Requires that block is HWY_ALIGN'ed. +// +// Performs ComputeTransposedScaledDCT and then transposes and scales it to +// obtain "vanilla" DCT. +template +void ComputeDCT(float block[N * N]) { + HWY_ALIGN float tmp_block[N * N]; + HWY_ALIGN float scratch_space[N * N]; + ComputeTransposedScaledDCT()(DCTFrom(block, N), tmp_block, scratch_space); + + // Untranspose. + Transpose::Run(DCTFrom(tmp_block, N), DCTTo(block, N)); +} + +// Computes the in-place NxN iDCT of block. +// Requires that block is HWY_ALIGN'ed. +template +void ComputeIDCT(float block[N * N]) { + HWY_ALIGN float tmp_block[N * N]; + HWY_ALIGN float scratch_space[N * N]; + // Untranspose.
+ Transpose::Run(DCTFrom(block, N), DCTTo(tmp_block, N)); + + ComputeTransposedScaledIDCT()(tmp_block, DCTTo(block, N), scratch_space); +} + +template +void TransposeTestT(float accuracy) { + constexpr size_t kBlockSize = N * N; + HWY_ALIGN float src[kBlockSize]; + DCTTo to_src(src, N); + for (size_t y = 0; y < N; ++y) { + for (size_t x = 0; x < N; ++x) { + to_src.Write(y * N + x, y, x); + } + } + HWY_ALIGN float dst[kBlockSize]; + Transpose::Run(DCTFrom(src, N), DCTTo(dst, N)); + DCTFrom from_dst(dst, N); + for (size_t y = 0; y < N; ++y) { + for (size_t x = 0; x < N; ++x) { + float expected = x * N + y; + float actual = from_dst.Read(y, x); + EXPECT_NEAR(expected, actual, accuracy) << "x = " << x << ", y = " << y; + } + } +} + +void TransposeTest() { + TransposeTestT<8>(1e-7f); + TransposeTestT<16>(1e-7f); + TransposeTestT<32>(1e-7f); +} + +template +void ColumnDctRoundtripT(float accuracy) { + constexpr size_t kBlockSize = N * N; + // Though we are only interested in single column result, dct.h has built-in + // limit on minimal number of columns processed. So, to be safe, we do + // regular NxN block transformation. On the bright side - we could check all + // N basis vectors at once. + HWY_ALIGN float block[kBlockSize]; + DCTTo to(block, N); + DCTFrom from(block, N); + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < N; ++j) { + to.Write((i == j) ? 1.0f : 0.0f, i, j); + } + } + + // Running (I)DCT on the same memory block seems to trigger a compiler bug on + // ARMv7 with clang6. + HWY_ALIGN float tmp[kBlockSize]; + DCTTo to_tmp(tmp, N); + DCTFrom from_tmp(tmp, N); + + DCT1D()(from, to_tmp); + IDCT1D()(from_tmp, to); + + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < N; ++j) { + float expected = (i == j) ?
1.0f : 0.0f; + float actual = from.Read(i, j); + EXPECT_NEAR(expected, actual, accuracy) << " i=" << i << ", j=" << j; + } + } +} + +void ColumnDctRoundtrip() { + ColumnDctRoundtripT<8>(1e-6f); + ColumnDctRoundtripT<16>(1e-6f); + ColumnDctRoundtripT<32>(1e-6f); +} + +template +void TestDctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) { + constexpr size_t kBlockSize = N * N; + for (size_t i = start; i < end; i++) { + HWY_ALIGN float fast[kBlockSize] = {0.0f}; + double slow[kBlockSize] = {0.0}; + fast[i] = 1.0; + slow[i] = 1.0; + DCTSlow(slow); + ComputeDCT(fast); + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(fast[k], slow[k], accuracy / N) + << "i = " << i << ", k = " << k << ", N = " << N; + } + } +} + +template +void TestIdctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) { + constexpr size_t kBlockSize = N * N; + for (size_t i = start; i < end; i++) { + HWY_ALIGN float fast[kBlockSize] = {0.0f}; + double slow[kBlockSize] = {0.0}; + fast[i] = 1.0; + slow[i] = 1.0; + IDCTSlow(slow); + ComputeIDCT(fast); + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(fast[k], slow[k], accuracy * N) + << "i = " << i << ", k = " << k << ", N = " << N; + } + } +} + +template +void TestInverseT(float accuracy) { + ThreadPoolInternal pool(N < 32 ? 0 : 8); + enum { kBlockSize = N * N }; + RunOnPool( + &pool, 0, kBlockSize, ThreadPool::SkipInit(), + [accuracy](const int task, int /*thread*/) { + const size_t i = static_cast(task); + HWY_ALIGN float x[kBlockSize] = {0.0f}; + x[i] = 1.0; + + ComputeIDCT(x); + ComputeDCT(x); + + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(x[k], (k == i) ? 
1.0f : 0.0f, accuracy) + << "i = " << i << ", k = " << k; + } + }, + "TestInverse"); +} + +void InverseTest() { + TestInverseT<8>(1e-6f); + TestInverseT<16>(1e-6f); + TestInverseT<32>(3e-6f); +} + +template +void TestDctTranspose(float accuracy, size_t start = 0, size_t end = N * N) { + constexpr size_t kBlockSize = N * N; + for (size_t i = start; i < end; i++) { + for (size_t j = 0; j < kBlockSize; ++j) { + // We check that = . + // That means (Me_j)_i = (M^\dagger{}e_i)_j + + // x := Me_j + HWY_ALIGN float x[kBlockSize] = {0.0f}; + x[j] = 1.0; + ComputeIDCT(x); + // y := M^\dagger{}e_i + HWY_ALIGN float y[kBlockSize] = {0.0f}; + y[i] = 1.0; + ComputeDCT(y); + + EXPECT_NEAR(x[i] / N, y[j] * N, accuracy) << "i = " << i << ", j = " << j; + } + } +} + +template +void TestSlowInverse(float accuracy, size_t start = 0, size_t end = N * N) { + constexpr size_t kBlockSize = N * N; + for (size_t i = start; i < end; i++) { + double x[kBlockSize] = {0.0f}; + x[i] = 1.0; + + DCTSlow(x); + IDCTSlow(x); + + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(x[k], (k == i) ? 1.0f : 0.0f, accuracy) + << "i = " << i << ", k = " << k; + } + } +} + +template +void TestRectInverseT(float accuracy) { + constexpr size_t kBlockSize = ROWS * COLS; + for (size_t i = 0; i < kBlockSize; ++i) { + HWY_ALIGN float x[kBlockSize] = {0.0f}; + HWY_ALIGN float out[kBlockSize] = {0.0f}; + x[i] = 1.0; + HWY_ALIGN float coeffs[kBlockSize] = {0.0f}; + HWY_ALIGN float scratch_space[kBlockSize * 2]; + + ComputeScaledDCT()(DCTFrom(x, COLS), coeffs, scratch_space); + ComputeScaledIDCT()(coeffs, DCTTo(out, COLS), scratch_space); + + for (size_t k = 0; k < kBlockSize; ++k) { + EXPECT_NEAR(out[k], (k == i) ? 
1.0f : 0.0f, accuracy) + << "i = " << i << ", k = " << k << " ROWS = " << ROWS + << " COLS = " << COLS; + } + } +} + +void TestRectInverse() { + TestRectInverseT<16, 32>(1e-6f); + TestRectInverseT<8, 32>(1e-6f); + TestRectInverseT<8, 16>(1e-6f); + TestRectInverseT<4, 8>(1e-6f); + TestRectInverseT<2, 4>(1e-6f); + TestRectInverseT<1, 4>(1e-6f); + TestRectInverseT<1, 2>(1e-6f); + + TestRectInverseT<32, 16>(1e-6f); + TestRectInverseT<32, 8>(1e-6f); + TestRectInverseT<16, 8>(1e-6f); + TestRectInverseT<8, 4>(1e-6f); + TestRectInverseT<4, 2>(1e-6f); + TestRectInverseT<4, 1>(1e-6f); + TestRectInverseT<2, 1>(1e-6f); +} + +template +void TestRectTransposeT(float accuracy) { + constexpr size_t kBlockSize = ROWS * COLS; + HWY_ALIGN float scratch_space[kBlockSize * 2]; + for (size_t px = 0; px < COLS; ++px) { + for (size_t py = 0; py < ROWS; ++py) { + HWY_ALIGN float x1[kBlockSize] = {0.0f}; + HWY_ALIGN float x2[kBlockSize] = {0.0f}; + HWY_ALIGN float coeffs1[kBlockSize] = {0.0f}; + HWY_ALIGN float coeffs2[kBlockSize] = {0.0f}; + x1[py * COLS + px] = 1; + x2[px * ROWS + py] = 1; + + constexpr size_t OUT_ROWS = ROWS < COLS ? ROWS : COLS; + constexpr size_t OUT_COLS = ROWS < COLS ? 
COLS : ROWS; + + ComputeScaledDCT()(DCTFrom(x1, COLS), coeffs1, scratch_space); + ComputeScaledDCT()(DCTFrom(x2, ROWS), coeffs2, scratch_space); + + for (size_t x = 0; x < OUT_COLS; ++x) { + for (size_t y = 0; y < OUT_ROWS; ++y) { + EXPECT_NEAR(coeffs1[y * OUT_COLS + x], coeffs2[y * OUT_COLS + x], + accuracy) + << " px = " << px << ", py = " << py << ", x = " << x + << ", y = " << y; + } + } + } + } +} + +void TestRectTranspose() { + TestRectTransposeT<16, 32>(1e-6f); + TestRectTransposeT<8, 32>(1e-6f); + TestRectTransposeT<8, 16>(1e-6f); + TestRectTransposeT<4, 8>(1e-6f); + TestRectTransposeT<2, 4>(1e-6f); + TestRectTransposeT<1, 4>(1e-6f); + TestRectTransposeT<1, 2>(1e-6f); + + // Identical to 8, 16 + // TestRectTranspose<16, 8>(1e-6f); +} + +void TestDctAccuracyShard(size_t shard) { + if (shard == 0) { + TestDctAccuracy<1>(1.1E-7f); + TestDctAccuracy<2>(1.1E-7f); + TestDctAccuracy<4>(1.1E-7f); + TestDctAccuracy<8>(1.1E-7f); + TestDctAccuracy<16>(1.3E-7f); + } + TestDctAccuracy<32>(1.1E-7f, 32 * shard, 32 * (shard + 1)); +} + +void TestIdctAccuracyShard(size_t shard) { + if (shard == 0) { + TestIdctAccuracy<1>(1E-7f); + TestIdctAccuracy<2>(1E-7f); + TestIdctAccuracy<4>(1E-7f); + TestIdctAccuracy<8>(1E-7f); + TestIdctAccuracy<16>(1E-7f); + } + TestIdctAccuracy<32>(1E-7f, 32 * shard, 32 * (shard + 1)); +} + +void TestDctTransposeShard(size_t shard) { + if (shard == 0) { + TestDctTranspose<8>(1E-6f); + TestDctTranspose<16>(1E-6f); + } + TestDctTranspose<32>(3E-6f, 32 * shard, 32 * (shard + 1)); +} + +void TestSlowInverseShard(size_t shard) { + if (shard == 0) { + TestSlowInverse<1>(1E-5f); + TestSlowInverse<2>(1E-5f); + TestSlowInverse<4>(1E-5f); + TestSlowInverse<8>(1E-5f); + TestSlowInverse<16>(1E-5f); + } + TestSlowInverse<32>(1E-5f, 32 * shard, 32 * (shard + 1)); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class TransposeTest : public 
hwy::TestWithParamTarget {}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(TransposeTest); + +HWY_EXPORT_AND_TEST_P(TransposeTest, TransposeTest); +HWY_EXPORT_AND_TEST_P(TransposeTest, InverseTest); +HWY_EXPORT_AND_TEST_P(TransposeTest, ColumnDctRoundtrip); +HWY_EXPORT_AND_TEST_P(TransposeTest, TestRectInverse); +HWY_EXPORT_AND_TEST_P(TransposeTest, TestRectTranspose); + +// Tests in the DctShardedTest class are sharded for N=32. +class DctShardedTest : public ::hwy::TestWithParamTargetAndT {}; + +std::vector ShardRange(uint32_t n) { +#ifdef JXL_DISABLE_SLOW_TESTS + JXL_ASSERT(n > 6); + std::vector ret = {0, 1, 3, 5, n - 1}; +#else + std::vector ret(n); + std::iota(ret.begin(), ret.end(), 0); +#endif // JXL_DISABLE_SLOW_TESTS + return ret; +} + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(DctShardedTest, + ::testing::ValuesIn(ShardRange(32))); + +HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestDctAccuracyShard); +HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestIdctAccuracyShard); +HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestDctTransposeShard); +HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestSlowInverseShard); + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_util.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_util.h new file mode 100644 index 0000000000..fb6ce3b971 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dct_util.h @@ -0,0 +1,86 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_DCT_UTIL_H_ +#define LIB_JXL_DCT_UTIL_H_ + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +union ACPtr { + int32_t* ptr32; + int16_t* ptr16; + ACPtr() = default; + explicit ACPtr(int16_t* p) : ptr16(p) {} + explicit ACPtr(int32_t* p) : ptr32(p) {} +}; + +union ConstACPtr { + const int32_t* ptr32; + const int16_t* ptr16; + ConstACPtr() = default; + explicit ConstACPtr(const int16_t* p) : ptr16(p) {} + explicit ConstACPtr(const int32_t* p) : ptr32(p) {} +}; + +enum class ACType { k16 = 0, k32 = 1 }; + +class ACImage { + public: + virtual ~ACImage() = default; + virtual ACType Type() const = 0; + virtual ACPtr PlaneRow(size_t c, size_t y, size_t xbase) = 0; + virtual ConstACPtr PlaneRow(size_t c, size_t y, size_t xbase) const = 0; + virtual size_t PixelsPerRow() const = 0; + virtual void ZeroFill() = 0; + virtual void ZeroFillPlane(size_t c) = 0; + virtual bool IsEmpty() const = 0; +}; + +template +class ACImageT final : public ACImage { + public: + ACImageT() = default; + ACImageT(size_t xsize, size_t ysize) { + static_assert( + std::is_same::value || std::is_same::value, + "ACImage must be either 32- or 16- bit"); + img_ = Image3(xsize, ysize); + } + ACType Type() const override { + return sizeof(T) == 2 ? 
ACType::k16 : ACType::k32; + } + ACPtr PlaneRow(size_t c, size_t y, size_t xbase) override { + return ACPtr(img_.PlaneRow(c, y) + xbase); + } + ConstACPtr PlaneRow(size_t c, size_t y, size_t xbase) const override { + return ConstACPtr(img_.PlaneRow(c, y) + xbase); + } + + size_t PixelsPerRow() const override { return img_.PixelsPerRow(); } + + void ZeroFill() override { ZeroFillImage(&img_); } + + void ZeroFillPlane(size_t c) override { ZeroFillImage(&img_.Plane(c)); } + + bool IsEmpty() const override { + return img_.xsize() == 0 || img_.ysize() == 0; + } + + private: + Image3 img_; +}; + +} // namespace jxl + +#endif // LIB_JXL_DCT_UTIL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc new file mode 100644 index 0000000000..06709d7404 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.cc @@ -0,0 +1,375 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_ans.h" + +#include + +#include + +#include "lib/jxl/ans_common.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_context_map.h" +#include "lib/jxl/fields.h" + +namespace jxl { +namespace { + +// Decodes a number in the range [0..255], by reading 1 - 11 bits. +inline int DecodeVarLenUint8(BitReader* input) { + if (input->ReadFixedBits<1>()) { + int nbits = static_cast(input->ReadFixedBits<3>()); + if (nbits == 0) { + return 1; + } else { + return static_cast(input->ReadBits(nbits)) + (1 << nbits); + } + } + return 0; +} + +// Decodes a number in the range [0..65535], by reading 1 - 21 bits. 
+inline int DecodeVarLenUint16(BitReader* input) {
+  if (input->ReadFixedBits<1>()) {
+    int nbits = static_cast<int>(input->ReadFixedBits<4>());
+    if (nbits == 0) {
+      return 1;
+    } else {
+      return static_cast<int>(input->ReadBits(nbits)) + (1 << nbits);
+    }
+  }
+  return 0;
+}
+
+// Decodes one symbol histogram from |input| into |*counts|; the counts are
+// scaled to a total of 1 << precision_bits. Fails on corrupt input.
+Status ReadHistogram(int precision_bits, std::vector<int32_t>* counts,
+                     BitReader* input) {
+  int simple_code = input->ReadBits(1);
+  if (simple_code == 1) {  // one or two explicitly listed symbols
+    int i;
+    int symbols[2] = {0};
+    int max_symbol = 0;
+    const int num_symbols = input->ReadBits(1) + 1;
+    for (i = 0; i < num_symbols; ++i) {
+      symbols[i] = DecodeVarLenUint8(input);
+      if (symbols[i] > max_symbol) max_symbol = symbols[i];
+    }
+    counts->resize(max_symbol + 1);
+    if (num_symbols == 1) {
+      (*counts)[symbols[0]] = 1 << precision_bits;
+    } else {
+      if (symbols[0] == symbols[1]) {  // corrupt data
+        return false;
+      }
+      (*counts)[symbols[0]] = input->ReadBits(precision_bits);
+      (*counts)[symbols[1]] = (1 << precision_bits) - (*counts)[symbols[0]];
+    }
+  } else {
+    int is_flat = input->ReadBits(1);
+    if (is_flat == 1) {
+      int alphabet_size = DecodeVarLenUint8(input) + 1;
+      if (alphabet_size == 0) {  // defensive: always >= 1 here
+        return JXL_FAILURE("Invalid alphabet size for flat histogram.");
+      }
+      *counts = CreateFlatHistogram(alphabet_size, 1 << precision_bits);
+      return true;
+    }
+
+    uint32_t shift;
+    {
+      // TODO(veluca): speed up reading with table lookups.
+      int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1);
+      int log = 0;
+      for (; log < upper_bound_log; log++) {
+        if (input->ReadFixedBits<1>() == 0) break;
+      }
+      shift = (input->ReadBits(log) | (1 << log)) - 1;
+      if (shift > ANS_LOG_TAB_SIZE + 1) {
+        return JXL_FAILURE("Invalid shift value");
+      }
+    }
+
+    int length = DecodeVarLenUint8(input) + 3;
+    counts->resize(length);
+    int total_count = 0;
+
+    static const uint8_t huff[128][2] = {  // {bits consumed, logcount symbol}
+        {3, 10}, {7, 12}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {6, 11}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {7, 13}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {6, 11}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+    };
+
+    std::vector<int> logcounts(counts->size());
+    int omit_log = -1;
+    int omit_pos = -1;
+    // This array remembers which symbols have an RLE length.
+    std::vector<int> same(counts->size(), 0);
+    for (size_t i = 0; i < logcounts.size(); ++i) {
+      input->Refill();  // for PeekFixedBits + Advance
+      int idx = input->PeekFixedBits<7>();
+      input->Consume(huff[idx][0]);
+      logcounts[i] = huff[idx][1];
+      // The RLE symbol.
+      if (logcounts[i] == ANS_LOG_TAB_SIZE + 1) {
+        int rle_length = DecodeVarLenUint8(input);
+        same[i] = rle_length + 5;
+        i += rle_length + 3;
+        continue;
+      }
+      if (logcounts[i] > omit_log) {
+        omit_log = logcounts[i];
+        omit_pos = i;
+      }
+    }
+    // Invalid input: nothing to omit, or an RLE symbol right after omit_pos.
+    if (omit_pos < 0) return JXL_FAILURE("Invalid histogram.");
+    if (static_cast<size_t>(omit_pos) + 1 < logcounts.size() &&
+        logcounts[omit_pos + 1] == ANS_LOG_TAB_SIZE + 1) {
+      return JXL_FAILURE("Invalid histogram.");
+    }
+    int prev = 0;
+    int numsame = 0;
+    for (size_t i = 0; i < logcounts.size(); ++i) {
+      if (same[i]) {
+        // RLE run: copy the preceding count into the next same[i] - 1 entries.
+        numsame = same[i] - 1;
+        prev = i > 0 ? (*counts)[i - 1] : 0;
+      }
+      if (numsame > 0) {
+        (*counts)[i] = prev;
+        numsame--;
+      } else {
+        int code = logcounts[i];
+        // omit_pos may not be negative at this point (checked before).
+        if (i == static_cast<size_t>(omit_pos)) {
+          continue;
+        } else if (code == 0) {
+          continue;
+        } else if (code == 1) {
+          (*counts)[i] = 1;
+        } else {
+          int bitcount = GetPopulationCountPrecision(code - 1, shift);
+          (*counts)[i] = (1 << (code - 1)) +
+                         (input->ReadBits(bitcount) << (code - 1 - bitcount));
+        }
+      }
+      total_count += (*counts)[i];
+    }
+    (*counts)[omit_pos] = (1 << precision_bits) - total_count;
+    if ((*counts)[omit_pos] <= 0) {
+      // Explicit counts leave no room for the omitted symbol (it needs >= 1).
+      return JXL_FAILURE("Invalid histogram count.");
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status DecodeANSCodes(const size_t num_histograms, + const size_t max_alphabet_size, BitReader* in, + ANSCode* result) { + result->degenerate_symbols.resize(num_histograms, -1); + if (result->use_prefix_code) { + JXL_ASSERT(max_alphabet_size <= 1 << PREFIX_MAX_BITS); + result->huffman_data.resize(num_histograms); + std::vector alphabet_sizes(num_histograms); + for (size_t c = 0; c < num_histograms; c++) { + alphabet_sizes[c] = DecodeVarLenUint16(in) + 1; + if (alphabet_sizes[c] > max_alphabet_size) { + return JXL_FAILURE("Alphabet size is too long: %u", alphabet_sizes[c]); + } + } + for (size_t c = 0; c < num_histograms; c++) { + if (alphabet_sizes[c] > 1) { + if (!result->huffman_data[c].ReadFromBitStream(alphabet_sizes[c], in)) { + if (!in->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for huffman code"); + } + return JXL_FAILURE( + "Invalid huffman tree number %zu, alphabet size %u", c, + alphabet_sizes[c]); + } + } else { + // 0-bit codes does not require extension tables.
+ result->huffman_data[c].table_.clear(); + result->huffman_data[c].table_.resize(1u << kHuffmanTableBits); + } + for (const auto& h : result->huffman_data[c].table_) { + if (h.bits <= kHuffmanTableBits) { + result->UpdateMaxNumBits(c, h.value); + } + } + } + } else { + JXL_ASSERT(max_alphabet_size <= ANS_MAX_ALPHABET_SIZE); + result->alias_tables = + AllocateArray(num_histograms * (1 << result->log_alpha_size) * + sizeof(AliasTable::Entry)); + AliasTable::Entry* alias_tables = + reinterpret_cast(result->alias_tables.get()); + for (size_t c = 0; c < num_histograms; ++c) { + std::vector counts; + if (!ReadHistogram(ANS_LOG_TAB_SIZE, &counts, in)) { + return JXL_FAILURE("Invalid histogram bitstream."); + } + if (counts.size() > max_alphabet_size) { + return JXL_FAILURE("Alphabet size is too long: %zu", counts.size()); + } + while (!counts.empty() && counts.back() == 0) { + counts.pop_back(); + } + for (size_t s = 0; s < counts.size(); s++) { + if (counts[s] != 0) { + result->UpdateMaxNumBits(c, s); + } + } + // InitAliasTable "fixes" empty counts to contain degenerate "0" symbol. + int degenerate_symbol = counts.empty() ? 0 : (counts.size() - 1); + for (int s = 0; s < degenerate_symbol; ++s) { + if (counts[s] != 0) { + degenerate_symbol = -1; + break; + } + } + result->degenerate_symbols[c] = degenerate_symbol; + InitAliasTable(counts, ANS_TAB_SIZE, result->log_alpha_size, + alias_tables + c * (1 << result->log_alpha_size)); + } + } + return true; +} +Status DecodeUintConfig(size_t log_alpha_size, HybridUintConfig* uint_config, + BitReader* br) { + br->Refill(); + size_t split_exponent = br->ReadBits(CeilLog2Nonzero(log_alpha_size + 1)); + size_t msb_in_token = 0, lsb_in_token = 0; + if (split_exponent != log_alpha_size) { + // otherwise, msb/lsb don't matter. 
+ size_t nbits = CeilLog2Nonzero(split_exponent + 1); + msb_in_token = br->ReadBits(nbits); + if (msb_in_token > split_exponent) { + // This could be invalid here already and we need to check this before + // we use its value to read more bits. + return JXL_FAILURE("Invalid HybridUintConfig"); + } + nbits = CeilLog2Nonzero(split_exponent - msb_in_token + 1); + lsb_in_token = br->ReadBits(nbits); + } + if (lsb_in_token + msb_in_token > split_exponent) { + return JXL_FAILURE("Invalid HybridUintConfig"); + } + *uint_config = HybridUintConfig(split_exponent, msb_in_token, lsb_in_token); + return true; +} + +Status DecodeUintConfigs(size_t log_alpha_size, + std::vector* uint_config, + BitReader* br) { + // TODO(veluca): RLE? + for (size_t i = 0; i < uint_config->size(); i++) { + JXL_RETURN_IF_ERROR( + DecodeUintConfig(log_alpha_size, &(*uint_config)[i], br)); + } + return true; +} + +LZ77Params::LZ77Params() { Bundle::Init(this); } +Status LZ77Params::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &enabled)); + if (!visitor->Conditional(enabled)) return true; + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(224), Val(512), Val(4096), + BitsOffset(15, 8), 224, &min_symbol)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(3), Val(4), BitsOffset(2, 5), + BitsOffset(8, 9), 3, &min_length)); + return true; +} + +void ANSCode::UpdateMaxNumBits(size_t ctx, size_t symbol) { + HybridUintConfig* cfg = &uint_config[ctx]; + // LZ77 symbols use a different uint config. 
+ if (lz77.enabled && lz77.nonserialized_distance_context != ctx && + symbol >= lz77.min_symbol) { + symbol -= lz77.min_symbol; + cfg = &lz77.length_uint_config; + } + size_t split_token = cfg->split_token; + size_t msb_in_token = cfg->msb_in_token; + size_t lsb_in_token = cfg->lsb_in_token; + size_t split_exponent = cfg->split_exponent; + if (symbol < split_token) { + max_num_bits = std::max(max_num_bits, split_exponent); + return; + } + uint32_t n_extra_bits = + split_exponent - (msb_in_token + lsb_in_token) + + ((symbol - split_token) >> (msb_in_token + lsb_in_token)); + size_t total_bits = msb_in_token + lsb_in_token + n_extra_bits + 1; + max_num_bits = std::max(max_num_bits, total_bits); +} + +Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code, + std::vector* context_map, bool disallow_lz77) { + PROFILER_FUNC; + JXL_RETURN_IF_ERROR(Bundle::Read(br, &code->lz77)); + if (code->lz77.enabled) { + num_contexts++; + JXL_RETURN_IF_ERROR(DecodeUintConfig(/*log_alpha_size=*/8, + &code->lz77.length_uint_config, br)); + } + if (code->lz77.enabled && disallow_lz77) { + return JXL_FAILURE("Using LZ77 when explicitly disallowed"); + } + size_t num_histograms = 1; + context_map->resize(num_contexts); + if (num_contexts > 1) { + JXL_RETURN_IF_ERROR(DecodeContextMap(context_map, &num_histograms, br)); + } + code->lz77.nonserialized_distance_context = context_map->back(); + code->use_prefix_code = br->ReadFixedBits<1>(); + if (code->use_prefix_code) { + code->log_alpha_size = PREFIX_MAX_BITS; + } else { + code->log_alpha_size = br->ReadFixedBits<2>() + 5; + } + code->uint_config.resize(num_histograms); + JXL_RETURN_IF_ERROR( + DecodeUintConfigs(code->log_alpha_size, &code->uint_config, br)); + const size_t max_alphabet_size = 1 << code->log_alpha_size; + JXL_RETURN_IF_ERROR( + DecodeANSCodes(num_histograms, max_alphabet_size, br, code)); + // When using LZ77, flat codes might result in valid codestreams with + // histograms that potentially allow very 
large bit counts. + // TODO(veluca): in principle, a valid codestream might contain a histogram + // that could allow very large numbers of bits that is never used during ANS + // decoding. There's no benefit to doing that, though. + if (!code->lz77.enabled && code->max_num_bits > 32) { + // Just emit a warning as there are many opportunities for false positives. + JXL_WARNING("Histogram can represent numbers that are too large: %zu\n", + code->max_num_bits); + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.h new file mode 100644 index 0000000000..15273a8156 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_ans.h @@ -0,0 +1,432 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_ANS_H_ +#define LIB_JXL_DEC_ANS_H_ + +// Library to decode the ANS population counts from the bit-stream and build a +// decoding table from them. + +#include +#include + +#include +#include + +#include "lib/jxl/ans_common.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_huffman.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +class ANSSymbolReader; + +// Experiments show that best performance is typically achieved for a +// split-exponent of 3 or 4. Trend seems to be that '4' is better +// for large-ish pictures, and '3' better for rather small-ish pictures. +// This is plausible - the more special symbols we have, the better +// statistics we need to get a benefit out of them. 
+ +// Our hybrid-encoding scheme has dedicated tokens for the smallest +// (1 << split_exponents) numbers, and for the rest +// encodes (number of bits) + (msb_in_token sub-leading binary digits) + +// (lsb_in_token lowest binary digits) in the token, with the remaining bits +// then being encoded as data. +// +// Example with split_exponent = 4, msb_in_token = 2, lsb_in_token = 0. +// +// Numbers N in [0 .. 15]: +// These get represented as (token=N, bits=''). +// Numbers N >= 16: +// If n is such that 2**n <= N < 2**(n+1), +// and m = N - 2**n is the 'mantissa', +// these get represented as: +// (token=split_token + +// ((n - split_exponent) * 4) + +// (m >> (n - msb_in_token)), +// bits=m & (1 << (n - msb_in_token)) - 1) +// Specifically, we would get: +// N = 0 - 15: (token=N, nbits=0, bits='') +// N = 16 (10000): (token=16, nbits=2, bits='00') +// N = 17 (10001): (token=16, nbits=2, bits='01') +// N = 20 (10100): (token=17, nbits=2, bits='00') +// N = 24 (11000): (token=18, nbits=2, bits='00') +// N = 28 (11100): (token=19, nbits=2, bits='00') +// N = 32 (100000): (token=20, nbits=3, bits='000') +// N = 65535: (token=63, nbits=13, bits='1111111111111') +struct HybridUintConfig { + uint32_t split_exponent; + uint32_t split_token; + uint32_t msb_in_token; + uint32_t lsb_in_token; + JXL_INLINE void Encode(uint32_t value, uint32_t* JXL_RESTRICT token, + uint32_t* JXL_RESTRICT nbits, + uint32_t* JXL_RESTRICT bits) const { + if (value < split_token) { + *token = value; + *nbits = 0; + *bits = 0; + } else { + uint32_t n = FloorLog2Nonzero(value); + uint32_t m = value - (1 << n); + *token = split_token + + ((n - split_exponent) << (msb_in_token + lsb_in_token)) + + ((m >> (n - msb_in_token)) << lsb_in_token) + + (m & ((1 << lsb_in_token) - 1)); + *nbits = n - msb_in_token - lsb_in_token; + *bits = (value >> lsb_in_token) & ((1UL << *nbits) - 1); + } + } + + explicit HybridUintConfig(uint32_t split_exponent = 4, + uint32_t msb_in_token = 2, + uint32_t lsb_in_token = 0) 
+ : split_exponent(split_exponent), + split_token(1 << split_exponent), + msb_in_token(msb_in_token), + lsb_in_token(lsb_in_token) { + JXL_DASSERT(split_exponent >= msb_in_token + lsb_in_token); + } +}; + +struct LZ77Params : public Fields { + LZ77Params(); + const char* Name() const override { return "LZ77Params"; } + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + bool enabled; + + // Symbols above min_symbol use a special hybrid uint encoding and + // represent a length, to be added to min_length. + uint32_t min_symbol; + uint32_t min_length; + + // Not serialized by VisitFields. + HybridUintConfig length_uint_config{0, 0, 0}; + + size_t nonserialized_distance_context; +}; + +static constexpr size_t kWindowSize = 1 << 20; +static constexpr size_t kNumSpecialDistances = 120; +// Table of special distance codes from WebP lossless. +static constexpr int8_t kSpecialDistances[kNumSpecialDistances][2] = { + {0, 1}, {1, 0}, {1, 1}, {-1, 1}, {0, 2}, {2, 0}, {1, 2}, {-1, 2}, + {2, 1}, {-2, 1}, {2, 2}, {-2, 2}, {0, 3}, {3, 0}, {1, 3}, {-1, 3}, + {3, 1}, {-3, 1}, {2, 3}, {-2, 3}, {3, 2}, {-3, 2}, {0, 4}, {4, 0}, + {1, 4}, {-1, 4}, {4, 1}, {-4, 1}, {3, 3}, {-3, 3}, {2, 4}, {-2, 4}, + {4, 2}, {-4, 2}, {0, 5}, {3, 4}, {-3, 4}, {4, 3}, {-4, 3}, {5, 0}, + {1, 5}, {-1, 5}, {5, 1}, {-5, 1}, {2, 5}, {-2, 5}, {5, 2}, {-5, 2}, + {4, 4}, {-4, 4}, {3, 5}, {-3, 5}, {5, 3}, {-5, 3}, {0, 6}, {6, 0}, + {1, 6}, {-1, 6}, {6, 1}, {-6, 1}, {2, 6}, {-2, 6}, {6, 2}, {-6, 2}, + {4, 5}, {-4, 5}, {5, 4}, {-5, 4}, {3, 6}, {-3, 6}, {6, 3}, {-6, 3}, + {0, 7}, {7, 0}, {1, 7}, {-1, 7}, {5, 5}, {-5, 5}, {7, 1}, {-7, 1}, + {4, 6}, {-4, 6}, {6, 4}, {-6, 4}, {2, 7}, {-2, 7}, {7, 2}, {-7, 2}, + {3, 7}, {-3, 7}, {7, 3}, {-7, 3}, {5, 6}, {-5, 6}, {6, 5}, {-6, 5}, + {8, 0}, {4, 7}, {-4, 7}, {7, 4}, {-7, 4}, {8, 1}, {8, 2}, {6, 6}, + {-6, 6}, {8, 3}, {5, 7}, {-5, 7}, {7, 5}, {-7, 5}, {8, 4}, {6, 7}, + {-6, 7}, {7, 6}, {-7, 6}, {8, 5}, {7, 7}, {-7, 7}, {8, 6}, {8, 7}}; + +struct ANSCode { + 
CacheAlignedUniquePtr alias_tables; + std::vector huffman_data; + std::vector uint_config; + std::vector degenerate_symbols; + bool use_prefix_code; + uint8_t log_alpha_size; // for ANS. + LZ77Params lz77; + // Maximum number of bits necessary to represent the result of a + // ReadHybridUint call done with this ANSCode. + size_t max_num_bits = 0; + void UpdateMaxNumBits(size_t ctx, size_t symbol); +}; + +class ANSSymbolReader { + public: + // Invalid symbol reader, to be overwritten. + ANSSymbolReader() = default; + ANSSymbolReader(const ANSCode* code, BitReader* JXL_RESTRICT br, + size_t distance_multiplier = 0) + : alias_tables_( + reinterpret_cast(code->alias_tables.get())), + huffman_data_(code->huffman_data.data()), + use_prefix_code_(code->use_prefix_code), + configs(code->uint_config.data()) { + if (!use_prefix_code_) { + state_ = static_cast(br->ReadFixedBits<32>()); + log_alpha_size_ = code->log_alpha_size; + log_entry_size_ = ANS_LOG_TAB_SIZE - code->log_alpha_size; + entry_size_minus_1_ = (1 << log_entry_size_) - 1; + } else { + state_ = (ANS_SIGNATURE << 16u); + } + if (!code->lz77.enabled) return; + // a std::vector incurs unacceptable decoding speed loss because of + // initialization. + lz77_window_storage_ = AllocateArray(kWindowSize * sizeof(uint32_t)); + lz77_window_ = reinterpret_cast(lz77_window_storage_.get()); + lz77_ctx_ = code->lz77.nonserialized_distance_context; + lz77_length_uint_ = code->lz77.length_uint_config; + lz77_threshold_ = code->lz77.min_symbol; + lz77_min_length_ = code->lz77.min_length; + num_special_distances_ = + distance_multiplier == 0 ? 
0 : kNumSpecialDistances; + for (size_t i = 0; i < num_special_distances_; i++) { + int dist = kSpecialDistances[i][0]; + dist += static_cast(distance_multiplier) * kSpecialDistances[i][1]; + if (dist < 1) dist = 1; + special_distances_[i] = dist; + } + } + + JXL_INLINE size_t ReadSymbolANSWithoutRefill(const size_t histo_idx, + BitReader* JXL_RESTRICT br) { + const uint32_t res = state_ & (ANS_TAB_SIZE - 1u); + + const AliasTable::Entry* table = + &alias_tables_[histo_idx << log_alpha_size_]; + const AliasTable::Symbol symbol = + AliasTable::Lookup(table, res, log_entry_size_, entry_size_minus_1_); + state_ = symbol.freq * (state_ >> ANS_LOG_TAB_SIZE) + symbol.offset; + +#if 1 + // Branchless version is about equally fast on SKX. + const uint32_t new_state = + (state_ << 16u) | static_cast(br->PeekFixedBits<16>()); + const bool normalize = state_ < (1u << 16u); + state_ = normalize ? new_state : state_; + br->Consume(normalize ? 16 : 0); +#else + if (JXL_UNLIKELY(state_ < (1u << 16u))) { + state_ = (state_ << 16u) | br->PeekFixedBits<16>(); + br->Consume(16); + } +#endif + const uint32_t next_res = state_ & (ANS_TAB_SIZE - 1u); + AliasTable::Prefetch(table, next_res, log_entry_size_); + + return symbol.value; + } + + JXL_INLINE size_t ReadSymbolHuffWithoutRefill(const size_t histo_idx, + BitReader* JXL_RESTRICT br) { + return huffman_data_[histo_idx].ReadSymbol(br); + } + + JXL_INLINE size_t ReadSymbolWithoutRefill(const size_t histo_idx, + BitReader* JXL_RESTRICT br) { + // TODO(veluca): hoist if in hotter loops. 
+ if (JXL_UNLIKELY(use_prefix_code_)) { + return ReadSymbolHuffWithoutRefill(histo_idx, br); + } + return ReadSymbolANSWithoutRefill(histo_idx, br); + } + + JXL_INLINE size_t ReadSymbol(const size_t histo_idx, + BitReader* JXL_RESTRICT br) { + br->Refill(); + return ReadSymbolWithoutRefill(histo_idx, br); + } + + bool CheckANSFinalState() { return state_ == (ANS_SIGNATURE << 16u); } + + template + static JXL_INLINE uint32_t ReadHybridUintConfig( + const HybridUintConfig& config, size_t token, BitReader* br) { + size_t split_token = config.split_token; + size_t msb_in_token = config.msb_in_token; + size_t lsb_in_token = config.lsb_in_token; + size_t split_exponent = config.split_exponent; + // Fast-track version of hybrid integer decoding. + if (token < split_token) return token; + uint32_t nbits = split_exponent - (msb_in_token + lsb_in_token) + + ((token - split_token) >> (msb_in_token + lsb_in_token)); + // Max amount of bits for ReadBits is 32 and max valid left shift is 29 + // bits. However, for speed no error is propagated here, instead limit the + // nbits size. If nbits > 29, the code stream is invalid, but no error is + // returned. + // Note that in most cases we will emit an error if the histogram allows + // representing numbers that would cause invalid shifts, but we need to + // keep this check as when LZ77 is enabled it might make sense to have an + // histogram that could in principle cause invalid shifts. + nbits &= 31u; + uint32_t low = token & ((1 << lsb_in_token) - 1); + token >>= lsb_in_token; + const size_t bits = br->PeekBits(nbits); + br->Consume(nbits); + size_t ret = (((((1 << msb_in_token) | (token & ((1 << msb_in_token) - 1))) + << nbits) | + bits) + << lsb_in_token) | + low; + // TODO(eustas): mark BitReader as unhealthy if nbits > 29 or ret does not + // fit uint32_t + return static_cast(ret); + } + + // Takes a *clustered* idx. 
+ size_t ReadHybridUintClustered(size_t ctx, BitReader* JXL_RESTRICT br) { + if (JXL_UNLIKELY(num_to_copy_ > 0)) { + size_t ret = lz77_window_[(copy_pos_++) & kWindowMask]; + num_to_copy_--; + lz77_window_[(num_decoded_++) & kWindowMask] = ret; + return ret; + } + br->Refill(); // covers ReadSymbolWithoutRefill + PeekBits + size_t token = ReadSymbolWithoutRefill(ctx, br); + if (JXL_UNLIKELY(token >= lz77_threshold_)) { + num_to_copy_ = + ReadHybridUintConfig(lz77_length_uint_, token - lz77_threshold_, br) + + lz77_min_length_; + br->Refill(); // covers ReadSymbolWithoutRefill + PeekBits + // Distance code. + size_t token = ReadSymbolWithoutRefill(lz77_ctx_, br); + size_t distance = ReadHybridUintConfig(configs[lz77_ctx_], token, br); + if (JXL_LIKELY(distance < num_special_distances_)) { + distance = special_distances_[distance]; + } else { + distance = distance + 1 - num_special_distances_; + } + if (JXL_UNLIKELY(distance > num_decoded_)) { + distance = num_decoded_; + } + if (JXL_UNLIKELY(distance > kWindowSize)) { + distance = kWindowSize; + } + copy_pos_ = num_decoded_ - distance; + if (JXL_UNLIKELY(distance == 0)) { + JXL_DASSERT(lz77_window_ != nullptr); + // distance 0 -> num_decoded_ == copy_pos_ == 0 + size_t to_fill = std::min(num_to_copy_, kWindowSize); + memset(lz77_window_, 0, to_fill * sizeof(lz77_window_[0])); + } + // TODO(eustas): overflow; mark BitReader as unhealthy + if (num_to_copy_ < lz77_min_length_) return 0; + return ReadHybridUintClustered(ctx, br); // will trigger a copy. + } + size_t ret = ReadHybridUintConfig(configs[ctx], token, br); + if (lz77_window_) lz77_window_[(num_decoded_++) & kWindowMask] = ret; + return ret; + } + + JXL_INLINE size_t ReadHybridUint(size_t ctx, BitReader* JXL_RESTRICT br, + const std::vector& context_map) { + return ReadHybridUintClustered(context_map[ctx], br); + } + + // ctx is a *clustered* context! + // This function will modify the ANS state as if `count` symbols have been + // decoded. 
+ bool IsSingleValueAndAdvance(size_t ctx, uint32_t* value, size_t count) { + // TODO(veluca): No optimization for Huffman mode yet. + if (use_prefix_code_) return false; + // TODO(eustas): propagate "degenerate_symbol" to simplify this method. + const uint32_t res = state_ & (ANS_TAB_SIZE - 1u); + const AliasTable::Entry* table = &alias_tables_[ctx << log_alpha_size_]; + AliasTable::Symbol symbol = + AliasTable::Lookup(table, res, log_entry_size_, entry_size_minus_1_); + if (symbol.freq != ANS_TAB_SIZE) return false; + if (configs[ctx].split_token <= symbol.value) return false; + if (symbol.value >= lz77_threshold_) return false; + *value = symbol.value; + if (lz77_window_) { + for (size_t i = 0; i < count; i++) { + lz77_window_[(num_decoded_++) & kWindowMask] = symbol.value; + } + } + return true; + } + + static constexpr size_t kMaxCheckpointInterval = 512; + struct Checkpoint { + uint32_t state; + uint32_t num_to_copy; + uint32_t copy_pos; + uint32_t num_decoded; + uint32_t lz77_window[kMaxCheckpointInterval]; + }; + void Save(Checkpoint* checkpoint) { + checkpoint->state = state_; + checkpoint->num_decoded = num_decoded_; + checkpoint->num_to_copy = num_to_copy_; + checkpoint->copy_pos = copy_pos_; + if (lz77_window_) { + size_t win_start = num_decoded_ & kWindowMask; + size_t win_end = (num_decoded_ + kMaxCheckpointInterval) & kWindowMask; + if (win_end > win_start) { + memcpy(checkpoint->lz77_window, lz77_window_ + win_start, + (win_end - win_start) * sizeof(*lz77_window_)); + } else { + memcpy(checkpoint->lz77_window, lz77_window_ + win_start, + (kWindowSize - win_start) * sizeof(*lz77_window_)); + memcpy(checkpoint->lz77_window + (kWindowSize - win_start), + lz77_window_, win_end * sizeof(*lz77_window_)); + } + } + } + void Restore(const Checkpoint& checkpoint) { + state_ = checkpoint.state; + JXL_DASSERT(num_decoded_ <= + checkpoint.num_decoded + kMaxCheckpointInterval); + num_decoded_ = checkpoint.num_decoded; + num_to_copy_ = checkpoint.num_to_copy; + 
copy_pos_ = checkpoint.copy_pos; + if (lz77_window_) { + size_t win_start = num_decoded_ & kWindowMask; + size_t win_end = (num_decoded_ + kMaxCheckpointInterval) & kWindowMask; + if (win_end > win_start) { + memcpy(lz77_window_ + win_start, checkpoint.lz77_window, + (win_end - win_start) * sizeof(*lz77_window_)); + } else { + memcpy(lz77_window_ + win_start, checkpoint.lz77_window, + (kWindowSize - win_start) * sizeof(*lz77_window_)); + memcpy(lz77_window_, checkpoint.lz77_window + (kWindowSize - win_start), + win_end * sizeof(*lz77_window_)); + } + } + } + + private: + const AliasTable::Entry* JXL_RESTRICT alias_tables_; // not owned + const HuffmanDecodingData* huffman_data_; + bool use_prefix_code_; + uint32_t state_ = ANS_SIGNATURE << 16u; + const HybridUintConfig* JXL_RESTRICT configs; + uint32_t log_alpha_size_; + uint32_t log_entry_size_; + uint32_t entry_size_minus_1_; + + // LZ77 structures and constants. + static constexpr size_t kWindowMask = kWindowSize - 1; + CacheAlignedUniquePtr lz77_window_storage_; + uint32_t* lz77_window_ = nullptr; + uint32_t num_decoded_ = 0; + uint32_t num_to_copy_ = 0; + uint32_t copy_pos_ = 0; + uint32_t lz77_ctx_ = 0; + uint32_t lz77_min_length_ = 0; + uint32_t lz77_threshold_ = 1 << 20; // bigger than any symbol. + HybridUintConfig lz77_length_uint_; + uint32_t special_distances_[kNumSpecialDistances]; + uint32_t num_special_distances_; +}; + +Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code, + std::vector* context_map, + bool disallow_lz77 = false); + +// Exposed for tests. 
+Status DecodeUintConfigs(size_t log_alpha_size, + std::vector* uint_config, + BitReader* br); + +} // namespace jxl + +#endif // LIB_JXL_DEC_ANS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_bit_reader.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_bit_reader.h new file mode 100644 index 0000000000..df70284e3b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_bit_reader.h @@ -0,0 +1,354 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_BIT_READER_H_ +#define LIB_JXL_DEC_BIT_READER_H_ + +// Bounds-checked bit reader; 64-bit buffer with support for deferred refills +// and switching to reading byte-aligned words. + +#include +#include +#include // memcpy + +#ifdef __BMI2__ +#include +#endif + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" + +namespace jxl { + +// Reads bits previously written to memory by BitWriter. Uses unaligned 8-byte +// little-endian loads. +class BitReader { + public: + static constexpr size_t kMaxBitsPerCall = 56; + + // Constructs an invalid BitReader, to be overwritten before usage. + BitReader() + : buf_(0), + bits_in_buf_(0), + next_byte_{nullptr}, + end_minus_8_{nullptr}, + first_byte_(nullptr) {} + BitReader(const BitReader&) = delete; + + // bytes need not be aligned nor padded! + template + explicit BitReader(const ArrayLike& bytes) + : buf_(0), + bits_in_buf_(0), + next_byte_(bytes.data()), + // Assumes first_byte_ >= 8. + end_minus_8_(bytes.data() - 8 + bytes.size()), + first_byte_(bytes.data()) { + Refill(); + } + ~BitReader() { + // Close() must be called before destroying an initialized bit reader. + // Invalid bit readers will have a nullptr in first_byte_. 
+ JXL_ASSERT(close_called_ || !first_byte_); + } + + // Move operator needs to invalidate the other BitReader such that it is + // irrelevant if we call Close() on it or not. + BitReader& operator=(BitReader&& other) noexcept { + // Ensure the current instance was already closed, before we overwrite it + // with other. + JXL_ASSERT(close_called_ || !first_byte_); + + JXL_DASSERT(!other.close_called_); + buf_ = other.buf_; + bits_in_buf_ = other.bits_in_buf_; + next_byte_ = other.next_byte_; + end_minus_8_ = other.end_minus_8_; + first_byte_ = other.first_byte_; + overread_bytes_ = other.overread_bytes_; + close_called_ = other.close_called_; + + other.first_byte_ = nullptr; + other.next_byte_ = nullptr; + return *this; + } + BitReader& operator=(const BitReader& other) = delete; + + // For time-critical reads, refills can be shared by multiple reads. + // Based on variant 4 (plus bounds-checking), see + // fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/ + JXL_INLINE void Refill() { + if (JXL_UNLIKELY(next_byte_ > end_minus_8_)) { + BoundsCheckedRefill(); + } else { + // It's safe to load 64 bits; insert valid (possibly nonzero) bits above + // bits_in_buf_. The shift requires bits_in_buf_ < 64. + buf_ |= LoadLE64(next_byte_) << bits_in_buf_; + + // Advance by bytes fully absorbed into the buffer. + next_byte_ += (63 - bits_in_buf_) >> 3; + + // We absorbed a multiple of 8 bits, so the lower 3 bits of bits_in_buf_ + // must remain unchanged, otherwise the next refill's shifted bits will + // not align with buf_. Set the three upper bits so the result >= 56. + bits_in_buf_ |= 56; + JXL_DASSERT(56 <= bits_in_buf_ && bits_in_buf_ < 64); + } + } + + // Returns the bits that would be returned by Read without calling Advance(). + // It is legal to PEEK at more bits than present in the bitstream (required + // by Huffman), and those bits will be zero. 
+ template + JXL_INLINE uint64_t PeekFixedBits() const { + static_assert(N <= kMaxBitsPerCall, "Reading too many bits in one call."); + JXL_DASSERT(!close_called_); + return buf_ & ((1ULL << N) - 1); + } + + JXL_INLINE uint64_t PeekBits(size_t nbits) const { + JXL_DASSERT(nbits <= kMaxBitsPerCall); + JXL_DASSERT(!close_called_); + + // Slightly faster but requires BMI2. It is infeasible to make the many + // callers reside between begin/end_target, especially because only the + // callers in dec_ans are time-critical. Therefore only enabled if the + // entire binary is compiled for (and thus requires) BMI2. +#if defined(__BMI2__) && defined(__x86_64__) + return _bzhi_u64(buf_, nbits); +#else + const uint64_t mask = (1ULL << nbits) - 1; + return buf_ & mask; +#endif + } + + // Removes bits from the buffer. Need not match the previous Peek size, but + // the buffer must contain at least num_bits (this prevents consuming more + // than the total number of bits). + JXL_INLINE void Consume(size_t num_bits) { + JXL_DASSERT(!close_called_); + JXL_DASSERT(bits_in_buf_ >= num_bits); +#ifdef JXL_CRASH_ON_ERROR + // When JXL_CRASH_ON_ERROR is defined, it is a fatal error to read more bits + // than available in the stream. A non-zero overread_bytes_ implies that + // next_byte_ is already at the end of the stream, so we don't need to + // check that. + JXL_ASSERT(bits_in_buf_ >= num_bits + overread_bytes_ * kBitsPerByte); +#endif + bits_in_buf_ -= num_bits; + buf_ >>= num_bits; + } + + JXL_INLINE uint64_t ReadBits(size_t nbits) { + JXL_DASSERT(!close_called_); + Refill(); + const uint64_t bits = PeekBits(nbits); + Consume(nbits); + return bits; + } + + template + JXL_INLINE uint64_t ReadFixedBits() { + JXL_DASSERT(!close_called_); + Refill(); + const uint64_t bits = PeekFixedBits(); + Consume(N); + return bits; + } + + // Equivalent to calling ReadFixedBits(1) `skip` times, but much faster. + // `skip` is typically large. 
+ void SkipBits(size_t skip) { + JXL_DASSERT(!close_called_); + // Buffer is large enough - don't zero buf_ below. + if (JXL_UNLIKELY(skip <= bits_in_buf_)) { + Consume(skip); + return; + } + + // First deduct what we can satisfy from the buffer + skip -= bits_in_buf_; + bits_in_buf_ = 0; + // Not enough to call Advance - that may leave some bits in the buffer + // which were previously ABOVE bits_in_buf. + buf_ = 0; + + // Skip whole bytes + const size_t whole_bytes = skip / kBitsPerByte; + skip %= kBitsPerByte; + if (JXL_UNLIKELY(whole_bytes > + static_cast(end_minus_8_ + 8 - next_byte_))) { + // This is already an overflow condition (skipping past the end of the bit + // stream). However if we increase next_byte_ too much we risk overflowing + // that value and potentially making it valid again (next_byte_ < end). + // This will set next_byte_ to the end of the stream and still consume + // some bits in overread_bytes_, however the TotalBitsConsumed() will be + // incorrect (still larger than the TotalBytes()). + next_byte_ = end_minus_8_ + 8; + skip += kBitsPerByte; + } else { + next_byte_ += whole_bytes; + } + + Refill(); + Consume(skip); + } + + size_t TotalBitsConsumed() const { + const size_t bytes_read = static_cast(next_byte_ - first_byte_); + return (bytes_read + overread_bytes_) * kBitsPerByte - bits_in_buf_; + } + + Status JumpToByteBoundary() { + const size_t remainder = TotalBitsConsumed() % kBitsPerByte; + if (remainder == 0) return true; + if (JXL_UNLIKELY(ReadBits(kBitsPerByte - remainder) != 0)) { + return JXL_FAILURE("Non-zero padding bits"); + } + return true; + } + + // For interoperability with other bitreaders (for resuming at + // non-byte-aligned positions). + const uint8_t* FirstByte() const { return first_byte_; } + size_t TotalBytes() const { + return static_cast(end_minus_8_ + 8 - first_byte_); + } + + // Returns span of the remaining (unconsumed) bytes, e.g. for passing to + // external decoders such as Brotli. 
+ Span GetSpan() const { + JXL_DASSERT(first_byte_ != nullptr); + JXL_ASSERT(TotalBitsConsumed() % kBitsPerByte == 0); + const size_t offset = TotalBitsConsumed() / kBitsPerByte; // no remainder + JXL_ASSERT(offset <= TotalBytes()); + return Span(first_byte_ + offset, TotalBytes() - offset); + } + + // Returns whether all the bits read so far have been within the input bounds. + // When reading past the EOF, the Read*() and Consume() functions return zeros + // but flag a failure when calling Close() without checking this function. + Status AllReadsWithinBounds() { + // Mark up to which point the user checked the out of bounds condition. If + // the user handles the condition at higher level (e.g. fetch more bytes + // from network, return a custom JXL_FAILURE, ...), Close() should not + // output a debug error (which would break tests with JXL_CRASH_ON_ERROR + // even when legitimately handling the situation at higher level). This is + // used by Bundle::CanRead. + checked_out_of_bounds_bits_ = TotalBitsConsumed(); + if (TotalBitsConsumed() > TotalBytes() * kBitsPerByte) { + return false; + } + return true; + } + + // Close the bit reader and return whether all the previous reads were + // successful. Close must be called once. + Status Close() { + JXL_DASSERT(!close_called_); + close_called_ = true; + if (!first_byte_) return true; + if (TotalBitsConsumed() > checked_out_of_bounds_bits_ && + TotalBitsConsumed() > TotalBytes() * kBitsPerByte) { + return JXL_FAILURE("Read more bits than available in the bit_reader"); + } + return true; + } + + private: + // Separate function avoids inlining this relatively cold code into callers. 
+ JXL_NOINLINE void BoundsCheckedRefill() { + PROFILER_FUNC; + const uint8_t* end = end_minus_8_ + 8; + + // Read whole bytes until we have [56, 64) bits (same as LoadLE64) + for (; bits_in_buf_ < 64 - kBitsPerByte; bits_in_buf_ += kBitsPerByte) { + if (next_byte_ >= end) break; + buf_ |= static_cast(*next_byte_++) << bits_in_buf_; + } + JXL_DASSERT(bits_in_buf_ < 64); + + // Add extra bytes as 0 at the end of the stream in the bit_buffer_. If + // these bits are read, Close() will return a failure. + size_t extra_bytes = (63 - bits_in_buf_) / kBitsPerByte; + overread_bytes_ += extra_bytes; + bits_in_buf_ += extra_bytes * kBitsPerByte; + + JXL_DASSERT(bits_in_buf_ < 64); + JXL_DASSERT(bits_in_buf_ >= 56); + } + + JXL_NOINLINE uint32_t BoundsCheckedReadByteAlignedWord() { + if (next_byte_ + 1 < end_minus_8_ + 8) { + uint32_t ret = LoadLE16(next_byte_); + next_byte_ += 2; + return ret; + } + overread_bytes_ += 2; + return 0; + } + + uint64_t buf_; + size_t bits_in_buf_; // [0, 64) + const uint8_t* JXL_RESTRICT next_byte_; + const uint8_t* end_minus_8_; // for refill bounds check + const uint8_t* first_byte_; // for GetSpan + + // Number of bytes past the end that were loaded into the buf_. These bytes + // are not read from memory, but instead assumed 0. It is an error (likely due + // to an invalid stream) to Consume() more bits than specified in the range + // passed to the constructor. + uint64_t overread_bytes_{0}; + bool close_called_{false}; + + uint64_t checked_out_of_bounds_bits_{0}; +}; + +// Closes a BitReader when the BitReaderScopedCloser goes out of scope. When +// closing the bit reader, if the status result was failure it sets this failure +// to the passed variable pointer. Typical usage. +// +// Status ret = true; +// { +// BitReader reader(...); +// BitReaderScopedCloser reader_closer(&reader, &ret); +// +// // ... code that can return errors here ... +// } +// // ... more code that doesn't use the BitReader. 
+// return ret; + +class BitReaderScopedCloser { + public: + BitReaderScopedCloser(BitReader* reader, Status* status) + : reader_(reader), status_(status) { + JXL_DASSERT(reader_ != nullptr); + JXL_DASSERT(status_ != nullptr); + } + ~BitReaderScopedCloser() { + if (reader_ != nullptr) { + Status close_ret = reader_->Close(); + if (!close_ret) *status_ = close_ret; + } + } + void CloseAndSuppressError() { + JXL_ASSERT(reader_ != nullptr); + (void)reader_->Close(); + reader_ = nullptr; + } + BitReaderScopedCloser(const BitReaderScopedCloser&) = delete; + + private: + BitReader* reader_; + Status* status_; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_BIT_READER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc new file mode 100644 index 0000000000..e40a97fcb9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.cc @@ -0,0 +1,170 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/dec_cache.h" + +#include "lib/jxl/dec_reconstruct.h" + +namespace jxl { + +void PassesDecoderState::EnsureBordersStorage() { + if (!EagerFinalizeImageRect()) return; + size_t padding = FinalizeRectPadding(); + size_t bordery = 2 * padding; + size_t borderx = padding + group_border_assigner.PaddingX(padding); + Rect horizontal = Rect(0, 0, shared->frame_dim.xsize_padded, + bordery * shared->frame_dim.ysize_groups * 2); + if (!SameSize(horizontal, borders_horizontal)) { + borders_horizontal = Image3F(horizontal.xsize(), horizontal.ysize()); + } + Rect vertical = Rect(0, 0, borderx * shared->frame_dim.xsize_groups * 2, + shared->frame_dim.ysize_padded); + if (!SameSize(vertical, borders_vertical)) { + borders_vertical = Image3F(vertical.xsize(), vertical.ysize()); + } +} + +namespace { +void SaveBorders(const Rect& block_rect, size_t hshift, size_t vshift, + size_t padding, const ImageF& plane_in, + ImageF* border_storage_h, ImageF* border_storage_v) { + constexpr size_t kGroupDataXBorder = PassesDecoderState::kGroupDataXBorder; + constexpr size_t kGroupDataYBorder = PassesDecoderState::kGroupDataYBorder; + size_t x0 = DivCeil(block_rect.x0() * kBlockDim, 1 << hshift); + size_t x1 = + DivCeil((block_rect.x0() + block_rect.xsize()) * kBlockDim, 1 << hshift); + size_t y0 = DivCeil(block_rect.y0() * kBlockDim, 1 << vshift); + size_t y1 = + DivCeil((block_rect.y0() + block_rect.ysize()) * kBlockDim, 1 << vshift); + size_t gy = block_rect.y0() / kGroupDimInBlocks; + size_t gx = block_rect.x0() / kGroupDimInBlocks; + // TODO(veluca): this is too much with chroma upsampling. It's just + // inefficient though. 
+ size_t borderx = GroupBorderAssigner::PaddingX(padding); + size_t bordery = padding; + size_t borderx_write = padding + borderx; + size_t bordery_write = padding + bordery; + CopyImageTo( + Rect(kGroupDataXBorder, kGroupDataYBorder, x1 - x0, bordery_write), + plane_in, Rect(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write), + border_storage_h); + CopyImageTo( + Rect(kGroupDataXBorder, kGroupDataYBorder + y1 - y0 - bordery_write, + x1 - x0, bordery_write), + plane_in, Rect(x0, (gy * 2 + 1) * bordery_write, x1 - x0, bordery_write), + border_storage_h); + CopyImageTo( + Rect(kGroupDataXBorder, kGroupDataYBorder, borderx_write, y1 - y0), + plane_in, Rect((gx * 2) * borderx_write, y0, borderx_write, y1 - y0), + border_storage_v); + CopyImageTo(Rect(kGroupDataXBorder + x1 - x0 - borderx_write, + kGroupDataYBorder, borderx_write, y1 - y0), + plane_in, + Rect((gx * 2 + 1) * borderx_write, y0, borderx_write, y1 - y0), + border_storage_v); +} + +void LoadBorders(const Rect& block_rect, size_t hshift, size_t vshift, + const FrameDimensions& frame_dim, size_t padding, + const ImageF& border_storage_h, const ImageF& border_storage_v, + const Rect& r, ImageF* plane_out) { + constexpr size_t kGroupDataXBorder = PassesDecoderState::kGroupDataXBorder; + constexpr size_t kGroupDataYBorder = PassesDecoderState::kGroupDataYBorder; + size_t x0 = DivCeil(block_rect.x0() * kBlockDim, 1 << hshift); + size_t x1 = + DivCeil((block_rect.x0() + block_rect.xsize()) * kBlockDim, 1 << hshift); + size_t y0 = DivCeil(block_rect.y0() * kBlockDim, 1 << vshift); + size_t y1 = + DivCeil((block_rect.y0() + block_rect.ysize()) * kBlockDim, 1 << vshift); + size_t gy = block_rect.y0() / kGroupDimInBlocks; + size_t gx = block_rect.x0() / kGroupDimInBlocks; + size_t borderx = GroupBorderAssigner::PaddingX(padding); + size_t bordery = padding; + size_t borderx_write = padding + borderx; + size_t bordery_write = padding + bordery; + // Limits of the area to copy from, in image coordinates. 
+ JXL_DASSERT(r.x0() == 0 || r.x0() >= borderx); + size_t x0src = DivCeil(r.x0() == 0 ? r.x0() : r.x0() - borderx, 1 << hshift); + size_t x1src = + DivCeil(r.x0() + r.xsize() + + (r.x0() + r.xsize() == frame_dim.xsize_padded ? 0 : borderx), + 1 << hshift); + JXL_DASSERT(r.y0() == 0 || r.y0() >= bordery); + size_t y0src = DivCeil(r.y0() == 0 ? r.y0() : r.y0() - bordery, 1 << vshift); + size_t y1src = + DivCeil(r.y0() + r.ysize() + + (r.y0() + r.ysize() == frame_dim.ysize_padded ? 0 : bordery), + 1 << vshift); + // Copy other groups' borders from the border storage. + if (y0src < y0) { + CopyImageTo( + Rect(x0src, (gy * 2 - 1) * bordery_write, x1src - x0src, bordery_write), + border_storage_h, + Rect(kGroupDataXBorder + x0src - x0, kGroupDataYBorder - bordery_write, + x1src - x0src, bordery_write), + plane_out); + } + if (y1src > y1) { + CopyImageTo( + Rect(x0src, (gy * 2 + 2) * bordery_write, x1src - x0src, bordery_write), + border_storage_h, + Rect(kGroupDataXBorder + x0src - x0, kGroupDataYBorder + y1 - y0, + x1src - x0src, bordery_write), + plane_out); + } + if (x0src < x0) { + CopyImageTo( + Rect((gx * 2 - 1) * borderx_write, y0src, borderx_write, y1src - y0src), + border_storage_v, + Rect(kGroupDataXBorder - borderx_write, kGroupDataYBorder + y0src - y0, + borderx_write, y1src - y0src), + plane_out); + } + if (x1src > x1) { + CopyImageTo( + Rect((gx * 2 + 2) * borderx_write, y0src, borderx_write, y1src - y0src), + border_storage_v, + Rect(kGroupDataXBorder + x1 - x0, kGroupDataYBorder + y0src - y0, + borderx_write, y1src - y0src), + plane_out); + } +} + +} // namespace + +Status PassesDecoderState::FinalizeGroup(size_t group_idx, size_t thread, + Image3F* pixel_data, + ImageBundle* output) { + // Copy the group borders to the border storage. 
+ const Rect block_rect = shared->BlockGroupRect(group_idx); + const YCbCrChromaSubsampling& cs = shared->frame_header.chroma_subsampling; + size_t padding = FinalizeRectPadding(); + for (size_t c = 0; c < 3; c++) { + SaveBorders(block_rect, cs.HShift(c), cs.VShift(c), padding, + pixel_data->Plane(c), &borders_horizontal.Plane(c), + &borders_vertical.Plane(c)); + } + Rect fir_rects[GroupBorderAssigner::kMaxToFinalize]; + size_t num_fir_rects = 0; + group_border_assigner.GroupDone(group_idx, FinalizeRectPadding(), fir_rects, + &num_fir_rects); + for (size_t i = 0; i < num_fir_rects; i++) { + const Rect& r = fir_rects[i]; + for (size_t c = 0; c < 3; c++) { + LoadBorders(block_rect, cs.HShift(c), cs.VShift(c), shared->frame_dim, + padding, borders_horizontal.Plane(c), + borders_vertical.Plane(c), r, &pixel_data->Plane(c)); + } + Rect pixel_data_rect( + kGroupDataXBorder + r.x0() - block_rect.x0() * kBlockDim, + kGroupDataYBorder + r.y0() - block_rect.y0() * kBlockDim, r.xsize(), + r.ysize()); + JXL_RETURN_IF_ERROR(FinalizeImageRect(pixel_data, pixel_data_rect, {}, this, + thread, output, r)); + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.h new file mode 100644 index 0000000000..85322aa3b5 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_cache.h @@ -0,0 +1,411 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_DEC_CACHE_H_ +#define LIB_JXL_DEC_CACHE_H_ + +#include + +#include // HWY_ALIGN_MAX + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dec_group_border.h" +#include "lib/jxl/dec_noise.h" +#include "lib/jxl/dec_upsample.h" +#include "lib/jxl/filters.h" +#include "lib/jxl/image.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +// Per-frame decoder state. All the images here should be accessed through a +// group rect (either with block units or pixel units). +struct PassesDecoderState { + PassesSharedState shared_storage; + // Allows avoiding copies for encoder loop. + const PassesSharedState* JXL_RESTRICT shared = &shared_storage; + + // Upsamplers for all the possible upsampling factors (2 to 8). + Upsampler upsamplers[3]; + + // Storage for RNG output for noise synthesis. + Image3F noise; + + // Storage for pre-color-transform output for displayed + // save_before_color_transform frames. + Image3F pre_color_transform_frame; + // Non-empty (contains originals) if extra-channels were cropped. + std::vector pre_color_transform_ec; + + // For ANS decoding. + std::vector code; + std::vector> context_map; + + // Multiplier to be applied to the quant matrices of the x channel. + float x_dm_multiplier; + float b_dm_multiplier; + + // Decoded image. + Image3F decoded; + std::vector extra_channels; + + // Borders between groups. Only allocated if `decoded` is *not* allocated. + // We also store the extremal borders for simplicity. Horizontal borders are + // stored in an image as wide as the main frame, in top-to-bottom order (top + // border of a group first, followed by the bottom border, followed by top + // border of the next group). Vertical borders are similarly stored. 
+ Image3F borders_horizontal; + Image3F borders_vertical; + + // RGB8 output buffer. If not nullptr, image data will be written to this + // buffer instead of being written to the output ImageBundle. The image data + // is assumed to have the stride given by `rgb_stride`, hence row `i` starts + // at position `i * rgb_stride`. + uint8_t* rgb_output; + size_t rgb_stride = 0; + + // Whether to use int16 float-XYB-to-uint8-srgb conversion. + bool fast_xyb_srgb8_conversion; + + // If true, rgb_output or callback output is RGBA using 4 instead of 3 bytes + // per pixel. + bool rgb_output_is_rgba; + + // Callback for line-by-line output. + std::function pixel_callback; + // Buffer of upsampling * kApplyImageFeaturesTileDim ones. + std::vector opaque_alpha; + // One row per thread + std::vector> pixel_callback_rows; + + // Seed for noise, to have different noise per-frame. + size_t noise_seed = 0; + + // Keep track of the transform types used. + std::atomic used_acs{0}; + + // Storage for coefficients if in "accumulate" mode. + std::unique_ptr coefficients = make_unique>(0, 0); + + // Filter application pipeline used by ApplyImageFeatures. One entry is needed + // per thread. + std::vector filter_pipelines; + + // Input weights used by the filters. These are shared from multiple threads + // but are read-only for the filter application. + FilterWeights filter_weights; + + // Manages the status of borders. + GroupBorderAssigner group_border_assigner; + + // TODO(veluca): this should eventually become "iff no global modular + // transform was applied". + bool EagerFinalizeImageRect() const { + return shared->frame_header.encoding == FrameEncoding::kVarDCT && + shared->frame_header.nonserialized_metadata->m.extra_channel_info + .empty(); + } + + // Amount of padding that will be accessed, in all directions, outside a rect + // during a call to FinalizeImageRect(). 
+ size_t FinalizeRectPadding() const { + size_t padding = shared->frame_header.loop_filter.Padding(); + padding += shared->frame_header.upsampling == 1 ? 0 : 2; + JXL_DASSERT(padding <= kMaxFinalizeRectPadding); + for (auto ups : shared->frame_header.extra_channel_upsampling) { + if (ups > 1) { + padding = std::max(padding, size_t{2}); + } + } + // We could be making a distinction between h and w padding here, but it is + // likely not worth it. + if (!shared->frame_header.chroma_subsampling.Is444()) { + padding = std::max(padding / 2 + 1, padding); + } + return padding; + } + + // Storage for intermediate data during FinalizeRect steps. + // TODO(veluca): these buffers are larger than strictly necessary. + std::vector filter_input_storage; + std::vector padded_upsampling_input_storage; + std::vector upsampling_input_storage; + size_t upsampler_arena_size = 0; + std::vector> upsampler_storage; + // We keep four arrays, one per upsampling level, to reduce memory usage in + // the common case of no upsampling. + std::vector output_pixel_data_storage[4] = {}; + std::vector ec_temp_images; + std::vector ycbcr_temp_images; + std::vector ycbcr_out_images; + + // Buffer for decoded pixel data for a group. + std::vector group_data; + static constexpr size_t kGroupDataYBorder = kMaxFinalizeRectPadding * 2; + static constexpr size_t kGroupDataXBorder = + RoundUpToBlockDim(kMaxFinalizeRectPadding) * 2 + kBlockDim; + + void EnsureStorage(size_t num_threads) { + // We need one filter_storage per thread, ensure we have at least that many. + if (shared->frame_header.loop_filter.epf_iters != 0 || + shared->frame_header.loop_filter.gab) { + if (filter_pipelines.size() < num_threads) { + filter_pipelines.resize(num_threads); + } + } + // We allocate filter_input_storage unconditionally to ensure that the image + // is allocated if we need it for DC upsampling. 
+ for (size_t _ = filter_input_storage.size(); _ < num_threads; _++) { + // Extra padding along the x dimension to ensure memory accesses don't + // load out-of-bounds pixels. + filter_input_storage.emplace_back( + kApplyImageFeaturesTileDim + 2 * kGroupDataXBorder, + kApplyImageFeaturesTileDim + 2 * kGroupDataYBorder); + } + if (shared->frame_header.upsampling != 1) { + for (size_t _ = upsampling_input_storage.size(); _ < num_threads; _++) { + // At this point, we only need up to 2 pixels of border per side for + // upsampling, but we add an extra border for aligned access. + upsampling_input_storage.emplace_back( + kApplyImageFeaturesTileDim + 2 * kBlockDim, + kApplyImageFeaturesTileDim + 4); + padded_upsampling_input_storage.emplace_back( + kApplyImageFeaturesTileDim + 2 * kBlockDim, + kApplyImageFeaturesTileDim + 4); + } + } + const size_t arena_size = Upsampler::GetArenaSize( + kApplyImageFeaturesTileDim * shared->frame_header.upsampling); + if (arena_size > upsampler_arena_size) upsampler_storage.clear(); + for (size_t _ = upsampler_storage.size(); _ < num_threads; _++) { + upsampler_storage.emplace_back(hwy::AllocateAligned(arena_size)); + } + upsampler_arena_size = arena_size; + for (size_t _ = group_data.size(); _ < num_threads; _++) { + group_data.emplace_back(kGroupDim + 2 * kGroupDataXBorder, + kGroupDim + 2 * kGroupDataYBorder); +#if MEMORY_SANITIZER + // Avoid errors due to loading vectors on the outermost padding. 
+ FillImage(msan::kSanitizerSentinel, &group_data.back()); +#endif + } + if (!shared->frame_header.chroma_subsampling.Is444()) { + for (size_t _ = ycbcr_temp_images.size(); _ < num_threads; _++) { + ycbcr_temp_images.emplace_back(kGroupDim + 2 * kGroupDataXBorder, + kGroupDim + 2 * kGroupDataYBorder); + ycbcr_out_images.emplace_back(kGroupDim + 2 * kGroupDataXBorder, + kGroupDim + 2 * kGroupDataYBorder); + } + } + if (rgb_output || pixel_callback) { + size_t log2_upsampling = CeilLog2Nonzero(shared->frame_header.upsampling); + for (size_t _ = output_pixel_data_storage[log2_upsampling].size(); + _ < num_threads; _++) { + output_pixel_data_storage[log2_upsampling].emplace_back( + kApplyImageFeaturesTileDim << log2_upsampling, + kApplyImageFeaturesTileDim << log2_upsampling); + } + opaque_alpha.resize( + kApplyImageFeaturesTileDim * shared->frame_header.upsampling, 1.0f); + if (pixel_callback) { + pixel_callback_rows.resize(num_threads); + for (size_t i = 0; i < pixel_callback_rows.size(); ++i) { + pixel_callback_rows[i].resize(kApplyImageFeaturesTileDim * + shared->frame_header.upsampling * + (rgb_output_is_rgba ? 4 : 3)); + } + } + } + if (shared->metadata->m.num_extra_channels * num_threads > + ec_temp_images.size()) { + ec_temp_images.resize(shared->metadata->m.num_extra_channels * + num_threads); + } + for (size_t i = 0; i < shared->metadata->m.num_extra_channels; i++) { + if (shared->frame_header.extra_channel_upsampling[i] == 1) continue; + // We need up to 2 pixels of padding on each side. On the x axis, we round + // up padding so that 0 starts at a multiple of kBlockDim. 
+ size_t xs = kApplyImageFeaturesTileDim * shared->frame_header.upsampling / + shared->frame_header.extra_channel_upsampling[i] + + 2 * kBlockDim; + size_t ys = kApplyImageFeaturesTileDim * shared->frame_header.upsampling / + shared->frame_header.extra_channel_upsampling[i] + + 4; + for (size_t t = 0; t < num_threads; t++) { + auto& eti = + ec_temp_images[t * shared->metadata->m.num_extra_channels + i]; + if (eti.xsize() < xs || eti.ysize() < ys) { + eti = ImageF(xs, ys); + } + } + } + } + + // Information for colour conversions. + OutputEncodingInfo output_encoding_info; + + // Initializes decoder-specific structures using information from *shared. + Status Init() { + x_dm_multiplier = + std::pow(1 / (1.25f), shared->frame_header.x_qm_scale - 2.0f); + b_dm_multiplier = + std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f); + + rgb_output = nullptr; + pixel_callback = nullptr; + rgb_output_is_rgba = false; + fast_xyb_srgb8_conversion = false; + used_acs = 0; + + group_border_assigner.Init(shared->frame_dim); + const LoopFilter& lf = shared->frame_header.loop_filter; + JXL_RETURN_IF_ERROR(filter_weights.Init(lf, shared->frame_dim)); + for (auto& fp : filter_pipelines) { + // De-initialize FilterPipelines. + fp.num_filters = 0; + } + for (size_t i = 0; i < 3; i++) { + upsamplers[i].Init(2 << i, shared->metadata->transform_data); + } + return true; + } + + // Initialize the decoder state after all of DC is decoded. 
+ void InitForAC(ThreadPool* pool) { + shared_storage.coeff_order_size = 0; + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + if (((1 << o) & used_acs) == 0) continue; + uint8_t ord = kStrategyOrder[o]; + shared_storage.coeff_order_size = + std::max(kCoeffOrderOffset[3 * (ord + 1)] * kDCTBlockSize, + shared_storage.coeff_order_size); + } + size_t sz = shared_storage.frame_header.passes.num_passes * + shared_storage.coeff_order_size; + if (sz > shared_storage.coeff_orders.size()) { + shared_storage.coeff_orders.resize(sz); + } + if (shared->frame_header.flags & FrameHeader::kNoise) { + noise = Image3F(shared->frame_dim.xsize_upsampled_padded, + shared->frame_dim.ysize_upsampled_padded); + size_t num_x_groups = DivCeil(noise.xsize(), kGroupDim); + size_t num_y_groups = DivCeil(noise.ysize(), kGroupDim); + PROFILER_ZONE("GenerateNoise"); + auto generate_noise = [&](int group_index, int _) { + size_t gx = group_index % num_x_groups; + size_t gy = group_index / num_x_groups; + Rect rect(gx * kGroupDim, gy * kGroupDim, kGroupDim, kGroupDim, + noise.xsize(), noise.ysize()); + RandomImage3(noise_seed + group_index, rect, &noise); + }; + RunOnPool(pool, 0, num_x_groups * num_y_groups, ThreadPool::SkipInit(), + generate_noise, "Generate noise"); + { + PROFILER_ZONE("High pass noise"); + // 4 * (1 - box kernel) + WeightsSymmetric5 weights{{HWY_REP4(-3.84)}, {HWY_REP4(0.16)}, + {HWY_REP4(0.16)}, {HWY_REP4(0.16)}, + {HWY_REP4(0.16)}, {HWY_REP4(0.16)}}; + // TODO(veluca): avoid copy. + // TODO(veluca): avoid having a full copy of the image in main memory. 
+ ImageF noise_tmp(noise.xsize(), noise.ysize()); + for (size_t c = 0; c < 3; c++) { + Symmetric5(noise.Plane(c), Rect(noise), weights, pool, &noise_tmp); + std::swap(noise.Plane(c), noise_tmp); + } + noise_seed += shared->frame_dim.num_groups; + } + } + EnsureBordersStorage(); + if (!EagerFinalizeImageRect()) { + // decoded must be padded to a multiple of kBlockDim rows since the last + // rows may be used by the filters even if they are outside the frame + // dimension. + decoded = Image3F(shared->frame_dim.xsize_padded, + shared->frame_dim.ysize_padded); + } +#if MEMORY_SANITIZER + // Avoid errors due to loading vectors on the outermost padding. + FillImage(msan::kSanitizerSentinel, &decoded); +#endif + } + + void EnsureBordersStorage(); + + Status FinalizeGroup(size_t group_idx, size_t thread, Image3F* pixel_data, + ImageBundle* output); +}; + +// Temp images required for decoding a single group. Reduces memory allocations +// for large images because we only initialize min(#threads, #groups) instances. +struct GroupDecCache { + void InitOnce(size_t num_passes, size_t used_acs) { + PROFILER_FUNC; + + for (size_t i = 0; i < num_passes; i++) { + if (num_nzeroes[i].xsize() == 0) { + // Allocate enough for a whole group - partial groups on the + // right/bottom border just use a subset. The valid size is passed via + // Rect. + + num_nzeroes[i] = Image3I(kGroupDimInBlocks, kGroupDimInBlocks); + } + } + size_t max_block_area = 0; + + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + AcStrategy acs = AcStrategy::FromRawStrategy(o); + if ((used_acs & (1 << o)) == 0) continue; + size_t area = + acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize; + max_block_area = std::max(area, max_block_area); + } + + if (max_block_area > max_block_area_) { + max_block_area_ = max_block_area; + // We need 3x float blocks for dequantized coefficients and 1x for scratch + // space for transforms. 
+ float_memory_ = hwy::AllocateAligned(max_block_area_ * 4); + // We need 3x int32 or int16 blocks for quantized coefficients. + int32_memory_ = hwy::AllocateAligned(max_block_area_ * 3); + int16_memory_ = hwy::AllocateAligned(max_block_area_ * 3); + } + + dec_group_block = float_memory_.get(); + scratch_space = dec_group_block + max_block_area_ * 3; + dec_group_qblock = int32_memory_.get(); + dec_group_qblock16 = int16_memory_.get(); + } + + // Scratch space used by DecGroupImpl(). + float* dec_group_block; + int32_t* dec_group_qblock; + int16_t* dec_group_qblock16; + + // For TransformToPixels. + float* scratch_space; + // Note that scratch_space is never used at the same time as dec_group_qblock. + // Moreover, only one of dec_group_qblock16 is ever used. + // TODO(veluca): figure out if we can save allocations. + + // AC decoding + Image3I num_nzeroes[kMaxNumPasses]; + + private: + hwy::AlignedFreeUniquePtr float_memory_; + hwy::AlignedFreeUniquePtr int32_memory_; + hwy::AlignedFreeUniquePtr int16_memory_; + size_t max_block_area_ = 0; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_CACHE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc new file mode 100644 index 0000000000..f7fc3d27a4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.cc @@ -0,0 +1,105 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/dec_context_map.h" + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/entropy_coder.h" + +namespace jxl { + +namespace { + +void MoveToFront(uint8_t* v, uint8_t index) { + uint8_t value = v[index]; + uint8_t i = index; + for (; i; --i) v[i] = v[i - 1]; + v[0] = value; +} + +void InverseMoveToFrontTransform(uint8_t* v, int v_len) { + uint8_t mtf[256]; + int i; + for (i = 0; i < 256; ++i) { + mtf[i] = static_cast(i); + } + for (i = 0; i < v_len; ++i) { + uint8_t index = v[i]; + v[i] = mtf[index]; + if (index) MoveToFront(mtf, index); + } +} + +bool VerifyContextMap(const std::vector& context_map, + const size_t num_htrees) { + std::vector have_htree(num_htrees); + size_t num_found = 0; + for (const uint8_t htree : context_map) { + if (htree >= num_htrees) { + return JXL_FAILURE("Invalid histogram index in context map."); + } + if (!have_htree[htree]) { + have_htree[htree] = true; + ++num_found; + } + } + if (num_found != num_htrees) { + return JXL_FAILURE("Incomplete context map."); + } + return true; +} + +} // namespace + +bool DecodeContextMap(std::vector* context_map, size_t* num_htrees, + BitReader* input) { + bool is_simple = input->ReadFixedBits<1>(); + if (is_simple) { + int bits_per_entry = input->ReadFixedBits<2>(); + if (bits_per_entry != 0) { + for (size_t i = 0; i < context_map->size(); i++) { + (*context_map)[i] = input->ReadBits(bits_per_entry); + } + } else { + std::fill(context_map->begin(), context_map->end(), 0); + } + } else { + bool use_mtf = input->ReadFixedBits<1>(); + ANSCode code; + std::vector dummy_ctx_map; + // Usage of LZ77 is disallowed if decoding only two symbols. This doesn't + // make sense in non-malicious bitstreams, and could cause a stack overflow + // in malicious bitstreams by making every context map require its own + // context map. 
+ JXL_RETURN_IF_ERROR( + DecodeHistograms(input, 1, &code, &dummy_ctx_map, + /*disallow_lz77=*/context_map->size() <= 2)); + ANSSymbolReader reader(&code, input); + size_t i = 0; + while (i < context_map->size()) { + uint32_t sym = reader.ReadHybridUint(0, input, dummy_ctx_map); + if (sym >= kMaxClusters) { + return JXL_FAILURE("Invalid cluster ID"); + } + (*context_map)[i] = sym; + i++; + } + if (!reader.CheckANSFinalState()) { + return JXL_FAILURE("Invalid context map"); + } + if (use_mtf) { + InverseMoveToFrontTransform(context_map->data(), context_map->size()); + } + } + *num_htrees = *std::max_element(context_map->begin(), context_map->end()) + 1; + return VerifyContextMap(*context_map, *num_htrees); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.h new file mode 100644 index 0000000000..1db2317827 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_context_map.h @@ -0,0 +1,30 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_CONTEXT_MAP_H_ +#define LIB_JXL_DEC_CONTEXT_MAP_H_ + +#include +#include + +#include + +#include "lib/jxl/dec_bit_reader.h" + +namespace jxl { + +// Context map uses uint8_t. +constexpr size_t kMaxClusters = 256; + +// Reads the context map from the bit stream. On calling this function, +// context_map->size() must be the number of possible context ids. +// Sets *num_htrees to the number of different histogram ids in +// *context_map. 
+bool DecodeContextMap(std::vector* context_map, size_t* num_htrees, + BitReader* input); + +} // namespace jxl + +#endif // LIB_JXL_DEC_CONTEXT_MAP_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc new file mode 100644 index 0000000000..bb9196da23 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.cc @@ -0,0 +1,494 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_external_image.h" + +#include + +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_external_image.cc" +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/transfer_functions-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +void FloatToU32(const float* in, uint32_t* out, size_t num, float mul, + size_t bits_per_sample) { + // TODO(eustas): investigate 24..31 bpp cases. + if (bits_per_sample == 32) { + // Conversion to real 32-bit *unsigned* integers requires more intermediate + // precision that what is given by the usual f32 -> i32 conversion + // instructions, so we run the non-SIMD path for those. + const uint32_t cap = (1ull << bits_per_sample) - 1; + for (size_t x = 0; x < num; x++) { + float v = in[x]; + if (v >= 1.0f) { + out[x] = cap; + } else if (v >= 0.0f) { // Inverted condition => NaN -> 0. + out[x] = static_cast(v * mul + 0.5f); + } else { + out[x] = 0; + } + } + return; + } + + // General SIMD case for less than 32 bits output. 
+ const HWY_FULL(float) d; + const hwy::HWY_NAMESPACE::Rebind du; + + // Unpoison accessing partially-uninitialized vectors with memory sanitizer. + // This is because we run NearestInt() on the vector, which triggers msan even + // it it safe to do so since the values are not mixed between lanes. + const size_t num_round_up = RoundUpTo(num, Lanes(d)); + msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num)); + + const auto one = Set(d, 1.0f); + const auto scale = Set(d, mul); + for (size_t x = 0; x < num; x += Lanes(d)) { + auto v = Load(d, in + x); + // Clamp turns NaN to 'min'. + v = Clamp(v, Zero(d), one); + auto i = NearestInt(v * scale); + Store(BitCast(du, i), du, out + x); + } + + // Poison back the output. + msan::PoisonMemory(out + num, sizeof(out[0]) * (num_round_up - num)); +} + +void FloatToF16(const float* in, hwy::float16_t* out, size_t num) { + const HWY_FULL(float) d; + const hwy::HWY_NAMESPACE::Rebind du; + + // Unpoison accessing partially-uninitialized vectors with memory sanitizer. + // This is because we run DemoteTo() on the vector which triggers msan. + const size_t num_round_up = RoundUpTo(num, Lanes(d)); + msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num)); + + for (size_t x = 0; x < num; x += Lanes(d)) { + auto v = Load(d, in + x); + auto v16 = DemoteTo(du, v); + Store(v16, du, out + x); + } + + // Poison back the output. + msan::PoisonMemory(out + num, sizeof(out[0]) * (num_round_up - num)); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace jxl { +namespace { + +// Stores a float in big endian +void StoreBEFloat(float value, uint8_t* p) { + uint32_t u; + memcpy(&u, &value, 4); + StoreBE32(u, p); +} + +// Stores a float in little endian +void StoreLEFloat(float value, uint8_t* p) { + uint32_t u; + memcpy(&u, &value, 4); + StoreLE32(u, p); +} + +// The orientation may not be identity. 
+// TODO(lode): SIMDify where possible +template +void UndoOrientation(jxl::Orientation undo_orientation, const Plane& image, + Plane& out, jxl::ThreadPool* pool) { + const size_t xsize = image.xsize(); + const size_t ysize = image.ysize(); + + if (undo_orientation == Orientation::kFlipHorizontal) { + out = Plane(xsize, ysize); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + T* JXL_RESTRICT row_out = out.Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[xsize - x - 1] = row_in[x]; + } + }, + "UndoOrientation"); + } else if (undo_orientation == Orientation::kRotate180) { + out = Plane(xsize, ysize); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + T* JXL_RESTRICT row_out = out.Row(ysize - y - 1); + for (size_t x = 0; x < xsize; ++x) { + row_out[xsize - x - 1] = row_in[x]; + } + }, + "UndoOrientation"); + } else if (undo_orientation == Orientation::kFlipVertical) { + out = Plane(xsize, ysize); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + T* JXL_RESTRICT row_out = out.Row(ysize - y - 1); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row_in[x]; + } + }, + "UndoOrientation"); + } else if (undo_orientation == Orientation::kTranspose) { + out = Plane(ysize, xsize); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out.Row(x)[y] = row_in[x]; + } + }, + "UndoOrientation"); + } else if (undo_orientation == Orientation::kRotate90) { + out = Plane(ysize, xsize); + RunOnPool( + pool, 0, 
static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out.Row(x)[ysize - y - 1] = row_in[x]; + } + }, + "UndoOrientation"); + } else if (undo_orientation == Orientation::kAntiTranspose) { + out = Plane(ysize, xsize); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out.Row(xsize - x - 1)[ysize - y - 1] = row_in[x]; + } + }, + "UndoOrientation"); + } else if (undo_orientation == Orientation::kRotate270) { + out = Plane(ysize, xsize); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const int64_t y = task; + const T* JXL_RESTRICT row_in = image.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out.Row(xsize - x - 1)[y] = row_in[x]; + } + }, + "UndoOrientation"); + } +} +} // namespace + +HWY_EXPORT(FloatToU32); +HWY_EXPORT(FloatToF16); + +namespace { + +using StoreFuncType = void(uint32_t value, uint8_t* dest); +template +void StoreUintRow(uint32_t* JXL_RESTRICT* rows_u32, size_t num_channels, + size_t xsize, size_t bytes_per_sample, + uint8_t* JXL_RESTRICT out) { + for (size_t x = 0; x < xsize; ++x) { + for (size_t c = 0; c < num_channels; c++) { + StoreFunc(rows_u32[c][x], + out + (num_channels * x + c) * bytes_per_sample); + } + } +} + +template +void StoreFloatRow(const float* JXL_RESTRICT* rows_in, size_t num_channels, + size_t xsize, uint8_t* JXL_RESTRICT out) { + for (size_t x = 0; x < xsize; ++x) { + for (size_t c = 0; c < num_channels; c++) { + StoreFunc(rows_in[c][x], out + (num_channels * x + c) * sizeof(float)); + } + } +} + +void JXL_INLINE Store8(uint32_t value, uint8_t* dest) { *dest = value & 0xff; } + +} // namespace + +Status ConvertToExternal(const jxl::ImageBundle& ib, size_t 
bits_per_sample, + bool float_out, size_t num_channels, + JxlEndianness endianness, size_t stride, + jxl::ThreadPool* pool, void* out_image, + size_t out_size, JxlImageOutCallback out_callback, + void* out_opaque, jxl::Orientation undo_orientation) { + if (bits_per_sample < 1 || bits_per_sample > 32) { + return JXL_FAILURE("Invalid bits_per_sample value."); + } + if (!!out_image == !!out_callback) { + return JXL_FAILURE( + "Must provide either an out_image or an out_callback, but not both."); + } + // TODO(deymo): Implement 1-bit per pixel packed in 8 samples per byte. + if (bits_per_sample == 1) { + return JXL_FAILURE("packed 1-bit per sample is not yet supported"); + } + size_t xsize = ib.xsize(); + size_t ysize = ib.ysize(); + + bool want_alpha = num_channels == 2 || num_channels == 4; + size_t color_channels = num_channels <= 2 ? 1 : 3; + + // bytes_per_channel and is only valid for bits_per_sample > 1. + const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte); + const size_t bytes_per_pixel = num_channels * bytes_per_channel; + + const Image3F* color = &ib.color(); + Image3F temp_color, unpremul; + const ImageF* alpha = ib.HasAlpha() ? 
&ib.alpha() : nullptr; + ImageF temp_alpha; + + std::vector> row_out_callback; + auto InitOutCallback = [&](size_t num_threads) { + if (out_callback) { + row_out_callback.resize(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + row_out_callback[i].resize(stride); + } + } + }; + + if (ib.AlphaIsPremultiplied() && ib.HasAlpha()) { + unpremul = Image3F(color->xsize(), color->ysize()); + CopyImageTo(*color, &unpremul); + for (size_t y = 0; y < unpremul.ysize(); y++) { + UnpremultiplyAlpha(unpremul.PlaneRow(0, y), unpremul.PlaneRow(1, y), + unpremul.PlaneRow(2, y), alpha->Row(y), + unpremul.xsize()); + } + color = &unpremul; + } + if (undo_orientation != Orientation::kIdentity) { + Image3F transformed; + for (size_t c = 0; c < color_channels; ++c) { + UndoOrientation(undo_orientation, color->Plane(c), transformed.Plane(c), + pool); + } + transformed.Swap(temp_color); + color = &temp_color; + if (ib.HasAlpha()) { + UndoOrientation(undo_orientation, *alpha, temp_alpha, pool); + alpha = &temp_alpha; + } + + xsize = color->xsize(); + ysize = color->ysize(); + } + + if (stride < bytes_per_pixel * xsize) { + return JXL_FAILURE( + "stride is smaller than scanline width in bytes: %zu vs %zu", stride, + bytes_per_pixel * xsize); + } + + const bool little_endian = + endianness == JXL_LITTLE_ENDIAN || + (endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()); + + ImageF ones; + if (want_alpha && !ib.HasAlpha()) { + ones = ImageF(xsize, 1); + FillImage(1.0f, &ones); + } + + if (float_out) { + if (bits_per_sample == 16) { + bool swap_endianness = little_endian != IsLittleEndian(); + Plane f16_cache; + RunOnPool( + pool, 0, static_cast(ysize), + [&](size_t num_threads) { + f16_cache = + Plane(xsize, num_channels * num_threads); + InitOutCallback(num_threads); + return true; + }, + [&](const int task, int thread) { + const int64_t y = task; + const float* JXL_RESTRICT row_in[4]; + size_t c = 0; + for (; c < color_channels; c++) { + row_in[c] = color->PlaneRow(c, y); + } + if 
(want_alpha) { + row_in[c++] = ib.HasAlpha() ? alpha->Row(y) : ones.Row(0); + } + JXL_ASSERT(c == num_channels); + hwy::float16_t* JXL_RESTRICT row_f16[4]; + for (size_t r = 0; r < c; r++) { + row_f16[r] = f16_cache.Row(r + thread * num_channels); + HWY_DYNAMIC_DISPATCH(FloatToF16) + (row_in[r], row_f16[r], xsize); + } + uint8_t* row_out = + out_callback + ? row_out_callback[thread].data() + : &(reinterpret_cast(out_image))[stride * y]; + // interleave the one scanline + hwy::float16_t* row_f16_out = + reinterpret_cast(row_out); + for (size_t x = 0; x < xsize; x++) { + for (size_t r = 0; r < c; r++) { + row_f16_out[x * num_channels + r] = row_f16[r][x]; + } + } + if (swap_endianness) { + size_t size = xsize * num_channels * 2; + for (size_t i = 0; i < size; i += 2) { + std::swap(row_out[i + 0], row_out[i + 1]); + } + } + if (out_callback) { + (*out_callback)(out_opaque, 0, y, xsize, row_out); + } + }, + "ConvertF16"); + } else if (bits_per_sample == 32) { + RunOnPool( + pool, 0, static_cast(ysize), + [&](size_t num_threads) { + InitOutCallback(num_threads); + return true; + }, + [&](const int task, int thread) { + const int64_t y = task; + uint8_t* row_out = + out_callback + ? row_out_callback[thread].data() + : &(reinterpret_cast(out_image))[stride * y]; + const float* JXL_RESTRICT row_in[4]; + size_t c = 0; + for (; c < color_channels; c++) { + row_in[c] = color->PlaneRow(c, y); + } + if (want_alpha) { + row_in[c++] = ib.HasAlpha() ? alpha->Row(y) : ones.Row(0); + } + JXL_ASSERT(c == num_channels); + if (little_endian) { + StoreFloatRow(row_in, c, xsize, row_out); + } else { + StoreFloatRow(row_in, c, xsize, row_out); + } + if (out_callback) { + (*out_callback)(out_opaque, 0, y, xsize, row_out); + } + }, + "ConvertFloat"); + } else { + return JXL_FAILURE("float other than 16-bit and 32-bit not supported"); + } + } else { + // Multiplier to convert from floating point 0-1 range to the integer + // range. 
+ float mul = (1ull << bits_per_sample) - 1; + Plane u32_cache; + RunOnPool( + pool, 0, static_cast(ysize), + [&](size_t num_threads) { + u32_cache = Plane(xsize, num_channels * num_threads); + InitOutCallback(num_threads); + return true; + }, + [&](const int task, int thread) { + const int64_t y = task; + uint8_t* row_out = + out_callback + ? row_out_callback[thread].data() + : &(reinterpret_cast(out_image))[stride * y]; + const float* JXL_RESTRICT row_in[4]; + size_t c = 0; + for (; c < color_channels; c++) { + row_in[c] = color->PlaneRow(c, y); + } + if (want_alpha) { + row_in[c++] = ib.HasAlpha() ? alpha->Row(y) : ones.Row(0); + } + JXL_ASSERT(c == num_channels); + uint32_t* JXL_RESTRICT row_u32[4]; + for (size_t r = 0; r < c; r++) { + row_u32[r] = u32_cache.Row(r + thread * num_channels); + // row_u32[] is a per-thread temporary row storage, this isn't + // intended to be initialized on a previous run. + msan::PoisonMemory(row_u32[r], xsize * sizeof(row_u32[r][0])); + HWY_DYNAMIC_DISPATCH(FloatToU32) + (row_in[r], row_u32[r], xsize, mul, bits_per_sample); + } + // TODO(deymo): add bits_per_sample == 1 case here. 
+ if (bits_per_sample <= 8) { + StoreUintRow(row_u32, c, xsize, 1, row_out); + } else if (bits_per_sample <= 16) { + if (little_endian) { + StoreUintRow(row_u32, c, xsize, 2, row_out); + } else { + StoreUintRow(row_u32, c, xsize, 2, row_out); + } + } else if (bits_per_sample <= 24) { + if (little_endian) { + StoreUintRow(row_u32, c, xsize, 3, row_out); + } else { + StoreUintRow(row_u32, c, xsize, 3, row_out); + } + } else { + if (little_endian) { + StoreUintRow(row_u32, c, xsize, 4, row_out); + } else { + StoreUintRow(row_u32, c, xsize, 4, row_out); + } + } + if (out_callback) { + (*out_callback)(out_opaque, 0, y, xsize, row_out); + } + }, + "ConvertUint"); + } + + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.h new file mode 100644 index 0000000000..aed8764411 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image.h @@ -0,0 +1,45 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_EXTERNAL_IMAGE_H_ +#define LIB_JXL_DEC_EXTERNAL_IMAGE_H_ + +// Interleaved image for color transforms and Codec. + +#include +#include + +#include "jxl/decode.h" +#include "jxl/types.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Converts ib to interleaved void* pixel buffer with the given format. +// bits_per_sample: must be 8, 16 or 32, and must be 32 if float_out +// is true. 1 and 32 int are not yet implemented. +// num_channels: must be 1, 2, 3 or 4 for gray, gray+alpha, RGB, RGB+alpha. +// This supports the features needed for the C API and does not perform +// color space conversion. 
+// TODO(lode): support 1-bit output (bits_per_sample == 1) +// TODO(lode): support rectangle crop. +// stride_out is output scanline size in bytes, must be >= +// output_xsize * output_bytes_per_pixel. +// undo_orientation is an EXIF orientation to undo. Depending on the +// orientation, the output xsize and ysize are swapped compared to input +// xsize and ysize. +Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample, + bool float_out, size_t num_channels, + JxlEndianness endianness, size_t stride_out, + jxl::ThreadPool* thread_pool, void* out_image, + size_t out_size, JxlImageOutCallback out_callback, + void* out_opaque, jxl::Orientation undo_orientation); + +} // namespace jxl + +#endif // LIB_JXL_DEC_EXTERNAL_IMAGE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image_gbench.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image_gbench.cc new file mode 100644 index 0000000000..283a97529a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_external_image_gbench.cc @@ -0,0 +1,56 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark/benchmark.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { +namespace { + +// Decoder case, interleaves an internal float image. 
+void BM_DecExternalImage_ConvertImageRGBA(benchmark::State& state) { + const size_t kNumIter = 5; + size_t xsize = state.range(); + size_t ysize = state.range(); + size_t num_channels = 4; + + ImageMetadata im; + im.SetAlphaBits(8); + ImageBundle ib(&im); + Image3F color(xsize, ysize); + ZeroFillImage(&color); + ib.SetFromImage(std::move(color), ColorEncoding::SRGB()); + ImageF alpha(xsize, ysize); + ZeroFillImage(&alpha); + ib.SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false); + + const size_t bytes_per_row = xsize * num_channels; + std::vector interleaved(bytes_per_row * ysize); + + for (auto _ : state) { + for (size_t i = 0; i < kNumIter; ++i) { + JXL_CHECK(ConvertToExternal( + ib, + /*bits_per_sample=*/8, + /*float_out=*/false, num_channels, JXL_NATIVE_ENDIAN, + /*stride*/ bytes_per_row, + /*thread_pool=*/nullptr, interleaved.data(), interleaved.size(), + /*out_callback=*/nullptr, /*out_opaque=*/nullptr, + /*undo_orientation=*/jxl::Orientation::kIdentity)); + } + } + + // Pixels per second. + state.SetItemsProcessed(kNumIter * state.iterations() * xsize * ysize); + state.SetBytesProcessed(kNumIter * state.iterations() * interleaved.size()); +} + +BENCHMARK(BM_DecExternalImage_ConvertImageRGBA) + ->RangeMultiplier(2) + ->Range(256, 2048); + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc new file mode 100644 index 0000000000..2ee1f66ffd --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.cc @@ -0,0 +1,186 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/dec_file.h" + +#include + +#include +#include + +#include "jxl/decode.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/icc_codec.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" + +namespace jxl { +namespace { + +Status DecodeHeaders(BitReader* reader, CodecInOut* io) { + JXL_RETURN_IF_ERROR(ReadSizeHeader(reader, &io->metadata.size)); + + JXL_RETURN_IF_ERROR(ReadImageMetadata(reader, &io->metadata.m)); + + io->metadata.transform_data.nonserialized_xyb_encoded = + io->metadata.m.xyb_encoded; + JXL_RETURN_IF_ERROR(Bundle::Read(reader, &io->metadata.transform_data)); + + return true; +} + +} // namespace + +Status DecodePreview(const DecompressParams& dparams, + const CodecMetadata& metadata, + BitReader* JXL_RESTRICT reader, ThreadPool* pool, + ImageBundle* JXL_RESTRICT preview, uint64_t* dec_pixels, + const SizeConstraints* constraints) { + // No preview present in file. + if (!metadata.m.have_preview) { + if (dparams.preview == Override::kOn) { + return JXL_FAILURE("preview == kOn but no preview present"); + } + return true; + } + + // Have preview; prepare to skip or read it. + JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary()); + + if (dparams.preview == Override::kOff) { + JXL_RETURN_IF_ERROR(SkipFrame(metadata, reader, /*is_preview=*/true)); + return true; + } + + // Else: default or kOn => decode preview. 
+ PassesDecoderState dec_state; + JXL_RETURN_IF_ERROR(dec_state.output_encoding_info.Set( + metadata, ColorEncoding::LinearSRGB(metadata.m.color_encoding.IsGray()))); + JXL_RETURN_IF_ERROR(DecodeFrame(dparams, &dec_state, pool, reader, preview, + metadata, constraints, + /*is_preview=*/true)); + if (dec_pixels) { + *dec_pixels += dec_state.shared->frame_dim.xsize_upsampled * + dec_state.shared->frame_dim.ysize_upsampled; + } + return true; +} + +// To avoid the complexity of file I/O and buffering, we assume the bitstream +// is loaded (or for large images/sequences: mapped into) memory. +Status DecodeFile(const DecompressParams& dparams, + const Span file, CodecInOut* JXL_RESTRICT io, + ThreadPool* pool) { + PROFILER_ZONE("DecodeFile uninstrumented"); + + // Marker + JxlSignature signature = JxlSignatureCheck(file.data(), file.size()); + if (signature == JXL_SIG_NOT_ENOUGH_BYTES || signature == JXL_SIG_INVALID) { + return JXL_FAILURE("File does not start with known JPEG XL signature"); + } + + std::unique_ptr jpeg_data = nullptr; + if (dparams.keep_dct) { + if (io->Main().jpeg_data == nullptr) { + return JXL_FAILURE("Caller must set jpeg_data"); + } + jpeg_data = std::move(io->Main().jpeg_data); + } + + Status ret = true; + { + BitReader reader(file); + BitReaderScopedCloser reader_closer(&reader, &ret); + (void)reader.ReadFixedBits<16>(); // skip marker + + { + JXL_RETURN_IF_ERROR(DecodeHeaders(&reader, io)); + size_t xsize = io->metadata.xsize(); + size_t ysize = io->metadata.ysize(); + JXL_RETURN_IF_ERROR(VerifyDimensions(&io->constraints, xsize, ysize)); + } + + if (io->metadata.m.color_encoding.WantICC()) { + PaddedBytes icc; + JXL_RETURN_IF_ERROR(ReadICC(&reader, &icc)); + JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetICC(std::move(icc))); + } + // Set ICC profile in jpeg_data. 
+ if (jpeg_data) { + Status res = jpeg::SetJPEGDataFromICC(io->metadata.m.color_encoding.ICC(), + jpeg_data.get()); + if (!res) { + return res; + } + } + + JXL_RETURN_IF_ERROR(DecodePreview(dparams, io->metadata, &reader, pool, + &io->preview_frame, &io->dec_pixels, + &io->constraints)); + + // Only necessary if no ICC and no preview. + JXL_RETURN_IF_ERROR(reader.JumpToByteBoundary()); + if (io->metadata.m.have_animation && dparams.keep_dct) { + return JXL_FAILURE("Cannot decode to JPEG an animation"); + } + + PassesDecoderState dec_state; + JXL_RETURN_IF_ERROR(dec_state.output_encoding_info.Set( + io->metadata, + ColorEncoding::LinearSRGB(io->metadata.m.color_encoding.IsGray()))); + + io->frames.clear(); + Status dec_ok(false); + do { + io->frames.emplace_back(&io->metadata.m); + if (jpeg_data) { + io->frames.back().jpeg_data = std::move(jpeg_data); + } + // Skip frames that are not displayed. + do { + dec_ok = + DecodeFrame(dparams, &dec_state, pool, &reader, &io->frames.back(), + io->metadata, &io->constraints); + if (!dparams.allow_partial_files) { + JXL_RETURN_IF_ERROR(dec_ok); + } else if (!dec_ok) { + io->frames.pop_back(); + break; + } + } while (dec_state.shared->frame_header.frame_type != + FrameType::kRegularFrame && + dec_state.shared->frame_header.frame_type != + FrameType::kSkipProgressive); + io->dec_pixels += io->frames.back().xsize() * io->frames.back().ysize(); + } while (!dec_state.shared->frame_header.is_last && dec_ok); + + if (io->frames.empty()) return JXL_FAILURE("Not enough data."); + + if (dparams.check_decompressed_size && !dparams.allow_partial_files && + dparams.max_downsampling == 1) { + if (reader.TotalBitsConsumed() != file.size() * kBitsPerByte) { + return JXL_FAILURE("DecodeFile reader position not at EOF."); + } + } + // Suppress errors when decoding partial files with DC frames. 
+ if (!reader.AllReadsWithinBounds() && dparams.allow_partial_files) { + reader_closer.CloseAndSuppressError(); + } + + io->CheckMetadata(); + // reader is closed here. + } + return ret; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.h new file mode 100644 index 0000000000..cd04d5d4c7 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_file.h @@ -0,0 +1,48 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_FILE_H_ +#define LIB_JXL_DEC_FILE_H_ + +// Top-level interface for JXL decoding. + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/dec_params.h" + +namespace jxl { + +// Decodes the preview image, if present, and stores it in `preview`. +// Must be the first frame in the file. Does nothing if there is no preview +// frame present according to the metadata. +Status DecodePreview(const DecompressParams& dparams, + const CodecMetadata& metadata, + BitReader* JXL_RESTRICT reader, ThreadPool* pool, + ImageBundle* JXL_RESTRICT preview, uint64_t* dec_pixels, + const SizeConstraints* constraints); + +// Implementation detail: currently decodes to linear sRGB. The contract is: +// `io` appears 'identical' (modulo compression artifacts) to the encoder input +// in a color-aware viewer. Note that `io->metadata.m.color_encoding` +// identifies the color space that was passed to the encoder; clients that want +// that same encoding must call `io->TransformTo` afterwards. 
+Status DecodeFile(const DecompressParams& params, + const Span file, CodecInOut* io, + ThreadPool* pool = nullptr); + +static inline Status DecodeFile(const DecompressParams& params, + const PaddedBytes& file, CodecInOut* io, + ThreadPool* pool = nullptr) { + return DecodeFile(params, Span(file), io, pool); +} + +} // namespace jxl + +#endif // LIB_JXL_DEC_FILE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc new file mode 100644 index 0000000000..39da487dd4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.cc @@ -0,0 +1,1010 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_group.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/dec_reconstruct.h" +#include "lib/jxl/dec_upsample.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/filters.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" 
+#include "lib/jxl/jpeg/jpeg_data.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/luminance.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +namespace { +Status DecodeGlobalDCInfo(BitReader* reader, bool is_jpeg, + PassesDecoderState* state, ThreadPool* pool) { + PROFILER_FUNC; + JXL_RETURN_IF_ERROR(state->shared_storage.quantizer.Decode(reader)); + + JXL_RETURN_IF_ERROR( + DecodeBlockCtxMap(reader, &state->shared_storage.block_ctx_map)); + + JXL_RETURN_IF_ERROR(state->shared_storage.cmap.DecodeDC(reader)); + + // Pre-compute info for decoding a group. + if (is_jpeg) { + state->shared_storage.quantizer.ClearDCMul(); // Don't dequant DC + } + + state->shared_storage.ac_strategy.FillInvalid(); + return true; +} +} // namespace + +Status DecodeFrameHeader(BitReader* JXL_RESTRICT reader, + FrameHeader* JXL_RESTRICT frame_header) { + JXL_ASSERT(frame_header->nonserialized_metadata != nullptr); + JXL_RETURN_IF_ERROR(ReadFrameHeader(reader, frame_header)); + return true; +} + +Status SkipFrame(const CodecMetadata& metadata, BitReader* JXL_RESTRICT reader, + bool is_preview) { + FrameHeader header(&metadata); + header.nonserialized_is_preview = is_preview; + JXL_RETURN_IF_ERROR(DecodeFrameHeader(reader, &header)); + + // Read TOC. + std::vector group_offsets; + std::vector group_sizes; + uint64_t groups_total_size; + const bool has_ac_global = true; + const FrameDimensions frame_dim = header.ToFrameDimensions(); + const size_t toc_entries = + NumTocEntries(frame_dim.num_groups, frame_dim.num_dc_groups, + header.passes.num_passes, has_ac_global); + JXL_RETURN_IF_ERROR(ReadGroupOffsets(toc_entries, reader, &group_offsets, + &group_sizes, &groups_total_size)); + + // Pretend all groups are read. 
+ reader->SkipBits(groups_total_size * kBitsPerByte); + if (reader->TotalBitsConsumed() > reader->TotalBytes() * kBitsPerByte) { + return JXL_FAILURE("Group code extends after stream end"); + } + + return true; +} + +static BitReader* GetReaderForSection( + size_t num_groups, size_t num_passes, size_t group_codes_begin, + const std::vector& group_offsets, + const std::vector& group_sizes, BitReader* JXL_RESTRICT reader, + BitReader* JXL_RESTRICT store, size_t index) { + if (num_groups == 1 && num_passes == 1) return reader; + const size_t group_offset = group_codes_begin + group_offsets[index]; + const size_t next_group_offset = + group_codes_begin + group_offsets[index] + group_sizes[index]; + // The order of these variables must be: + // group_codes_begin <= group_offset <= next_group_offset <= file.size() + JXL_DASSERT(group_codes_begin <= group_offset); + JXL_DASSERT(group_offset <= next_group_offset); + JXL_DASSERT(next_group_offset <= reader->TotalBytes()); + const size_t group_size = next_group_offset - group_offset; + const size_t remaining_size = reader->TotalBytes() - group_offset; + const size_t size = std::min(group_size + 8, remaining_size); + *store = + BitReader(Span(reader->FirstByte() + group_offset, size)); + return store; +} + +Status DecodeFrame(const DecompressParams& dparams, + PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool, + BitReader* JXL_RESTRICT reader, ImageBundle* decoded, + const CodecMetadata& metadata, + const SizeConstraints* constraints, bool is_preview) { + PROFILER_ZONE("DecodeFrame uninstrumented"); + + FrameDecoder frame_decoder(dec_state, metadata, pool); + + frame_decoder.SetFrameSizeLimits(constraints); + + JXL_RETURN_IF_ERROR(frame_decoder.InitFrame( + reader, decoded, is_preview, dparams.allow_partial_files, + dparams.allow_partial_files && dparams.allow_more_progressive_steps)); + + // Handling of progressive decoding. 
+ { + const FrameHeader& frame_header = frame_decoder.GetFrameHeader(); + size_t max_passes = dparams.max_passes; + size_t max_downsampling = std::max( + dparams.max_downsampling >> (frame_header.dc_level * 3), size_t(1)); + // TODO(veluca): deal with downsamplings >= 8. + if (max_downsampling >= 8) { + max_passes = 0; + } else { + for (uint32_t i = 0; i < frame_header.passes.num_downsample; ++i) { + if (max_downsampling >= frame_header.passes.downsample[i] && + max_passes > frame_header.passes.last_pass[i]) { + max_passes = frame_header.passes.last_pass[i] + 1; + } + } + } + // Do not use downsampling for kReferenceOnly frames. + if (frame_header.frame_type == FrameType::kReferenceOnly) { + max_passes = frame_header.passes.num_passes; + } + max_passes = std::min(max_passes, frame_header.passes.num_passes); + frame_decoder.SetMaxPasses(max_passes); + } + frame_decoder.SetRenderSpotcolors(dparams.render_spotcolors); + + size_t processed_bytes = reader->TotalBitsConsumed() / kBitsPerByte; + + Status close_ok = true; + std::vector> section_readers; + { + std::vector> section_closers; + std::vector section_info; + std::vector section_status; + size_t bytes_to_skip = 0; + for (size_t i = 0; i < frame_decoder.NumSections(); i++) { + size_t b = frame_decoder.SectionOffsets()[i]; + size_t e = b + frame_decoder.SectionSizes()[i]; + bytes_to_skip += e - b; + size_t pos = reader->TotalBitsConsumed() / kBitsPerByte; + if (pos + e <= reader->TotalBytes()) { + auto br = make_unique( + Span(reader->FirstByte() + b + pos, e - b)); + section_info.emplace_back(FrameDecoder::SectionInfo{br.get(), i}); + section_closers.emplace_back( + make_unique(br.get(), &close_ok)); + section_readers.emplace_back(std::move(br)); + } else if (!dparams.allow_partial_files) { + return JXL_FAILURE("Premature end of stream."); + } + } + // Skip over the to-be-decoded sections. 
+ reader->SkipBits(kBitsPerByte * bytes_to_skip); + section_status.resize(section_info.size()); + + JXL_RETURN_IF_ERROR(frame_decoder.ProcessSections( + section_info.data(), section_info.size(), section_status.data())); + + for (size_t i = 0; i < section_status.size(); i++) { + auto s = section_status[i]; + if (s == FrameDecoder::kDone) { + processed_bytes += frame_decoder.SectionSizes()[i]; + continue; + } + if (dparams.allow_more_progressive_steps && s == FrameDecoder::kPartial) { + continue; + } + if (dparams.max_downsampling > 1 && s == FrameDecoder::kSkipped) { + continue; + } + return JXL_FAILURE("Invalid section %zu status: %d", section_info[i].id, + s); + } + } + + JXL_RETURN_IF_ERROR(close_ok); + + JXL_RETURN_IF_ERROR(frame_decoder.FinalizeFrame()); + decoded->SetDecodedBytes(processed_bytes); + return true; +} + +Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded, + bool is_preview, bool allow_partial_frames, + bool allow_partial_dc_global) { + PROFILER_FUNC; + decoded_ = decoded; + JXL_ASSERT(is_finalized_); + + allow_partial_frames_ = allow_partial_frames; + allow_partial_dc_global_ = allow_partial_dc_global; + + // Reset the dequantization matrices to their default values. + dec_state_->shared_storage.matrices = DequantMatrices(); + + frame_header_.nonserialized_is_preview = is_preview; + JXL_RETURN_IF_ERROR(DecodeFrameHeader(br, &frame_header_)); + frame_dim_ = frame_header_.ToFrameDimensions(); + + const size_t num_passes = frame_header_.passes.num_passes; + const size_t xsize = frame_dim_.xsize; + const size_t ysize = frame_dim_.ysize; + const size_t num_groups = frame_dim_.num_groups; + + // Check validity of frame dimensions. + JXL_RETURN_IF_ERROR(VerifyDimensions(constraints_, xsize, ysize)); + + // If the previous frame was not a kRegularFrame, `decoded` may have different + // dimensions; must reset to avoid errors. + decoded->RemoveColor(); + decoded->ClearExtraChannels(); + + // Read TOC. 
+ uint64_t groups_total_size; + const bool has_ac_global = true; + const size_t toc_entries = NumTocEntries(num_groups, frame_dim_.num_dc_groups, + num_passes, has_ac_global); + JXL_RETURN_IF_ERROR(ReadGroupOffsets(toc_entries, br, §ion_offsets_, + §ion_sizes_, &groups_total_size)); + + JXL_DASSERT((br->TotalBitsConsumed() % kBitsPerByte) == 0); + const size_t group_codes_begin = br->TotalBitsConsumed() / kBitsPerByte; + JXL_DASSERT(!section_offsets_.empty()); + + // Overflow check. + if (group_codes_begin + groups_total_size < group_codes_begin) { + return JXL_FAILURE("Invalid group codes"); + } + + if (!frame_header_.chroma_subsampling.Is444() && + !(frame_header_.flags & FrameHeader::kSkipAdaptiveDCSmoothing) && + frame_header_.encoding == FrameEncoding::kVarDCT) { + return JXL_FAILURE( + "Non-444 chroma subsampling is not allowed when adaptive DC " + "smoothing is enabled"); + } + JXL_RETURN_IF_ERROR( + InitializePassesSharedState(frame_header_, &dec_state_->shared_storage)); + JXL_RETURN_IF_ERROR(dec_state_->Init()); + modular_frame_decoder_.Init(frame_dim_); + + if (decoded->IsJPEG()) { + if (frame_header_.encoding == FrameEncoding::kModular) { + return JXL_FAILURE("Cannot output JPEG from Modular"); + } + jpeg::JPEGData* jpeg_data = decoded->jpeg_data.get(); + size_t num_components = jpeg_data->components.size(); + if (num_components != 1 && num_components != 3) { + return JXL_FAILURE("Invalid number of components"); + } + if (frame_header_.nonserialized_metadata->m.xyb_encoded) { + return JXL_FAILURE("Cannot decode to JPEG an XYB image"); + } + auto jpeg_c_map = JpegOrder(ColorTransform::kYCbCr, num_components == 1); + decoded->jpeg_data->width = frame_dim_.xsize; + decoded->jpeg_data->height = frame_dim_.ysize; + for (size_t c = 0; c < num_components; c++) { + auto& component = jpeg_data->components[jpeg_c_map[c]]; + component.width_in_blocks = + frame_dim_.xsize_blocks >> frame_header_.chroma_subsampling.HShift(c); + component.height_in_blocks = + 
frame_dim_.ysize_blocks >> frame_header_.chroma_subsampling.VShift(c); + component.h_samp_factor = + 1 << frame_header_.chroma_subsampling.RawHShift(c); + component.v_samp_factor = + 1 << frame_header_.chroma_subsampling.RawVShift(c); + component.coeffs.resize(component.width_in_blocks * + component.height_in_blocks * jxl::kDCTBlockSize); + } + } + + // Clear the state. + decoded_dc_global_ = false; + decoded_ac_global_ = false; + is_finalized_ = false; + finalized_dc_ = false; + decoded_dc_groups_.clear(); + decoded_dc_groups_.resize(frame_dim_.num_dc_groups); + decoded_passes_per_ac_group_.clear(); + decoded_passes_per_ac_group_.resize(frame_dim_.num_groups, 0); + processed_section_.clear(); + processed_section_.resize(section_offsets_.size()); + max_passes_ = frame_header_.passes.num_passes; + num_renders_ = 0; + + return true; +} + +Status FrameDecoder::ProcessDCGlobal(BitReader* br) { + PROFILER_FUNC; + PassesSharedState& shared = dec_state_->shared_storage; + if (shared.frame_header.flags & FrameHeader::kPatches) { + bool uses_extra_channels = false; + JXL_RETURN_IF_ERROR(shared.image_features.patches.Decode( + br, frame_dim_.xsize_padded, frame_dim_.ysize_padded, + &uses_extra_channels)); + if (uses_extra_channels && frame_header_.upsampling != 1) { + for (size_t ecups : frame_header_.extra_channel_upsampling) { + if (ecups != frame_header_.upsampling) { + return JXL_FAILURE( + "Cannot use extra channels in patches if color channels are " + "subsampled differently from extra channels"); + } + } + } + } else { + shared.image_features.patches.Clear(); + } + if (shared.frame_header.flags & FrameHeader::kSplines) { + JXL_RETURN_IF_ERROR(shared.image_features.splines.Decode( + br, frame_dim_.xsize * frame_dim_.ysize)); + } + if (shared.frame_header.flags & FrameHeader::kNoise) { + JXL_RETURN_IF_ERROR(DecodeNoise(br, &shared.image_features.noise_params)); + } + + JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.DecodeDC(br)); + if (frame_header_.encoding == 
FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR( + jxl::DecodeGlobalDCInfo(br, decoded_->IsJPEG(), dec_state_, pool_)); + } + Status dec_status = modular_frame_decoder_.DecodeGlobalInfo( + br, frame_header_, allow_partial_dc_global_); + if (dec_status.IsFatalError()) return dec_status; + if (dec_status) { + decoded_dc_global_ = true; + } + return dec_status; +} + +Status FrameDecoder::ProcessDCGroup(size_t dc_group_id, BitReader* br) { + PROFILER_FUNC; + const size_t gx = dc_group_id % frame_dim_.xsize_dc_groups; + const size_t gy = dc_group_id / frame_dim_.xsize_dc_groups; + const LoopFilter& lf = dec_state_->shared->frame_header.loop_filter; + if (frame_header_.encoding == FrameEncoding::kVarDCT && + !(frame_header_.flags & FrameHeader::kUseDcFrame)) { + JXL_RETURN_IF_ERROR( + modular_frame_decoder_.DecodeVarDCTDC(dc_group_id, br, dec_state_)); + } + const Rect mrect(gx * frame_dim_.dc_group_dim, gy * frame_dim_.dc_group_dim, + frame_dim_.dc_group_dim, frame_dim_.dc_group_dim); + JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup( + mrect, br, 3, 1000, ModularStreamId::ModularDC(dc_group_id), + /*zerofill=*/false)); + if (frame_header_.encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR( + modular_frame_decoder_.DecodeAcMetadata(dc_group_id, br, dec_state_)); + } else if (lf.epf_iters > 0) { + FillImage(kInvSigmaNum / lf.epf_sigma_for_modular, + &dec_state_->filter_weights.sigma); + } + decoded_dc_groups_[dc_group_id] = true; + return true; +} + +void FrameDecoder::FinalizeDC() { + // Do Adaptive DC smoothing if enabled. This *must* happen between all the + // ProcessDCGroup and ProcessACGroup. 
+ if (frame_header_.encoding == FrameEncoding::kVarDCT && + !(frame_header_.flags & FrameHeader::kSkipAdaptiveDCSmoothing) && + !(frame_header_.flags & FrameHeader::kUseDcFrame)) { + AdaptiveDCSmoothing(dec_state_->shared->quantizer.MulDC(), + &dec_state_->shared_storage.dc_storage, pool_); + } + + finalized_dc_ = true; +} + +void FrameDecoder::AllocateOutput() { + const CodecMetadata& metadata = *frame_header_.nonserialized_metadata; + if (dec_state_->rgb_output == nullptr && !dec_state_->pixel_callback) { + decoded_->SetFromImage(Image3F(frame_dim_.xsize_upsampled_padded, + frame_dim_.ysize_upsampled_padded), + dec_state_->output_encoding_info.color_encoding); + } + dec_state_->extra_channels.clear(); + if (metadata.m.num_extra_channels > 0) { + for (size_t i = 0; i < metadata.m.num_extra_channels; i++) { + uint32_t ecups = frame_header_.extra_channel_upsampling[i]; + dec_state_->extra_channels.emplace_back( + DivCeil(frame_dim_.xsize_upsampled_padded, ecups), + DivCeil(frame_dim_.ysize_upsampled_padded, ecups)); +#if MEMORY_SANITIZER + // Avoid errors due to loading vectors on the outermost padding. + for (size_t y = 0; y < DivCeil(frame_dim_.ysize_upsampled_padded, ecups); + y++) { + for (size_t x = DivCeil(frame_dim_.xsize_upsampled, ecups); + x < DivCeil(frame_dim_.xsize_upsampled_padded, ecups); x++) { + dec_state_->extra_channels.back().Row(y)[x] = + msan::kSanitizerSentinel; + } + } +#endif + } + } + decoded_->origin = dec_state_->shared->frame_header.frame_origin; +} + +Status FrameDecoder::ProcessACGlobal(BitReader* br) { + JXL_CHECK(finalized_dc_); + JXL_CHECK(decoded_->HasColor() || dec_state_->rgb_output != nullptr || + !!dec_state_->pixel_callback); + + // Decode AC group. 
+ if (frame_header_.encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.Decode( + br, &modular_frame_decoder_)); + + size_t num_histo_bits = + CeilLog2Nonzero(dec_state_->shared->frame_dim.num_groups); + dec_state_->shared_storage.num_histograms = + 1 + br->ReadBits(num_histo_bits); + + dec_state_->code.resize(kMaxNumPasses); + dec_state_->context_map.resize(kMaxNumPasses); + // Read coefficient orders and histograms. + size_t max_num_bits_ac = 0; + for (size_t i = 0; + i < dec_state_->shared_storage.frame_header.passes.num_passes; i++) { + uint16_t used_orders = U32Coder::Read(kOrderEnc, br); + JXL_RETURN_IF_ERROR(DecodeCoeffOrders( + used_orders, dec_state_->used_acs, + &dec_state_->shared_storage + .coeff_orders[i * dec_state_->shared_storage.coeff_order_size], + br)); + size_t num_contexts = + dec_state_->shared->num_histograms * + dec_state_->shared_storage.block_ctx_map.NumACContexts(); + JXL_RETURN_IF_ERROR(DecodeHistograms( + br, num_contexts, &dec_state_->code[i], &dec_state_->context_map[i])); + // Add extra values to enable the cheat in hot loop of DecodeACVarBlock. + dec_state_->context_map[i].resize( + num_contexts + kZeroDensityContextLimit - kZeroDensityContextCount); + max_num_bits_ac = + std::max(max_num_bits_ac, dec_state_->code[i].max_num_bits); + } + max_num_bits_ac += CeilLog2Nonzero( + dec_state_->shared_storage.frame_header.passes.num_passes); + // 16-bit buffer for decoding to JPEG are not implemented. + // TODO(veluca): figure out the exact limit - 16 should still work with + // 16-bit buffers, but we are excluding it for safety. + bool use_16_bit = max_num_bits_ac < 16 && !decoded_->IsJPEG(); + bool store = frame_header_.passes.num_passes > 1; + size_t xs = store ? kGroupDim * kGroupDim : 0; + size_t ys = store ? 
frame_dim_.num_groups : 0; + if (use_16_bit) { + dec_state_->coefficients = make_unique>(xs, ys); + } else { + dec_state_->coefficients = make_unique>(xs, ys); + } + if (store) { + dec_state_->coefficients->ZeroFill(); + } + } + + // Set JPEG decoding data. + if (decoded_->IsJPEG()) { + decoded_->color_transform = frame_header_.color_transform; + decoded_->chroma_subsampling = frame_header_.chroma_subsampling; + const std::vector& qe = + dec_state_->shared_storage.matrices.encodings(); + if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW || + std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) { + return JXL_FAILURE( + "Quantization table is not a JPEG quantization table."); + } + jpeg::JPEGData* jpeg_data = decoded_->jpeg_data.get(); + size_t num_components = jpeg_data->components.size(); + bool is_gray = (num_components == 1); + auto jpeg_c_map = JpegOrder(frame_header_.color_transform, is_gray); + for (size_t c = 0; c < num_components; c++) { + // TODO(eustas): why 1-st quant table for gray? + size_t quant_c = is_gray ? 1 : c; + size_t qpos = jpeg_data->components[jpeg_c_map[c]].quant_idx; + JXL_CHECK(qpos != jpeg_data->quant.size()); + for (size_t x = 0; x < 8; x++) { + for (size_t y = 0; y < 8; y++) { + jpeg_data->quant[qpos].values[x * 8 + y] = + (*qe[0].qraw.qtable)[quant_c * 64 + y * 8 + x]; + } + } + } + } + // Set memory buffer for pre-color-transform frame, if needed. + if (frame_header_.needs_color_transform() && + frame_header_.save_before_color_transform) { + dec_state_->pre_color_transform_frame = + Image3F(frame_dim_.xsize_upsampled, frame_dim_.ysize_upsampled); + } else { + // clear pre_color_transform_frame to ensure that previously moved-from + // images are not used. 
+ dec_state_->pre_color_transform_frame = Image3F(); + } + decoded_ac_global_ = true; + return true; +} + +Status FrameDecoder::ProcessACGroup(size_t ac_group_id, + BitReader* JXL_RESTRICT* br, + size_t num_passes, size_t thread, + bool force_draw, bool dc_only) { + PROFILER_ZONE("process_group"); + const size_t gx = ac_group_id % frame_dim_.xsize_groups; + const size_t gy = ac_group_id / frame_dim_.xsize_groups; + const size_t x = gx * frame_dim_.group_dim; + const size_t y = gy * frame_dim_.group_dim; + + if (frame_header_.encoding == FrameEncoding::kVarDCT) { + group_dec_caches_[thread].InitOnce(frame_header_.passes.num_passes, + dec_state_->used_acs); + JXL_RETURN_IF_ERROR(DecodeGroup( + br, num_passes, ac_group_id, dec_state_, &group_dec_caches_[thread], + thread, decoded_, decoded_passes_per_ac_group_[ac_group_id], force_draw, + dc_only)); + } + + // don't limit to image dimensions here (is done in DecodeGroup) + const Rect mrect(x, y, frame_dim_.group_dim, frame_dim_.group_dim); + for (size_t i = 0; i < frame_header_.passes.num_passes; i++) { + int minShift, maxShift; + frame_header_.passes.GetDownsamplingBracket(i, minShift, maxShift); + if (i >= decoded_passes_per_ac_group_[ac_group_id] && + i < decoded_passes_per_ac_group_[ac_group_id] + num_passes) { + JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup( + mrect, br[i - decoded_passes_per_ac_group_[ac_group_id]], minShift, + maxShift, ModularStreamId::ModularAC(ac_group_id, i), + /*zerofill=*/false)); + } else if (i >= decoded_passes_per_ac_group_[ac_group_id] + num_passes && + force_draw) { + JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup( + mrect, nullptr, minShift, maxShift, + ModularStreamId::ModularAC(ac_group_id, i), /*zerofill=*/true)); + } + } + decoded_passes_per_ac_group_[ac_group_id] += num_passes; + return true; +} + +Status FrameDecoder::ProcessSections(const SectionInfo* sections, size_t num, + SectionStatus* section_status) { + if (num == 0) return true; // Nothing to process + 
std::fill(section_status, section_status + num, SectionStatus::kSkipped); + size_t dc_global_sec = num; + size_t ac_global_sec = num; + std::vector dc_group_sec(frame_dim_.num_dc_groups, num); + std::vector> ac_group_sec( + frame_dim_.num_groups, + std::vector(frame_header_.passes.num_passes, num)); + std::vector num_ac_passes(frame_dim_.num_groups); + if (frame_dim_.num_groups == 1 && frame_header_.passes.num_passes == 1) { + JXL_ASSERT(num == 1); + JXL_ASSERT(sections[0].id == 0); + if (processed_section_[0] == false) { + processed_section_[0] = true; + ac_group_sec[0].resize(1); + dc_global_sec = ac_global_sec = dc_group_sec[0] = ac_group_sec[0][0] = 0; + num_ac_passes[0] = 1; + } else { + section_status[0] = SectionStatus::kDuplicate; + } + } else { + size_t ac_global_index = frame_dim_.num_dc_groups + 1; + for (size_t i = 0; i < num; i++) { + JXL_ASSERT(sections[i].id < processed_section_.size()); + if (processed_section_[sections[i].id]) { + section_status[i] = SectionStatus::kDuplicate; + continue; + } + if (sections[i].id == 0) { + dc_global_sec = i; + } else if (sections[i].id < ac_global_index) { + dc_group_sec[sections[i].id - 1] = i; + } else if (sections[i].id == ac_global_index) { + ac_global_sec = i; + } else { + size_t ac_idx = sections[i].id - ac_global_index - 1; + size_t acg = ac_idx % frame_dim_.num_groups; + size_t acp = ac_idx / frame_dim_.num_groups; + if (acp >= frame_header_.passes.num_passes) { + return JXL_FAILURE("Invalid section ID"); + } + if (acp >= max_passes_) { + continue; + } + ac_group_sec[acg][acp] = i; + } + processed_section_[sections[i].id] = true; + } + // Count number of new passes per group. 
+ for (size_t g = 0; g < ac_group_sec.size(); g++) { + size_t j = 0; + for (; j + decoded_passes_per_ac_group_[g] < max_passes_; j++) { + if (ac_group_sec[g][j + decoded_passes_per_ac_group_[g]] == num) { + break; + } + } + num_ac_passes[g] = j; + } + } + if (dc_global_sec != num) { + Status dc_global_status = ProcessDCGlobal(sections[dc_global_sec].br); + if (dc_global_status.IsFatalError()) return dc_global_status; + if (dc_global_status) { + section_status[dc_global_sec] = SectionStatus::kDone; + } else { + section_status[dc_global_sec] = SectionStatus::kPartial; + } + } + + std::atomic has_error{false}; + if (decoded_dc_global_) { + RunOnPool( + pool_, 0, dc_group_sec.size(), ThreadPool::SkipInit(), + [this, &dc_group_sec, &num, §ions, §ion_status, &has_error]( + size_t i, size_t thread) { + if (dc_group_sec[i] != num) { + if (!ProcessDCGroup(i, sections[dc_group_sec[i]].br)) { + has_error = true; + } else { + section_status[dc_group_sec[i]] = SectionStatus::kDone; + } + } + }, + "DecodeDCGroup"); + } + if (has_error) return JXL_FAILURE("Error in DC group"); + + if (*std::min_element(decoded_dc_groups_.begin(), decoded_dc_groups_.end()) == + true && + !finalized_dc_) { + FinalizeDC(); + AllocateOutput(); + } + + if (finalized_dc_) dec_state_->EnsureBordersStorage(); + if (finalized_dc_ && ac_global_sec != num && !decoded_ac_global_) { + dec_state_->InitForAC(pool_); + JXL_RETURN_IF_ERROR(ProcessACGlobal(sections[ac_global_sec].br)); + section_status[ac_global_sec] = SectionStatus::kDone; + } + + if (decoded_ac_global_) { + // The decoded image requires padding for filtering. ProcessACGlobal added + // the padding, however when Flush is used, the image is shrunk to the + // output size. Add the padding back here. This is a cheap operation + // since the image has the original allocated size. The memory and original + // size are already there, but for safety we require the indicated xsize and + // ysize dimensions match the working area, see PlaneRowBoundsCheck. 
+ decoded_->ShrinkTo(frame_dim_.xsize_upsampled_padded, + frame_dim_.ysize_upsampled_padded); + + // Mark all the AC groups that we received as not complete yet. + for (size_t i = 0; i < ac_group_sec.size(); i++) { + if (num_ac_passes[i] == 0) continue; + dec_state_->group_border_assigner.ClearDone(i); + } + + RunOnPool( + pool_, 0, ac_group_sec.size(), + [this](size_t num_threads) { + PrepareStorage(num_threads, decoded_passes_per_ac_group_.size()); + return true; + }, + [this, &ac_group_sec, &num_ac_passes, &num, §ions, §ion_status, + &has_error](size_t g, size_t thread) { + if (num_ac_passes[g] == 0) { // no new AC pass, nothing to do. + return; + } + (void)num; + size_t first_pass = decoded_passes_per_ac_group_[g]; + BitReader* JXL_RESTRICT readers[kMaxNumPasses]; + for (size_t i = 0; i < num_ac_passes[g]; i++) { + JXL_ASSERT(ac_group_sec[g][first_pass + i] != num); + readers[i] = sections[ac_group_sec[g][first_pass + i]].br; + } + if (!ProcessACGroup(g, readers, num_ac_passes[g], + GetStorageLocation(thread, g), + /*force_draw=*/false, /*dc_only=*/false)) { + has_error = true; + } else { + for (size_t i = 0; i < num_ac_passes[g]; i++) { + section_status[ac_group_sec[g][first_pass + i]] = + SectionStatus::kDone; + } + } + }, + "DecodeGroup"); + } + if (has_error) return JXL_FAILURE("Error in AC group"); + + for (size_t i = 0; i < num; i++) { + if (section_status[i] == SectionStatus::kSkipped || + section_status[i] == SectionStatus::kPartial) { + processed_section_[sections[i].id] = false; + } + } + return true; +} + +Status FrameDecoder::Flush() { + bool has_blending = frame_header_.blending_info.mode != BlendMode::kReplace || + frame_header_.custom_size_or_origin; + for (const auto& blending_info_ec : + frame_header_.extra_channel_blending_info) { + if (blending_info_ec.mode != BlendMode::kReplace) has_blending = true; + } + // No early Flush() if blending is enabled. 
+ if (has_blending && !is_finalized_) { + return false; + } + // No early Flush() - nothing to do - if the frame is a kSkipProgressive + // frame. + if (frame_header_.frame_type == FrameType::kSkipProgressive && + !is_finalized_) { + return true; + } + if (decoded_->IsJPEG()) { + // Nothing to do. + return true; + } + uint32_t completely_decoded_ac_pass = *std::min_element( + decoded_passes_per_ac_group_.begin(), decoded_passes_per_ac_group_.end()); + if (completely_decoded_ac_pass < frame_header_.passes.num_passes) { + // We don't have all AC yet: force a draw of all the missing areas. + // Mark all sections as not complete. + for (size_t i = 0; i < decoded_passes_per_ac_group_.size(); i++) { + if (decoded_passes_per_ac_group_[i] == frame_header_.passes.num_passes) + continue; + dec_state_->group_border_assigner.ClearDone(i); + } + std::atomic has_error{false}; + RunOnPool( + pool_, 0, decoded_passes_per_ac_group_.size(), + [this](size_t num_threads) { + PrepareStorage(num_threads, decoded_passes_per_ac_group_.size()); + return true; + }, + [this, &has_error](size_t g, size_t thread) { + if (decoded_passes_per_ac_group_[g] == + frame_header_.passes.num_passes) { + // This group was drawn already, nothing to do. + return; + } + BitReader* JXL_RESTRICT readers[kMaxNumPasses] = {}; + bool ok = ProcessACGroup( + g, readers, /*num_passes=*/0, GetStorageLocation(thread, g), + /*force_draw=*/true, /*dc_only=*/!decoded_ac_global_); + if (!ok) has_error = true; + }, + "ForceDrawGroup"); + if (has_error) { + return JXL_FAILURE("Drawing groups failed"); + } + } + // TODO(veluca): the rest of this function should be removed once we have full + // support for per-group decoding. 
+ + // undo global modular transforms and copy int pixel buffers to float ones + JXL_RETURN_IF_ERROR( + modular_frame_decoder_.FinalizeDecoding(dec_state_, pool_, decoded_)); + + JXL_RETURN_IF_ERROR(FinalizeFrameDecoding(decoded_, dec_state_, pool_, + /*force_fir=*/false, + /*skip_blending=*/false)); + + num_renders_++; + return true; +} + +int FrameDecoder::SavedAs(const FrameHeader& header) { + if (header.frame_type == FrameType::kDCFrame) { + // bits 16, 32, 64, 128 for DC level + return 16 << (header.dc_level - 1); + } else if (header.CanBeReferenced()) { + // bits 1, 2, 4 and 8 for the references + return 1 << header.save_as_reference; + } + + return 0; +} + +int FrameDecoder::References() const { + if (is_finalized_) { + return 0; + } + if ((!decoded_dc_global_ || !decoded_ac_global_ || + *std::min_element(decoded_dc_groups_.begin(), + decoded_dc_groups_.end()) != 1 || + *std::min_element(decoded_passes_per_ac_group_.begin(), + decoded_passes_per_ac_group_.end()) < max_passes_)) { + return 0; + } + + int result = 0; + + // Blending + if (frame_header_.frame_type == FrameType::kRegularFrame || + frame_header_.frame_type == FrameType::kSkipProgressive) { + bool cropped = frame_header_.custom_size_or_origin; + if (cropped || frame_header_.blending_info.mode != BlendMode::kReplace) { + result |= (1 << frame_header_.blending_info.source); + } + const auto& extra = frame_header_.extra_channel_blending_info; + for (size_t i = 0; i < extra.size(); ++i) { + if (cropped || extra[i].mode != BlendMode::kReplace) { + result |= (1 << extra[i].source); + } + } + } + + // Patches + if (frame_header_.flags & FrameHeader::kPatches) { + result |= dec_state_->shared->image_features.patches.GetReferences(); + } + + // DC Level + if (frame_header_.flags & FrameHeader::kUseDcFrame) { + // Reads from the next dc level + int dc_level = frame_header_.dc_level + 1; + // bits 16, 32, 64, 128 for DC level + result |= (16 << (dc_level - 1)); + } + + return result; +} + +Status 
FrameDecoder::FinalizeFrame() { + if (is_finalized_) { + return JXL_FAILURE("FinalizeFrame called multiple times"); + } + is_finalized_ = true; + if (decoded_->IsJPEG()) { + // Nothing to do. + return true; + } + if (!finalized_dc_) { + // We don't have all of DC: EPF might not behave correctly (and is not + // particularly useful anyway on upsampling results), so we disable it. + dec_state_->shared_storage.frame_header.loop_filter.epf_iters = 0; + } + if ((!decoded_dc_global_ || !decoded_ac_global_ || + *std::min_element(decoded_dc_groups_.begin(), + decoded_dc_groups_.end()) != 1 || + *std::min_element(decoded_passes_per_ac_group_.begin(), + decoded_passes_per_ac_group_.end()) < max_passes_) && + !allow_partial_frames_) { + return JXL_FAILURE( + "FinalizeFrame called before the frame was fully decoded"); + } + + if (!finalized_dc_) { + JXL_ASSERT(allow_partial_frames_); + AllocateOutput(); + dec_state_->InitForAC(nullptr); + } + + JXL_RETURN_IF_ERROR(Flush()); + + if (dec_state_->shared->frame_header.CanBeReferenced()) { + size_t id = dec_state_->shared->frame_header.save_as_reference; + auto& reference_frame = dec_state_->shared_storage.reference_frames[id]; + if (dec_state_->pre_color_transform_frame.xsize() == 0) { + reference_frame.storage = decoded_->Copy(); + } else { + reference_frame.storage = ImageBundle(decoded_->metadata()); + reference_frame.storage.SetFromImage( + std::move(dec_state_->pre_color_transform_frame), + decoded_->c_current()); + if (decoded_->HasExtraChannels()) { + const std::vector* ecs = &dec_state_->pre_color_transform_ec; + if (ecs->empty()) ecs = &decoded_->extra_channels(); + std::vector extra_channels; + for (const auto& ec : *ecs) { + extra_channels.push_back(CopyImage(ec)); + } + reference_frame.storage.SetExtraChannels(std::move(extra_channels)); + } + } + reference_frame.frame = &reference_frame.storage; + reference_frame.ib_is_in_xyb = + dec_state_->shared->frame_header.save_before_color_transform; + if 
(!dec_state_->shared->frame_header.save_before_color_transform) { + const CodecMetadata* metadata = + dec_state_->shared->frame_header.nonserialized_metadata; + if (reference_frame.frame->xsize() < metadata->xsize() || + reference_frame.frame->ysize() < metadata->ysize()) { + return JXL_FAILURE( + "trying to save a reference frame that is too small: %zux%zu " + "instead of %zux%zu", + reference_frame.frame->xsize(), reference_frame.frame->ysize(), + metadata->xsize(), metadata->ysize()); + } + reference_frame.storage.ShrinkTo(metadata->xsize(), metadata->ysize()); + } + } + if (frame_header_.nonserialized_is_preview) { + // Fix possible larger image size (multiple of kBlockDim) + // TODO(lode): verify if and when that happens. + decoded_->ShrinkTo(frame_dim_.xsize, frame_dim_.ysize); + } else if (!decoded_->IsJPEG()) { + // A kRegularFrame is blended with the other frames, and thus results in a + // coalesced frame of size equal to image dimensions. Other frames are not + // blended, thus their final size is the size that was defined in the + // frame_header. + if (frame_header_.frame_type == kRegularFrame || + frame_header_.frame_type == kSkipProgressive) { + decoded_->ShrinkTo( + dec_state_->shared->frame_header.nonserialized_metadata->xsize(), + dec_state_->shared->frame_header.nonserialized_metadata->ysize()); + } else { + // xsize_upsampled is the actual frame size, after any upsampling has been + // applied. + decoded_->ShrinkTo(frame_dim_.xsize_upsampled, + frame_dim_.ysize_upsampled); + } + } + + if (render_spotcolors_) { + for (size_t i = 0; i < decoded_->extra_channels().size(); i++) { + // Don't use Find() because there may be multiple spot color channels. 
+ const ExtraChannelInfo& eci = decoded_->metadata()->extra_channel_info[i]; + if (eci.type == ExtraChannel::kOptional) { + continue; + } + if (eci.type == ExtraChannel::kUnknown || + (int(ExtraChannel::kReserved0) <= int(eci.type) && + int(eci.type) <= int(ExtraChannel::kReserved7))) { + return JXL_FAILURE( + "Unknown extra channel (bits %u, shift %u, name '%s')\n", + eci.bit_depth.bits_per_sample, eci.dim_shift, eci.name.c_str()); + } + if (eci.type == ExtraChannel::kSpotColor) { + float scale = eci.spot_color[3]; + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < decoded_->ysize(); y++) { + float* JXL_RESTRICT p = decoded_->color()->Plane(c).Row(y); + const float* JXL_RESTRICT s = + decoded_->extra_channels()[i].ConstRow(y); + for (size_t x = 0; x < decoded_->xsize(); x++) { + float mix = scale * s[x]; + p[x] = mix * eci.spot_color[c] + (1.0 - mix) * p[x]; + } + } + } + } + } + } + if (dec_state_->shared->frame_header.dc_level != 0) { + dec_state_->shared_storage + .dc_frames[dec_state_->shared->frame_header.dc_level - 1] = + std::move(*decoded_->color()); + decoded_->RemoveColor(); + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.h new file mode 100644 index 0000000000..0c86feb8ab --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_frame.h @@ -0,0 +1,281 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_DEC_FRAME_H_ +#define LIB_JXL_DEC_FRAME_H_ + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/blending.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// TODO(veluca): remove DecodeFrameHeader once the API migrates to FrameDecoder. + +// `frame_header` must have nonserialized_metadata and +// nonserialized_is_preview set. +Status DecodeFrameHeader(BitReader* JXL_RESTRICT reader, + FrameHeader* JXL_RESTRICT frame_header); + +// Decodes a frame. Groups may be processed in parallel by `pool`. +// See DecodeFile for explanation of c_decoded. +// `io` is only used for reading maximum image size. Also updates +// `dec_state` with the new frame header. +// `metadata` is the metadata that applies to all frames of the codestream +// `decoded->metadata` must already be set and must match metadata.m. +Status DecodeFrame(const DecompressParams& dparams, + PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool, + BitReader* JXL_RESTRICT reader, ImageBundle* decoded, + const CodecMetadata& metadata, + const SizeConstraints* constraints, bool is_preview = false); + +// Leaves reader in the same state as DecodeFrame would. Used to skip preview. +// NOTE(review): "Also updates `dec_state`" looks stale - no such parameter. +Status SkipFrame(const CodecMetadata& metadata, BitReader* JXL_RESTRICT reader, + bool is_preview = false); + +// TODO(veluca): implement "forced drawing". +class FrameDecoder { + public: + // All parameters must outlive the FrameDecoder.
+ FrameDecoder(PassesDecoderState* dec_state, const CodecMetadata& metadata, + ThreadPool* pool) + : dec_state_(dec_state), pool_(pool), frame_header_(&metadata) {} + + // `constraints` must outlive the FrameDecoder if not null, or stay alive + // until the next call to SetFrameSizeLimits. + void SetFrameSizeLimits(const SizeConstraints* constraints) { + constraints_ = constraints; + } + void SetRenderSpotcolors(bool rsc) { render_spotcolors_ = rsc; } + + // Read FrameHeader and table of contents from the given BitReader. + // Also checks frame dimensions for their limits, and sets the output + // image buffer. + // TODO(veluca): remove the `allow_partial_frames` flag - this should be moved + // to callers. + Status InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded, + bool is_preview, bool allow_partial_frames, + bool allow_partial_dc_global); + + struct SectionInfo { + BitReader* JXL_RESTRICT br; + size_t id; + }; + + enum SectionStatus { + // Processed correctly. + kDone = 0, + // Skipped because other required sections were not yet processed. + kSkipped = 1, + // Skipped because the section was already processed. + kDuplicate = 2, + // Only partially decoded: the section will need to be processed again. + kPartial = 3, + }; + + // Processes `num` sections; each SectionInfo contains the index + // of the section and a BitReader that only contains the data of the section. + // `section_status` should point to `num` elements, and will be filled with + // information about whether each section was processed or not. + // A section is a part of the encoded file that is indexed by the TOC. + Status ProcessSections(const SectionInfo* sections, size_t num, + SectionStatus* section_status); + + // Flushes all the data decoded so far to pixels. + Status Flush(); + + // Runs final operations once a frame's data is decoded. + // Must be called exactly once per frame, after all calls to ProcessSections.
+ Status FinalizeFrame(); + + // Returns dependencies of this frame on reference ids as a bit mask: bits 0-3 + // indicate reference frame 0-3 for patches and blending, bits 4-7 indicate DC + // frames this frame depends on. Only returns a valid result after all calls + // to ProcessSections are finished and before FinalizeFrame. + int References() const; + + // Returns reference id of storage location where this frame is stored as a + // bit flag, or 0 if not stored. + // Matches the bit mask used for References: bits 0-3 indicate it is stored + // for patching or blending, bits 4-7 indicate DC frame. + // Unlike References, can be run at any time as + // soon as the frame header is known. + static int SavedAs(const FrameHeader& header); + + // Returns offset of this section after the end of the TOC. The end of the TOC + // is the byte position of the bit reader after InitFrame was called. + const std::vector& SectionOffsets() const { + return section_offsets_; + } + const std::vector& SectionSizes() const { return section_sizes_; } + size_t NumSections() const { return section_sizes_.size(); } + + // TODO(veluca): remove once we remove --downsampling flag. + void SetMaxPasses(size_t max_passes) { max_passes_ = max_passes; } + const FrameHeader& GetFrameHeader() const { return frame_header_; } + + // Returns whether a DC image has been decoded, accessible at low resolution + // at passes.shared_storage.dc_storage + bool HasDecodedDC() const { + return frame_header_.encoding == FrameEncoding::kVarDCT && finalized_dc_; + } + + // Sets the buffer to which uint8 sRGB pixels will be decoded. This is not + // supported for all images. If it succeeds, HasRGBBuffer() will return true. + // If it does not succeed, the image is decoded to the ImageBundle passed to + // InitFrame instead. + // If an output callback is set, this function *may not* be called.
+ // + // @param undo_orientation: if true, indicates the frame decoder should apply + // the exif orientation to bring the image to the intended display + // orientation. Performing this operation is not yet supported, so this + // results in not setting the buffer if the image has a non-identity EXIF + // orientation. When outputting to the ImageBundle, no orientation is undone. + void MaybeSetRGB8OutputBuffer(uint8_t* rgb_output, size_t stride, + bool is_rgba, bool undo_orientation) const { + if (!CanDoLowMemoryPath(undo_orientation)) return; + dec_state_->rgb_output = rgb_output; + dec_state_->rgb_output_is_rgba = is_rgba; + dec_state_->rgb_stride = stride; + JXL_ASSERT(dec_state_->pixel_callback == nullptr); +#if !JXL_HIGH_PRECISION + if (decoded_->metadata()->xyb_encoded && + dec_state_->output_encoding_info.color_encoding.IsSRGB() && + dec_state_->output_encoding_info.all_default_opsin && + HasFastXYBTosRGB8() && frame_header_.needs_color_transform()) { + dec_state_->fast_xyb_srgb8_conversion = true; + } +#endif + } + + // Same as MaybeSetRGB8OutputBuffer, but with a float callback. This is not + // supported for all images. If it succeeds, HasRGBBuffer() will return true. + // If it does not succeed, the image is decoded to the ImageBundle passed to + // InitFrame instead. + // If an RGB8 output buffer is set, this function *may not* be called. + // + // @param undo_orientation: if true, indicates the frame decoder should apply + // the exif orientation to bring the image to the intended display + // orientation. Performing this operation is not yet supported, so this + // results in not setting the buffer if the image has a non-identity EXIF + // orientation. When outputting to the ImageBundle, no orientation is undone.
+ void MaybeSetFloatCallback( + const std::function& cb, + bool is_rgba, bool undo_orientation) const { + if (!CanDoLowMemoryPath(undo_orientation)) return; + dec_state_->pixel_callback = cb; + dec_state_->rgb_output_is_rgba = is_rgba; + JXL_ASSERT(dec_state_->rgb_output == nullptr); + } + + // Returns true if the rgb output buffer passed by MaybeSetRGB8OutputBuffer + // has been/will be populated by Flush() / FinalizeFrame(), or if a pixel + // callback has been set. + bool HasRGBBuffer() const { + return dec_state_->rgb_output != nullptr || + dec_state_->pixel_callback != nullptr; + } + + private: + Status ProcessDCGlobal(BitReader* br); + Status ProcessDCGroup(size_t dc_group_id, BitReader* br); + void FinalizeDC(); + void AllocateOutput(); + Status ProcessACGlobal(BitReader* br); + Status ProcessACGroup(size_t ac_group_id, BitReader* JXL_RESTRICT* br, + size_t num_passes, size_t thread, bool force_draw, + bool dc_only); + + // Allocates storage for parallel decoding using up to `num_threads` threads + // of up to `num_tasks` tasks. The value of `thread` passed to + // `GetStorageLocation` must be smaller than the `num_threads` value passed + // here. The value of `task` passed to `GetStorageLocation` must be smaller + // than the value of `num_tasks` passed here.
+ void PrepareStorage(size_t num_threads, size_t num_tasks) { + size_t storage_size = std::min(num_threads, num_tasks); + if (storage_size > group_dec_caches_.size()) { + group_dec_caches_.resize(storage_size); + } + dec_state_->EnsureStorage(storage_size); + use_task_id_ = num_threads > num_tasks; + } + + size_t GetStorageLocation(size_t thread, size_t task) { + if (use_task_id_) return task; + return thread; + } + + // If the image has default exif orientation (or has an orientation but should + // not be undone) and no blending, the current frame cannot be referenced by + // future frames, there are no spot colors to be rendered, and alpha is not + // premultiplied, then low memory options can be used + // (uint8 output buffer or float pixel callback). + // TODO(veluca): reduce this set of restrictions. + bool CanDoLowMemoryPath(bool undo_orientation) const { + if (undo_orientation && + decoded_->metadata()->GetOrientation() != Orientation::kIdentity) { + return false; + } + if (ImageBlender::NeedsBlending(dec_state_)) return false; + if (frame_header_.CanBeReferenced()) return false; + if (render_spotcolors_ && + decoded_->metadata()->Find(ExtraChannel::kSpotColor)) { + return false; + } + if (decoded_->AlphaIsPremultiplied()) return false; + return true; + } + + PassesDecoderState* dec_state_; + ThreadPool* pool_; + std::vector section_offsets_; + std::vector section_sizes_; + size_t max_passes_; + // TODO(veluca): figure out the duplication between these and dec_state_.
+ FrameHeader frame_header_; + FrameDimensions frame_dim_; + ImageBundle* decoded_; + ModularFrameDecoder modular_frame_decoder_; + bool allow_partial_frames_; + bool allow_partial_dc_global_; + bool render_spotcolors_ = true; + + std::vector processed_section_; + std::vector decoded_passes_per_ac_group_; + std::vector decoded_dc_groups_; + bool decoded_dc_global_; + bool decoded_ac_global_; + bool finalized_dc_ = true; + bool is_finalized_ = true; + size_t num_renders_ = 0; + + std::vector group_dec_caches_; + + // Frame size limits. + const SizeConstraints* constraints_ = nullptr; + + // Whether or not the task id should be used for storage indexing, instead of + // the thread id. + bool use_task_id_ = false; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_FRAME_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc new file mode 100644 index 0000000000..ce917765d9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.cc @@ -0,0 +1,774 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +#include "lib/jxl/dec_group.h" + +#include +#include + +#include +#include +#include + +#include "lib/jxl/frame_header.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_group.cc" +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_reconstruct.h" +#include "lib/jxl/dec_transforms-inl.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer-inl.h" +#include "lib/jxl/quantizer.h" + +#ifndef LIB_JXL_DEC_GROUP_CC +#define LIB_JXL_DEC_GROUP_CC +namespace jxl { + +// Interface for reading groups for DecodeGroupImpl. +class GetBlock { + public: + virtual void StartRow(size_t by) = 0; + virtual Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, + size_t size, size_t log2_covered_blocks, + ACPtr block[3], ACType ac_type) = 0; + virtual ~GetBlock() {} +}; + +// Controls whether DecodeGroupImpl renders to pixels or not. +enum DrawMode { + // Render to pixels. + kDraw = 0, + // Don't render to pixels. + kDontDraw = 1, + // Don't do IDCT or dequantization, but just postprocessing. Used for + // progressive DC. + kOnlyImageFeatures = 2, +}; + +} // namespace jxl +#endif // LIB_JXL_DEC_GROUP_CC + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. 
+using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ShiftRight; + +using D = HWY_FULL(float); +using DU = HWY_FULL(uint32_t); +using DI = HWY_FULL(int32_t); +using DI16 = Rebind; +constexpr D d; +constexpr DI di; +constexpr DI16 di16; + +// TODO(veluca): consider SIMDfying. +void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) { + for (size_t x = 0; x < 8; x++) { + for (size_t y = x + 1; y < 8; y++) { + std::swap(block[y * 8 + x], block[x * 8 + y]); + } + } +} + +template +void DequantLane(Vec scaled_dequant_x, Vec scaled_dequant_y, + Vec scaled_dequant_b, + const float* JXL_RESTRICT dequant_matrices, size_t dq_ofs, + size_t size, size_t k, Vec x_cc_mul, Vec b_cc_mul, + const float* JXL_RESTRICT biases, ACPtr qblock[3], + float* JXL_RESTRICT block) { + const auto x_mul = Load(d, dequant_matrices + dq_ofs + k) * scaled_dequant_x; + const auto y_mul = + Load(d, dequant_matrices + dq_ofs + size + k) * scaled_dequant_y; + const auto b_mul = + Load(d, dequant_matrices + dq_ofs + 2 * size + k) * scaled_dequant_b; + + Vec quantized_x_int; + Vec quantized_y_int; + Vec quantized_b_int; + if (ac_type == ACType::k16) { + Rebind di16; + quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k)); + quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k)); + quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k)); + } else { + quantized_x_int = Load(di, qblock[0].ptr32 + k); + quantized_y_int = Load(di, qblock[1].ptr32 + k); + quantized_b_int = Load(di, qblock[2].ptr32 + k); + } + + const auto dequant_x_cc = + AdjustQuantBias(di, 0, quantized_x_int, biases) * x_mul; + const auto dequant_y = + AdjustQuantBias(di, 1, quantized_y_int, biases) * y_mul; + const auto dequant_b_cc = + AdjustQuantBias(di, 2, quantized_b_int, biases) * b_mul; + + const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc); + const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc); + Store(dequant_x, d, block + k); + Store(dequant_y, d, block + size + k); + 
Store(dequant_b, d, block + 2 * size + k); +} + +template +void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant, + float x_dm_multiplier, float b_dm_multiplier, Vec x_cc_mul, + Vec b_cc_mul, size_t kind, size_t size, + const Quantizer& quantizer, + const float* JXL_RESTRICT dequant_matrices, + size_t covered_blocks, const size_t* sbx, + const float* JXL_RESTRICT* JXL_RESTRICT dc_row, + size_t dc_stride, const float* JXL_RESTRICT biases, + ACPtr qblock[3], float* JXL_RESTRICT block) { + PROFILER_FUNC; + + const auto scaled_dequant_s = inv_global_scale / quant; + + const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier); + const auto scaled_dequant_y = Set(d, scaled_dequant_s); + const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier); + + const size_t dq_ofs = quantizer.DequantMatrixOffset(kind, 0); + + for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) { + DequantLane(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b, + dequant_matrices, dq_ofs, size, k, x_cc_mul, b_cc_mul, + biases, qblock, block); + } + for (size_t c = 0; c < 3; c++) { + LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride, + block + c * size); + } +} + +Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block, + GroupDecCache* JXL_RESTRICT group_dec_cache, + PassesDecoderState* JXL_RESTRICT dec_state, + size_t thread, size_t group_idx, ImageBundle* decoded, + DrawMode draw) { + // TODO(veluca): investigate cache usage in this function. 
+ PROFILER_FUNC; + constexpr size_t kGroupDataXBorder = PassesDecoderState::kGroupDataXBorder; + constexpr size_t kGroupDataYBorder = PassesDecoderState::kGroupDataYBorder; + + const Rect block_rect = dec_state->shared->BlockGroupRect(group_idx); + const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy; + + const size_t xsize_blocks = block_rect.xsize(); + const size_t ysize_blocks = block_rect.ysize(); + + const size_t dc_stride = dec_state->shared->dc->PixelsPerRow(); + + const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale(); + const float* JXL_RESTRICT dequant_matrices = + dec_state->shared->quantizer.DequantMatrix(0, 0); + + const YCbCrChromaSubsampling& cs = + dec_state->shared->frame_header.chroma_subsampling; + + const size_t idct_stride = dec_state->EagerFinalizeImageRect() + ? dec_state->group_data[thread].PixelsPerRow() + : dec_state->decoded.PixelsPerRow(); + + HWY_ALIGN int32_t scaled_qtable[64 * 3]; + + ACType ac_type = dec_state->coefficients->Type(); + auto dequant_block = ac_type == ACType::k16 ? DequantBlock + : DequantBlock; + // Whether or not coefficients should be stored for future usage, and/or read + // from past usage. + bool accumulate = !dec_state->coefficients->IsEmpty(); + // Offset of the current block in the group. + size_t offset = 0; + + std::array jpeg_c_map; + bool jpeg_is_gray = false; + std::array dcoff = {}; + + // TODO(veluca): all of this should be done only once per image. 
+ if (decoded->IsJPEG()) { + if (!dec_state->shared->cmap.IsJPEGCompatible()) { + return JXL_FAILURE("The CfL map is not JPEG-compatible"); + } + jpeg_is_gray = (decoded->jpeg_data->components.size() == 1); + jpeg_c_map = JpegOrder(dec_state->shared->frame_header.color_transform, + jpeg_is_gray); + const std::vector& qe = + dec_state->shared->matrices.encodings(); + if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW || + std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) { + return JXL_FAILURE( + "Quantization table is not a JPEG quantization table."); + } + for (size_t c = 0; c < 3; c++) { + if (dec_state->shared->frame_header.color_transform == + ColorTransform::kNone) { + dcoff[c] = 1024 / (*qe[0].qraw.qtable)[64 * c]; + } + for (size_t i = 0; i < 64; i++) { + // Transpose the matrix, as it will be used on the transposed block. + int n = qe[0].qraw.qtable->at(64 + i); + int d = qe[0].qraw.qtable->at(64 * c + i); + if (n <= 0 || d <= 0 || n >= 65536 || d >= 65536) { + return JXL_FAILURE("Invalid JPEG quantization table"); + } + scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] = + (1 << kCFLFixedPointPrecision) * n / d; + } + } + } + + size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)}; + size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)}; + Rect r[3]; + for (size_t i = 0; i < 3; i++) { + r[i] = + Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i], + block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]); + } + + for (size_t by = 0; by < ysize_blocks; ++by) { + if (draw == kOnlyImageFeatures) break; + get_block->StartRow(by); + size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]}; + + const int32_t* JXL_RESTRICT row_quant = + block_rect.ConstRow(dec_state->shared->raw_quant_field, by); + + const float* JXL_RESTRICT dc_rows[3] = { + r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]), + r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]), + 
r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]), + }; + + const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks; + AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by); + + const int8_t* JXL_RESTRICT row_cmap[3] = { + dec_state->shared->cmap.ytox_map.ConstRow(ty), + nullptr, + dec_state->shared->cmap.ytob_map.ConstRow(ty), + }; + + float* JXL_RESTRICT idct_row[3]; + int16_t* JXL_RESTRICT jpeg_row[3]; + for (size_t c = 0; c < 3; c++) { + if (dec_state->EagerFinalizeImageRect()) { + idct_row[c] = dec_state->group_data[thread].PlaneRow( + c, sby[c] * kBlockDim + kGroupDataYBorder) + + kGroupDataXBorder; + } else { + idct_row[c] = + dec_state->decoded.PlaneRow(c, (r[c].y0() + sby[c]) * kBlockDim) + + r[c].x0() * kBlockDim; + } + if (decoded->IsJPEG()) { + auto& component = decoded->jpeg_data->components[jpeg_c_map[c]]; + jpeg_row[c] = + component.coeffs.data() + + (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) * + kDCTBlockSize; + } + } + + size_t bx = 0; + for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks); + tx++) { + size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks; + auto x_cc_mul = + Set(d, dec_state->shared->cmap.YtoXRatio(row_cmap[0][abs_tx])); + auto b_cc_mul = + Set(d, dec_state->shared->cmap.YtoBRatio(row_cmap[2][abs_tx])); + // Increment bx by llf_x because those iterations would otherwise + // immediately continue (!IsFirstBlock). Reduces mispredictions. + for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) { + size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]}; + AcStrategy acs = acs_row[bx]; + const size_t llf_x = acs.covered_blocks_x(); + + // Can only happen in the second or lower rows of a varblock. 
+ if (JXL_UNLIKELY(!acs.IsFirstBlock())) { + bx += llf_x; + continue; + } + PROFILER_ZONE("DecodeGroupImpl inner"); + const size_t log2_covered_blocks = acs.log2_covered_blocks(); + + const size_t covered_blocks = 1 << log2_covered_blocks; + const size_t size = covered_blocks * kDCTBlockSize; + + ACPtr qblock[3]; + if (accumulate) { + for (size_t c = 0; c < 3; c++) { + qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset); + } + } else { + // No point in reading from bitstream without accumulating and not + // drawing. + JXL_ASSERT(draw == kDraw); + if (ac_type == ACType::k16) { + memset(group_dec_cache->dec_group_qblock16, 0, + size * 3 * sizeof(int16_t)); + for (size_t c = 0; c < 3; c++) { + qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size; + } + } else { + memset(group_dec_cache->dec_group_qblock, 0, + size * 3 * sizeof(int32_t)); + for (size_t c = 0; c < 3; c++) { + qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size; + } + } + } + JXL_RETURN_IF_ERROR(get_block->LoadBlock( + bx, by, acs, size, log2_covered_blocks, qblock, ac_type)); + offset += size; + if (draw == kDontDraw) { + bx += llf_x; + continue; + } + + if (JXL_UNLIKELY(decoded->IsJPEG())) { + if (acs.Strategy() != AcStrategy::Type::DCT) { + return JXL_FAILURE( + "Can only decode to JPEG if only DCT-8 is used."); + } + + HWY_ALIGN int32_t transposed_dct_y[64]; + for (size_t c : {1, 0, 2}) { + // Propagate only Y for grayscale. + if (jpeg_is_gray && c != 1) { + continue; + } + if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) { + continue; + } + int16_t* JXL_RESTRICT jpeg_pos = + jpeg_row[c] + sbx[c] * kDCTBlockSize; + // JPEG XL is transposed, JPEG is not. + auto transposed_dct = qblock[c].ptr32; + Transpose8x8InPlace(transposed_dct); + // No CfL - no need to store the y block converted to integers. 
+ if (!cs.Is444() || + (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) { + for (size_t i = 0; i < 64; i += Lanes(d)) { + const auto ini = Load(di, transposed_dct + i); + const auto ini16 = DemoteTo(di16, ini); + StoreU(ini16, di16, jpeg_pos + i); + } + } else if (c == 1) { + // Y channel: save for restoring X/B, but nothing else to do. + for (size_t i = 0; i < 64; i += Lanes(d)) { + const auto ini = Load(di, transposed_dct + i); + Store(ini, di, transposed_dct_y + i); + const auto ini16 = DemoteTo(di16, ini); + StoreU(ini16, di16, jpeg_pos + i); + } + } else { + // transposed_dct_y contains the y channel block, transposed. + const auto scale = Set( + di, dec_state->shared->cmap.RatioJPEG(row_cmap[c][abs_tx])); + const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1)); + for (int i = 0; i < 64; i += Lanes(d)) { + auto in = Load(di, transposed_dct + i); + auto in_y = Load(di, transposed_dct_y + i); + auto qt = Load(di, scaled_qtable + c * size + i); + auto coeff_scale = + ShiftRight(qt * scale + round); + auto cfl_factor = ShiftRight( + in_y * coeff_scale + round); + StoreU(DemoteTo(di16, in + cfl_factor), di16, jpeg_pos + i); + } + } + jpeg_pos[0] = + Clamp1(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047); + } + } else { + HWY_ALIGN float* const block = group_dec_cache->dec_group_block; + // Dequantize and add predictions. 
+ dequant_block( + acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier, + dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.RawStrategy(), + size, dec_state->shared->quantizer, dequant_matrices, + acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows, + dc_stride, + dec_state->output_encoding_info.opsin_params.quant_biases, qblock, + block); + + for (size_t c : {1, 0, 2}) { + if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) { + continue; + } + // IDCT + float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim; + TransformToPixels(acs.Strategy(), block + c * size, idct_pos, + idct_stride, group_dec_cache->scratch_space); + } + } + bx += llf_x; + } + } + } + if (draw == kDontDraw) { + return true; + } + // No ApplyImageFeatures in JPEG mode or when we need to delay it. + if (!decoded->IsJPEG() && dec_state->EagerFinalizeImageRect()) { + JXL_RETURN_IF_ERROR(dec_state->FinalizeGroup( + group_idx, thread, &dec_state->group_data[thread], decoded)); + } + return true; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace { +// Decode quantized AC coefficients of DCT blocks. +// LLF components in the output block will not be modified. +template +Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks, + int32_t* JXL_RESTRICT row_nzeros, + const int32_t* JXL_RESTRICT row_nzeros_top, + size_t nzeros_stride, size_t c, size_t bx, size_t by, + size_t lbx, AcStrategy acs, + const coeff_order_t* JXL_RESTRICT coeff_order, + BitReader* JXL_RESTRICT br, + ANSSymbolReader* JXL_RESTRICT decoder, + const std::vector& context_map, + const uint8_t* qdc_row, const int32_t* qf_row, + const BlockCtxMap& block_ctx_map, ACPtr block, + size_t shift = 0) { + PROFILER_FUNC; + // Equal to number of LLF coefficients. 
+ const size_t covered_blocks = 1 << log2_covered_blocks; + const size_t size = covered_blocks * kDCTBlockSize; + int32_t predicted_nzeros = + PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32); + + size_t ord = kStrategyOrder[acs.RawStrategy()]; + const coeff_order_t* JXL_RESTRICT order = + &coeff_order[CoeffOrderOffset(ord, c)]; + + size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c); + const int32_t nzero_ctx = + block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset; + + size_t nzeros = decoder->ReadHybridUint(nzero_ctx, br, context_map); + if (nzeros + covered_blocks > size) { + return JXL_FAILURE("Invalid AC: nzeros too large"); + } + for (size_t y = 0; y < acs.covered_blocks_y(); y++) { + for (size_t x = 0; x < acs.covered_blocks_x(); x++) { + row_nzeros[bx + x + y * nzeros_stride] = + (nzeros + covered_blocks - 1) >> log2_covered_blocks; + } + } + + const size_t histo_offset = + ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx); + + // Skip LLF + { + PROFILER_ZONE("AcDecSkipLLF, reader"); + size_t prev = (nzeros > size / 16 ? 0 : 1); + for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) { + const size_t ctx = + histo_offset + ZeroDensityContext(nzeros, k, covered_blocks, + log2_covered_blocks, prev); + const size_t u_coeff = decoder->ReadHybridUint(ctx, br, context_map); + // Hand-rolled version of UnpackSigned, shifting before the conversion to + // signed integer to avoid undefined behavior of shifting negative + // numbers. + const size_t magnitude = u_coeff >> 1; + const size_t neg_sign = (~u_coeff) & 1; + const intptr_t coeff = + static_cast((magnitude ^ (neg_sign - 1)) << shift); + if (ac_type == ACType::k16) { + block.ptr16[order[k]] += coeff; + } else { + block.ptr32[order[k]] += coeff; + } + prev = static_cast(u_coeff != 0); + nzeros -= prev; + } + if (JXL_UNLIKELY(nzeros != 0)) { + return JXL_FAILURE( + "Invalid AC: nzeros not 0. 
Block (%zu, %zu), channel %zu", bx, by, c); + } + } + return true; +} + +// Structs used by DecodeGroupImpl to get a quantized block. +// GetBlockFromBitstream uses ANS decoding (and thus keeps track of row +// pointers in row_nzeros), GetBlockFromEncoder simply reads the coefficient +// image provided by the encoder. + +struct GetBlockFromBitstream : public GetBlock { + void StartRow(size_t by) override { + qf_row = rect.ConstRow(*qf, by); + for (size_t c = 0; c < 3; c++) { + size_t sby = by >> vshift[c]; + quant_dc_row = quant_dc->ConstRow(rect.y0() + by) + rect.x0(); + for (size_t i = 0; i < num_passes; i++) { + row_nzeros[i][c] = group_dec_cache->num_nzeroes[i].PlaneRow(c, sby); + row_nzeros_top[i][c] = + sby == 0 + ? nullptr + : group_dec_cache->num_nzeroes[i].ConstPlaneRow(c, sby - 1); + } + } + } + + Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size, + size_t log2_covered_blocks, ACPtr block[3], + ACType ac_type) override { + auto decode_ac_varblock = ac_type == ACType::k16 + ? 
DecodeACVarBlock + : DecodeACVarBlock; + for (size_t c : {1, 0, 2}) { + size_t sbx = bx >> hshift[c]; + size_t sby = by >> vshift[c]; + if (JXL_UNLIKELY((sbx << hshift[c] != bx) || (sby << vshift[c] != by))) { + continue; + } + + for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) { + JXL_RETURN_IF_ERROR(decode_ac_varblock( + ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c], + row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs, + &coeff_orders[pass * coeff_order_size], readers[pass], + &decoders[pass], context_map[pass], quant_dc_row, qf_row, + *block_ctx_map, block[c], shift_for_pass[pass])); + } + } + return true; + } + + Status Init(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, size_t num_passes, + size_t group_idx, size_t histo_selector_bits, const Rect& rect, + GroupDecCache* JXL_RESTRICT group_dec_cache, + PassesDecoderState* dec_state, size_t first_pass) { + for (size_t i = 0; i < 3; i++) { + hshift[i] = dec_state->shared->frame_header.chroma_subsampling.HShift(i); + vshift[i] = dec_state->shared->frame_header.chroma_subsampling.VShift(i); + } + this->coeff_order_size = dec_state->shared->coeff_order_size; + this->coeff_orders = + dec_state->shared->coeff_orders.data() + first_pass * coeff_order_size; + this->context_map = dec_state->context_map.data() + first_pass; + this->readers = readers; + this->num_passes = num_passes; + this->shift_for_pass = + dec_state->shared->frame_header.passes.shift + first_pass; + this->group_dec_cache = group_dec_cache; + this->rect = rect; + block_ctx_map = &dec_state->shared->block_ctx_map; + qf = &dec_state->shared->raw_quant_field; + quant_dc = &dec_state->shared->quant_dc; + + for (size_t pass = 0; pass < num_passes; pass++) { + // Select which histogram set to use among those of the current pass. 
+ size_t cur_histogram = 0; + if (histo_selector_bits != 0) { + cur_histogram = readers[pass]->ReadBits(histo_selector_bits); + } + if (cur_histogram >= dec_state->shared->num_histograms) { + return JXL_FAILURE("Invalid histogram selector"); + } + ctx_offset[pass] = cur_histogram * block_ctx_map->NumACContexts(); + + decoders[pass] = + ANSSymbolReader(&dec_state->code[pass + first_pass], readers[pass]); + } + nzeros_stride = group_dec_cache->num_nzeroes[0].PixelsPerRow(); + for (size_t i = 0; i < num_passes; i++) { + JXL_ASSERT( + nzeros_stride == + static_cast(group_dec_cache->num_nzeroes[i].PixelsPerRow())); + } + return true; + } + + const uint32_t* shift_for_pass = nullptr; // not owned + const coeff_order_t* JXL_RESTRICT coeff_orders; + size_t coeff_order_size; + const std::vector* JXL_RESTRICT context_map; + ANSSymbolReader decoders[kMaxNumPasses]; + BitReader* JXL_RESTRICT* JXL_RESTRICT readers; + size_t num_passes; + size_t ctx_offset[kMaxNumPasses]; + size_t nzeros_stride; + int32_t* JXL_RESTRICT row_nzeros[kMaxNumPasses][3]; + const int32_t* JXL_RESTRICT row_nzeros_top[kMaxNumPasses][3]; + GroupDecCache* JXL_RESTRICT group_dec_cache; + const BlockCtxMap* block_ctx_map; + const ImageI* qf; + const ImageB* quant_dc; + const int32_t* qf_row; + const uint8_t* quant_dc_row; + Rect rect; + size_t hshift[3], vshift[3]; +}; + +struct GetBlockFromEncoder : public GetBlock { + void StartRow(size_t by) override {} + + Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size, + size_t log2_covered_blocks, ACPtr block[3], + ACType ac_type) override { + JXL_DASSERT(ac_type == ACType::k32); + for (size_t c = 0; c < 3; c++) { + // for each pass + for (size_t i = 0; i < quantized_ac->size(); i++) { + for (size_t k = 0; k < size; k++) { + // TODO(veluca): SIMD. 
+ block[c].ptr32[k] += + rows[i][c][offset + k] * (1 << shift_for_pass[i]); + } + } + } + offset += size; + return true; + } + + GetBlockFromEncoder(const std::vector>& ac, + size_t group_idx, const uint32_t* shift_for_pass) + : quantized_ac(&ac), shift_for_pass(shift_for_pass) { + // TODO(veluca): not supported with chroma subsampling. + for (size_t i = 0; i < quantized_ac->size(); i++) { + JXL_CHECK((*quantized_ac)[i]->Type() == ACType::k32); + for (size_t c = 0; c < 3; c++) { + rows[i][c] = (*quantized_ac)[i]->PlaneRow(c, group_idx, 0).ptr32; + } + } + } + + const std::vector>* JXL_RESTRICT quantized_ac; + size_t offset = 0; + const int32_t* JXL_RESTRICT rows[kMaxNumPasses][3]; + const uint32_t* shift_for_pass = nullptr; // not owned +}; + +HWY_EXPORT(DecodeGroupImpl); + +} // namespace + +Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, + size_t num_passes, size_t group_idx, + PassesDecoderState* JXL_RESTRICT dec_state, + GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread, + ImageBundle* JXL_RESTRICT decoded, size_t first_pass, + bool force_draw, bool dc_only) { + PROFILER_FUNC; + + DrawMode draw = (num_passes + first_pass == + dec_state->shared->frame_header.passes.num_passes) || + force_draw + ? kDraw + : kDontDraw; + + if (draw == kDraw && num_passes == 0 && first_pass == 0) { + // We reuse filter_input_storage here as it is not currently in use. 
+ const Rect src_rect = dec_state->shared->BlockGroupRect(group_idx); + const Rect copy_rect(kBlockDim, 2, src_rect.xsize(), src_rect.ysize()); + CopyImageToWithPadding(src_rect, *dec_state->shared->dc, 2, copy_rect, + &dec_state->filter_input_storage[thread]); + EnsurePaddingInPlace(&dec_state->filter_input_storage[thread], copy_rect, + src_rect, dec_state->shared->frame_dim.xsize_blocks, + dec_state->shared->frame_dim.ysize_blocks, 2, 2); + Image3F* upsampling_dst = &dec_state->decoded; + Rect dst_rect(src_rect.x0() * 8, src_rect.y0() * 8, src_rect.xsize() * 8, + src_rect.ysize() * 8); + if (dec_state->EagerFinalizeImageRect()) { + upsampling_dst = &dec_state->group_data[thread]; + dst_rect = Rect(PassesDecoderState::kGroupDataXBorder, + PassesDecoderState::kGroupDataYBorder, dst_rect.xsize(), + dst_rect.ysize()); + } + dec_state->upsamplers[2].UpsampleRect( + dec_state->filter_input_storage[thread], copy_rect, upsampling_dst, + dst_rect, + static_cast(src_rect.y0()) - + static_cast(copy_rect.y0()), + dec_state->shared->frame_dim.ysize_blocks, + dec_state->upsampler_storage[thread].get()); + draw = kOnlyImageFeatures; + } + + size_t histo_selector_bits = 0; + if (dc_only) { + JXL_ASSERT(num_passes == 0); + } else { + JXL_ASSERT(dec_state->shared->num_histograms > 0); + histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms); + } + + GetBlockFromBitstream get_block; + JXL_RETURN_IF_ERROR( + get_block.Init(readers, num_passes, group_idx, histo_selector_bits, + dec_state->shared->BlockGroupRect(group_idx), + group_dec_cache, dec_state, first_pass)); + + JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)( + &get_block, group_dec_cache, dec_state, thread, group_idx, decoded, + draw)); + + for (size_t pass = 0; pass < num_passes; pass++) { + if (!get_block.decoders[pass].CheckANSFinalState()) { + return JXL_FAILURE("ANS checksum failure."); + } + } + return true; +} + +Status DecodeGroupForRoundtrip(const std::vector>& ac, + size_t group_idx, + 
PassesDecoderState* JXL_RESTRICT dec_state, + GroupDecCache* JXL_RESTRICT group_dec_cache, + size_t thread, ImageBundle* JXL_RESTRICT decoded, + AuxOut* aux_out) { + PROFILER_FUNC; + + GetBlockFromEncoder get_block(ac, group_idx, + dec_state->shared->frame_header.passes.shift); + group_dec_cache->InitOnce( + /*num_passes=*/0, + /*used_acs=*/(1u << AcStrategy::kNumValidStrategies) - 1); + + return HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(&get_block, group_dec_cache, + dec_state, thread, group_idx, + decoded, kDraw); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.h new file mode 100644 index 0000000000..a7b868d3a4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group.h @@ -0,0 +1,47 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_DEC_GROUP_H_ +#define LIB_JXL_DEC_GROUP_H_ + +#include +#include + +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, + size_t num_passes, size_t group_idx, + PassesDecoderState* JXL_RESTRICT dec_state, + GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread, + ImageBundle* JXL_RESTRICT decoded, size_t first_pass, + bool force_draw, bool dc_only); + +Status DecodeGroupForRoundtrip(const std::vector>& ac, + size_t group_idx, + PassesDecoderState* JXL_RESTRICT dec_state, + GroupDecCache* JXL_RESTRICT group_dec_cache, + size_t thread, ImageBundle* JXL_RESTRICT decoded, + AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_DEC_GROUP_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc new file mode 100644 index 0000000000..2e08578730 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.cc @@ -0,0 +1,183 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/dec_group_border.h" + +#include + +namespace jxl { + +void GroupBorderAssigner::Init(const FrameDimensions& frame_dim) { + frame_dim_ = frame_dim; + size_t num_corners = + (frame_dim_.xsize_groups + 1) * (frame_dim_.ysize_groups + 1); + counters_.reset(new std::atomic[num_corners]); + // Initialize counters. + for (size_t y = 0; y < frame_dim_.ysize_groups + 1; y++) { + for (size_t x = 0; x < frame_dim_.xsize_groups + 1; x++) { + // Counters at image borders don't have anything on the other side, we + // pre-fill their value to have more uniform handling afterwards. + uint8_t init_value = 0; + if (x == 0) { + init_value |= kTopLeft | kBottomLeft; + } + if (x == frame_dim_.xsize_groups) { + init_value |= kTopRight | kBottomRight; + } + if (y == 0) { + init_value |= kTopLeft | kTopRight; + } + if (y == frame_dim_.ysize_groups) { + init_value |= kBottomLeft | kBottomRight; + } + counters_[y * (frame_dim_.xsize_groups + 1) + x] = init_value; + } + } +} + +void GroupBorderAssigner::ClearDone(size_t group_id) { + size_t x = group_id % frame_dim_.xsize_groups; + size_t y = group_id / frame_dim_.xsize_groups; + size_t top_left_idx = y * (frame_dim_.xsize_groups + 1) + x; + size_t top_right_idx = y * (frame_dim_.xsize_groups + 1) + x + 1; + size_t bottom_right_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x + 1; + size_t bottom_left_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x; + counters_[top_left_idx].fetch_and(~kBottomRight); + counters_[top_right_idx].fetch_and(~kBottomLeft); + counters_[bottom_left_idx].fetch_and(~kTopRight); + counters_[bottom_right_idx].fetch_and(~kTopLeft); +} + +// Looking at each corner between groups, we can guarantee that the four +// involved groups will agree between each other regarding the order in which +// each of the four groups terminated. Thus, the last of the four groups +// gets the responsibility of handling the corner. 
For borders, every border +// is assigned to its top corner (for vertical borders) or to its left corner +// (for horizontal borders): the order as seen on those corners will decide who +// handles that border. + +/* Sets the bit of group `group_id` in its four corner counters, then writes to `rects_to_finalize` the rectangles this call has become responsible for and stores their count in `*num_to_finalize`. The group centre is always included; a border or corner part is included only when the statuses read back from the counters say so. */ +void GroupBorderAssigner::GroupDone(size_t group_id, size_t padding, + Rect* rects_to_finalize, + size_t* num_to_finalize) { + size_t x = group_id % frame_dim_.xsize_groups; + size_t y = group_id / frame_dim_.xsize_groups; + Rect block_rect(x * frame_dim_.group_dim / kBlockDim, + y * frame_dim_.group_dim / kBlockDim, + frame_dim_.group_dim / kBlockDim, + frame_dim_.group_dim / kBlockDim, frame_dim_.xsize_blocks, + frame_dim_.ysize_blocks); + + size_t top_left_idx = y * (frame_dim_.xsize_groups + 1) + x; + size_t top_right_idx = y * (frame_dim_.xsize_groups + 1) + x + 1; + size_t bottom_right_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x + 1; + size_t bottom_left_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x; + + auto fetch_status = [this](size_t idx, uint8_t bit) { + // Note that the acq-rel semantics of this fetch are actually needed to + // ensure that the pixel data of the group is already written to memory. + size_t status = counters_[idx].fetch_or(bit); + JXL_DASSERT((bit & status) == 0); + return bit | status; + }; + + size_t top_left_status = fetch_status(top_left_idx, kBottomRight); + size_t top_right_status = fetch_status(top_right_idx, kBottomLeft); + size_t bottom_right_status = fetch_status(bottom_right_idx, kTopLeft); + size_t bottom_left_status = fetch_status(bottom_left_idx, kTopRight); + + size_t padx = PaddingX(padding); + size_t pady = padding; + + size_t x1 = block_rect.x0() + block_rect.xsize(); + size_t y1 = block_rect.y0() + block_rect.ysize(); + + bool is_last_group_x = frame_dim_.xsize_groups == x + 1; + bool is_last_group_y = frame_dim_.ysize_groups == y + 1; + + // Start of border of neighbouring group, end of border of this group, start + // of border of this group (on the other side), end of border of next group.
+ size_t xpos[4] = { + block_rect.x0() == 0 ? 0 : block_rect.x0() * kBlockDim - padx, + block_rect.x0() == 0 ? 0 : block_rect.x0() * kBlockDim + padx, + is_last_group_x ? frame_dim_.xsize_padded : x1 * kBlockDim - padx, + is_last_group_x ? frame_dim_.xsize_padded : x1 * kBlockDim + padx}; + size_t ypos[4] = { + block_rect.y0() == 0 ? 0 : block_rect.y0() * kBlockDim - pady, + block_rect.y0() == 0 ? 0 : block_rect.y0() * kBlockDim + pady, + is_last_group_y ? frame_dim_.ysize_padded : y1 * kBlockDim - pady, + is_last_group_y ? frame_dim_.ysize_padded : y1 * kBlockDim + pady}; + + *num_to_finalize = 0; + auto append_rect = [&](size_t x0, size_t x1, size_t y0, size_t y1) { + Rect rect(xpos[x0], ypos[y0], xpos[x1] - xpos[x0], ypos[y1] - ypos[y0]); + if (rect.xsize() == 0 || rect.ysize() == 0) return; + JXL_DASSERT(*num_to_finalize < kMaxToFinalize); + rects_to_finalize[(*num_to_finalize)++] = rect; + }; + + // Because of how group borders are assigned, it is impossible that we need to + // process the left and right side of some area but not the center area. Thus, + // we compute the first/last part to process in every horizontal strip and + // merge them together. We first collect a mask of what parts should be + // processed. + // We do this horizontally rather than vertically because horizontal borders + // are larger.
+ bool available_parts_mask[3][3] = {}; // [x][y] + // Center + available_parts_mask[1][1] = true; + // Corners + if (top_left_status == 0xF) available_parts_mask[0][0] = true; + if (top_right_status == 0xF) available_parts_mask[2][0] = true; + if (bottom_right_status == 0xF) available_parts_mask[2][2] = true; + if (bottom_left_status == 0xF) available_parts_mask[0][2] = true; + // Other borders + if (top_left_status & kTopRight) available_parts_mask[1][0] = true; + if (top_left_status & kBottomLeft) available_parts_mask[0][1] = true; + if (top_right_status & kBottomRight) available_parts_mask[2][1] = true; + if (bottom_left_status & kBottomRight) available_parts_mask[1][2] = true; + + // Collect horizontal ranges. + constexpr size_t kNoSegment = 3; + std::pair horizontal_segments[3] = {{kNoSegment, kNoSegment}, + {kNoSegment, kNoSegment}, + {kNoSegment, kNoSegment}}; + for (size_t y = 0; y < 3; y++) { + for (size_t x = 0; x < 3; x++) { + if (!available_parts_mask[x][y]) continue; + JXL_DASSERT(horizontal_segments[y].second == kNoSegment || + horizontal_segments[y].second == x); + JXL_DASSERT((horizontal_segments[y].first == kNoSegment) == + (horizontal_segments[y].second == kNoSegment)); + if (horizontal_segments[y].first == kNoSegment) { + horizontal_segments[y].first = x; + } + horizontal_segments[y].second = x + 1; + } + } + if (horizontal_segments[0] == horizontal_segments[1] && + horizontal_segments[0] == horizontal_segments[2]) { + append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0, + 3); + } else if (horizontal_segments[0] == horizontal_segments[1]) { + append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0, + 2); + append_rect(horizontal_segments[2].first, horizontal_segments[2].second, 2, + 3); + } else if (horizontal_segments[1] == horizontal_segments[2]) { + append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0, + 1); + append_rect(horizontal_segments[1].first,
horizontal_segments[1].second, 1, + 3); + } else { + append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0, + 1); + append_rect(horizontal_segments[1].first, horizontal_segments[1].second, 1, + 2); + append_rect(horizontal_segments[2].first, horizontal_segments[2].second, 2, + 3); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.h new file mode 100644 index 0000000000..67af6afd7d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_group_border.h @@ -0,0 +1,60 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_GROUP_BORDER_H_ +#define LIB_JXL_DEC_GROUP_BORDER_H_ + +#include + +#include + +#include "lib/jxl/base/arch_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +class GroupBorderAssigner { + public: + // Prepare the GroupBorderAssigner to handle a given frame. + void Init(const FrameDimensions& frame_dim); + // Marks a group as done, and returns the (at most kMaxToFinalize) rects to + // run FinalizeImageRect on through `rects_to_finalize` / `num_to_finalize`. + // The block rect of `group_id` is derived from the frame dimensions. + void GroupDone(size_t group_id, size_t padding, Rect* rects_to_finalize, + size_t* num_to_finalize); + // Marks a group as not-done, for running re-paints. + void ClearDone(size_t group_id); + + static constexpr size_t kMaxToFinalize = 3; + + // Vectors on ARM NEON are never wider than 4 floats, so rounding to multiples + // of 4 is enough. +#if defined(__ARM_NEON) || defined(__ARM_NEON__) + static constexpr size_t kPaddingXRound = 4; +#else + static constexpr size_t kPaddingXRound = kBlockDim; +#endif + + // Returns the necessary amount of padding for the X axis.
+ static size_t PaddingX(size_t padding) { + return RoundUpTo(padding, kPaddingXRound); + } + + private: + FrameDimensions frame_dim_; + std::unique_ptr[]> counters_; + + // Constants to identify group positions relative to the corners. + static constexpr uint8_t kTopLeft = 0x01; + static constexpr uint8_t kTopRight = 0x02; + static constexpr uint8_t kBottomRight = 0x04; + static constexpr uint8_t kBottomLeft = 0x08; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_GROUP_BORDER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc new file mode 100644 index 0000000000..05b275773a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.cc @@ -0,0 +1,255 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_huffman.h" + +#include /* for memset */ + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/huffman_table.h" + +namespace jxl { + +static const int kCodeLengthCodes = 18; +static const uint8_t kCodeLengthCodeOrder[kCodeLengthCodes] = { + 1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; +static const uint8_t kDefaultCodeLength = 8; +static const uint8_t kCodeLengthRepeatCode = 16; + +/* Reads `num_symbols` code lengths from `br` into `code_lengths`, decoding them with the code-length code described by `code_length_code_lengths`. Returns 0 when the decoding table cannot be built, when a repeat would run past `num_symbols`, or when `space` is not exactly used up; returns nonzero on success. */ +int ReadHuffmanCodeLengths(const uint8_t* code_length_code_lengths, + int num_symbols, uint8_t* code_lengths, + BitReader* br) { + int symbol = 0; + uint8_t prev_code_len = kDefaultCodeLength; + int repeat = 0; + uint8_t repeat_code_len = 0; + int space = 32768; + HuffmanCode table[32]; + + uint16_t counts[16] = {0}; + for (int i = 0; i < kCodeLengthCodes; ++i) { + ++counts[code_length_code_lengths[i]]; + } + if (!BuildHuffmanTable(table, 5, code_length_code_lengths, kCodeLengthCodes, + &counts[0])) { + return 0; + } + + while (symbol < num_symbols && space > 0) { + const
HuffmanCode* p = table; + uint8_t code_len; + br->Refill(); + p += br->PeekFixedBits<5>(); + br->Consume(p->bits); + code_len = (uint8_t)p->value; + if (code_len < kCodeLengthRepeatCode) { + repeat = 0; + code_lengths[symbol++] = code_len; + if (code_len != 0) { + prev_code_len = code_len; + space -= 32768u >> code_len; + } + } else { + const int extra_bits = code_len - 14; + int old_repeat; + int repeat_delta; + uint8_t new_len = 0; + if (code_len == kCodeLengthRepeatCode) { + new_len = prev_code_len; + } + if (repeat_code_len != new_len) { + repeat = 0; + repeat_code_len = new_len; + } + old_repeat = repeat; + if (repeat > 0) { + repeat -= 2; + repeat <<= extra_bits; + } + repeat += (int)br->ReadBits(extra_bits) + 3; + repeat_delta = repeat - old_repeat; + if (symbol + repeat_delta > num_symbols) { + return 0; + } + memset(&code_lengths[symbol], repeat_code_len, (size_t)repeat_delta); + symbol += repeat_delta; + if (repeat_code_len != 0) { + space -= repeat_delta << (15 - repeat_code_len); + } + } + } + if (space != 0) { + return 0; + } + memset(&code_lengths[symbol], 0, (size_t)(num_symbols - symbol)); + return true; +} + +/* Reads a simple prefix code that lists 1 to 4 symbols explicitly and fills `table`, replicating the entries until it holds 1 << kHuffmanTableBits of them. Returns false when a symbol is not below `alphabet_size` or when a symbol is listed twice. */ +static JXL_INLINE bool ReadSimpleCode(size_t alphabet_size, BitReader* br, + HuffmanCode* table) { + size_t max_bits = + (alphabet_size > 1u) ? FloorLog2Nonzero(alphabet_size - 1u) + 1 : 0; + + size_t num_symbols = br->ReadFixedBits<2>() + 1; + + uint16_t symbols[4] = {0}; + for (size_t i = 0; i < num_symbols; ++i) { + uint16_t symbol = br->ReadBits(max_bits); + if (symbol >= alphabet_size) { + return false; + } + symbols[i] = symbol; + } + + for (size_t i = 0; i < num_symbols - 1; ++i) { + for (size_t j = i + 1; j < num_symbols; ++j) { + if (symbols[i] == symbols[j]) return false; + } + } + + // 4 symbols have two options to encode; one extra bit selects between them.
+ if (num_symbols == 4) num_symbols += br->ReadFixedBits<1>(); + + const auto swap_symbols = [&symbols](size_t i, size_t j) { + uint16_t t = symbols[j]; + symbols[j] = symbols[i]; + symbols[i] = t; + }; + + size_t table_size = 1; + switch (num_symbols) { + case 1: + table[0] = {0, symbols[0]}; + break; + case 2: + if (symbols[0] > symbols[1]) swap_symbols(0, 1); + table[0] = {1, symbols[0]}; + table[1] = {1, symbols[1]}; + table_size = 2; + break; + case 3: + if (symbols[1] > symbols[2]) swap_symbols(1, 2); + table[0] = {1, symbols[0]}; + table[2] = {1, symbols[0]}; + table[1] = {2, symbols[1]}; + table[3] = {2, symbols[2]}; + table_size = 4; + break; + case 4: { + for (size_t i = 0; i < 3; ++i) { + for (size_t j = i + 1; j < 4; ++j) { + if (symbols[i] > symbols[j]) swap_symbols(i, j); + } + } + table[0] = {2, symbols[0]}; + table[2] = {2, symbols[1]}; + table[1] = {2, symbols[2]}; + table[3] = {2, symbols[3]}; + table_size = 4; + break; + } + case 5: { + if (symbols[2] > symbols[3]) swap_symbols(2, 3); + table[0] = {1, symbols[0]}; + table[1] = {2, symbols[1]}; + table[2] = {1, symbols[0]}; + table[3] = {3, symbols[2]}; + table[4] = {1, symbols[0]}; + table[5] = {2, symbols[1]}; + table[6] = {1, symbols[0]}; + table[7] = {3, symbols[3]}; + table_size = 8; + break; + } + default: { + // Unreachable.
+ return false; + } + } + + const uint32_t goal_size = 1u << kHuffmanTableBits; + while (table_size != goal_size) { + memcpy(&table[table_size], &table[0], + (size_t)table_size * sizeof(table[0])); + table_size <<= 1; + } + + return true; +} + +/* Reads a prefix code for `alphabet_size` symbols from `br` and builds `table_` from it. Returns false when `alphabet_size` exceeds 1 << PREFIX_MAX_BITS, when the code lengths are rejected, or when BuildHuffmanTable reports a size of 0. NOTE(review): the constant 376 added to the table size presumably bounds the second-level entries -- confirm against huffman_table.h. */ +bool HuffmanDecodingData::ReadFromBitStream(size_t alphabet_size, + BitReader* br) { + if (alphabet_size > (1 << PREFIX_MAX_BITS)) return false; + + /* simple_code_or_skip is used as follows: + 1 for simple code; + 0 for no skipping, 2 skips 2 code lengths, 3 skips 3 code lengths */ + uint32_t simple_code_or_skip = br->ReadFixedBits<2>(); + if (simple_code_or_skip == 1u) { + table_.resize(1u << kHuffmanTableBits); + return ReadSimpleCode(alphabet_size, br, table_.data()); + } + + std::vector code_lengths(alphabet_size, 0); + uint8_t code_length_code_lengths[kCodeLengthCodes] = {0}; + int space = 32; + int num_codes = 0; + /* Static Huffman code for the code length code lengths */ + static const HuffmanCode huff[16] = { + {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 1}, + {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 5}, + }; + for (size_t i = simple_code_or_skip; i < kCodeLengthCodes && space > 0; ++i) { + const int code_len_idx = kCodeLengthCodeOrder[i]; + const HuffmanCode* p = huff; + uint8_t v; + br->Refill(); + p += br->PeekFixedBits<4>(); + br->Consume(p->bits); + v = (uint8_t)p->value; + code_length_code_lengths[code_len_idx] = v; + if (v != 0) { + space -= (32u >> v); + ++num_codes; + } + } + bool ok = (num_codes == 1 || space == 0) && + ReadHuffmanCodeLengths(code_length_code_lengths, alphabet_size, + &code_lengths[0], br); + + if (!ok) return false; + uint16_t counts[16] = {0}; + for (size_t i = 0; i < alphabet_size; ++i) { + ++counts[code_lengths[i]]; + } + table_.resize(alphabet_size + 376); + uint32_t table_size = + BuildHuffmanTable(table_.data(), kHuffmanTableBits, &code_lengths[0], + alphabet_size, &counts[0]); + table_.resize(table_size); + return (table_size > 0); +} + +//
Decodes the next Huffman coded symbol from the bit-stream. +uint16_t HuffmanDecodingData::ReadSymbol(BitReader* br) const { + size_t n_bits; + const HuffmanCode* table = table_.data(); + table += br->PeekBits(kHuffmanTableBits); + n_bits = table->bits; + /* A length above kHuffmanTableBits means the root entry is a link: consume the root bits, then index the sub-table found at offset table->value with the remaining bits. */ + if (n_bits > kHuffmanTableBits) { + br->Consume(kHuffmanTableBits); + n_bits -= kHuffmanTableBits; + table += table->value; + table += br->PeekBits(n_bits); + } + br->Consume(table->bits); + return table->value; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.h new file mode 100644 index 0000000000..162c3e309c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_huffman.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_HUFFMAN_H_ +#define LIB_JXL_DEC_HUFFMAN_H_ + +#include +#include + +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/huffman_table.h" + +namespace jxl { + +static constexpr size_t kHuffmanTableBits = 8u; + +struct HuffmanDecodingData { + // Decodes the Huffman code lengths from the bit-stream and fills in the + // pre-allocated table with the corresponding 2-level Huffman decoding table. + // Returns false if the Huffman code lengths cannot be decoded. + bool ReadFromBitStream(size_t alphabet_size, BitReader* br); + + uint16_t ReadSymbol(BitReader* br) const; + + std::vector table_; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_HUFFMAN_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc new file mode 100644 index 0000000000..64773eb4e4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.cc @@ -0,0 +1,592 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_modular.h" + +#include + +#include + +#include "lib/jxl/frame_header.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_modular.cc" +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Rebind; + +/* Writes the sum of row_in[x] and row_in_Y[x], converted to float and scaled by `factor`, to row_out[x]. The loop always loads and stores whole vectors of Lanes(di) elements, so the rows are presumably padded past `xsize` -- TODO confirm with the callers. */ +void MultiplySum(const size_t xsize, + const pixel_type* const JXL_RESTRICT row_in, + const pixel_type* const JXL_RESTRICT row_in_Y, + const float factor, float* const JXL_RESTRICT row_out) { + const HWY_FULL(float) df; + const Rebind di; // assumes pixel_type <= float + const auto factor_v = Set(df, factor); + for (size_t x = 0; x < xsize; x += Lanes(di)) { + const auto in = Load(di, row_in + x) + Load(di, row_in_Y + x); + const auto out = ConvertTo(df, in) * factor_v; + Store(out, df, row_out + x); + } +} + +/* Converts row_in to float, scales it by `factor` and stores the same values into row `y` of all three planes of `decoded`. The channel argument is unused. */ +void RgbFromSingle(const size_t xsize, + const pixel_type* const JXL_RESTRICT row_in, + const float factor, Image3F* decoded, size_t /*c*/, + size_t y) { + const HWY_FULL(float) df; + const Rebind di; // assumes pixel_type <= float + + float* const JXL_RESTRICT row_out_r = decoded->PlaneRow(0, y); + float* const JXL_RESTRICT row_out_g = decoded->PlaneRow(1, y); + float* const JXL_RESTRICT row_out_b = decoded->PlaneRow(2, y); + + const auto factor_v = Set(df, factor); + for (size_t x = 0; x < xsize; x += Lanes(di)) { + const auto in = Load(di, row_in + x); + const auto out = ConvertTo(df, in) * factor_v; + Store(out, df, row_out_r + x); + Store(out, df, row_out_g + x); + Store(out, df, row_out_b + x); + } +} + +// Same
signature as RgbFromSingle so we can assign to the same pointer. +void SingleFromSingle(const size_t xsize, + const pixel_type* const JXL_RESTRICT row_in, + const float factor, Image3F* decoded, size_t c, + size_t y) { + const HWY_FULL(float) df; + const Rebind di; // assumes pixel_type <= float + + float* const JXL_RESTRICT row_out = decoded->PlaneRow(c, y); + + const auto factor_v = Set(df, factor); + for (size_t x = 0; x < xsize; x += Lanes(di)) { + const auto in = Load(di, row_in + x); + const auto out = ConvertTo(df, in) * factor_v; + Store(out, df, row_out + x); + } +} +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(MultiplySum); // Local function +HWY_EXPORT(RgbFromSingle); // Local function +HWY_EXPORT(SingleFromSingle); // Local function + +// convert custom [bits]-bit float (with [exp_bits] exponent bits) stored as int +// back to binary32 float +void int_to_float(const pixel_type* const JXL_RESTRICT row_in, + float* const JXL_RESTRICT row_out, const size_t xsize, + const int bits, const int exp_bits) { + if (bits == 32) { + JXL_ASSERT(sizeof(pixel_type) == sizeof(float)); + JXL_ASSERT(exp_bits == 8); + memcpy(row_out, row_in, xsize * sizeof(float)); + return; + } + int exp_bias = (1 << (exp_bits - 1)) - 1; + int sign_shift = bits - 1; + int mant_bits = bits - exp_bits - 1; + int mant_shift = 23 - mant_bits; + for (size_t x = 0; x < xsize; ++x) { + uint32_t f; + memcpy(&f, &row_in[x], 4); + int signbit = (f >> sign_shift); + f &= (1 << sign_shift) - 1; + if (f == 0) { + row_out[x] = (signbit ? -0.f : 0.f); + continue; + } + int exp = (f >> mant_bits); + int mantissa = (f & ((1 << mant_bits) - 1)); + mantissa <<= mant_shift; + // Try to normalize only if there is space for maneuver.
+ if (exp == 0 && exp_bits < 8) { + // subnormal number + while ((mantissa & 0x800000) == 0) { + mantissa <<= 1; + exp--; + } + exp++; + // remove leading 1 because it is implicit now + mantissa &= 0x7fffff; + } + exp -= exp_bias; + // broke up the arbitrary float into its parts, now reassemble into + // binary32 + exp += 127; + JXL_ASSERT(exp >= 0); + f = (signbit ? 0x80000000 : 0); + f |= (exp << 23); + f |= mantissa; + memcpy(&row_out[x], &f, 4); + } +} + +/* Reads the optional global tree and its histograms, sizes a frame-wide modular image (colour channels only when the frame encoding is kModular, followed by the extra channels), decodes the global modular stream into it and moves the result into `full_image`. With `allow_truncated_group` set, a non-fatal decode status is returned to the caller instead of being treated as an error. */ +Status ModularFrameDecoder::DecodeGlobalInfo(BitReader* reader, + const FrameHeader& frame_header, + bool allow_truncated_group) { + bool decode_color = frame_header.encoding == FrameEncoding::kModular; + const auto& metadata = frame_header.nonserialized_metadata->m; + bool is_gray = metadata.color_encoding.IsGray(); + size_t nb_chans = 3; + if (is_gray && frame_header.color_transform == ColorTransform::kNone) { + nb_chans = 1; + } + bool has_tree = reader->ReadBits(1); + if (has_tree) { + size_t tree_size_limit = + 1024 + frame_dim.xsize * frame_dim.ysize * nb_chans; + JXL_RETURN_IF_ERROR(DecodeTree(reader, &tree, tree_size_limit)); + JXL_RETURN_IF_ERROR( + DecodeHistograms(reader, (tree.size() + 1) / 2, &code, &context_map)); + } + do_color = decode_color; + if (!do_color) nb_chans = 0; + size_t nb_extra = metadata.extra_channel_info.size(); + + bool fp = metadata.bit_depth.floating_point_sample; + + // bits_per_sample is just metadata for XYB images.
+ if (metadata.bit_depth.bits_per_sample >= 32 && do_color && + frame_header.color_transform != ColorTransform::kXYB) { + if (metadata.bit_depth.bits_per_sample == 32 && fp == false) { + return JXL_FAILURE("uint32_t not supported in dec_modular"); + } else if (metadata.bit_depth.bits_per_sample > 32) { + return JXL_FAILURE("bits_per_sample > 32 not supported"); + } + } + + Image gi(frame_dim.xsize, frame_dim.ysize, metadata.bit_depth.bits_per_sample, + nb_chans + nb_extra); + + if (frame_header.color_transform == ColorTransform::kYCbCr) { + for (size_t c = 0; c < nb_chans; c++) { + gi.channel[c].hshift = frame_header.chroma_subsampling.HShift(c); + gi.channel[c].vshift = frame_header.chroma_subsampling.VShift(c); + size_t xsize_shifted = + DivCeil(frame_dim.xsize, 1 << gi.channel[c].hshift); + size_t ysize_shifted = + DivCeil(frame_dim.ysize, 1 << gi.channel[c].vshift); + gi.channel[c].shrink(xsize_shifted, ysize_shifted); + } + } + + for (size_t ec = 0, c = nb_chans; ec < nb_extra; ec++, c++) { + size_t ecups = frame_header.extra_channel_upsampling[ec]; + gi.channel[c].shrink(DivCeil(frame_dim.xsize_upsampled, ecups), + DivCeil(frame_dim.ysize_upsampled, ecups)); + gi.channel[c].hshift = gi.channel[c].vshift = + CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling); + } + + ModularOptions options; + options.max_chan_size = frame_dim.group_dim; + options.group_dim = frame_dim.group_dim; + Status dec_status = ModularGenericDecompress( + reader, gi, &global_header, ModularStreamId::Global().ID(frame_dim), + &options, + /*undo_transforms=*/-2, &tree, &code, &context_map, + allow_truncated_group); + if (!allow_truncated_group) JXL_RETURN_IF_ERROR(dec_status); + if (dec_status.IsFatalError()) { + return JXL_FAILURE("Failed to decode global modular info"); + } + + // TODO(eustas): are we sure this can be done after partial decode?
+ have_something = false; + for (size_t c = 0; c < gi.channel.size(); c++) { + Channel& gic = gi.channel[c]; + if (c >= gi.nb_meta_channels && gic.w < frame_dim.group_dim && + gic.h < frame_dim.group_dim) + have_something = true; + } + full_image = std::move(gi); + return dec_status; +} + +/* Fills the part of `full_image` covered by `rect`, for every channel whose min(hshift, vshift) lies between `minShift` and `maxShift` inclusive. Meta channels and the leading channels that are no larger than a group are skipped. With `zerofill` the region is zeroed and nothing is read from `reader`; otherwise the group is decoded into a temporary image and copied in row by row. */ +Status ModularFrameDecoder::DecodeGroup(const Rect& rect, BitReader* reader, + int minShift, int maxShift, + const ModularStreamId& stream, + bool zerofill) { + JXL_DASSERT(stream.kind == ModularStreamId::kModularDC || + stream.kind == ModularStreamId::kModularAC); + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + Image gi(xsize, ysize, full_image.bitdepth, 0); + // start at the first bigger-than-groupsize non-metachannel + size_t c = full_image.nb_meta_channels; + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + if (fc.w > frame_dim.group_dim || fc.h > frame_dim.group_dim) break; + } + size_t beginc = c; + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + int shift = std::min(fc.hshift, fc.vshift); + if (shift > maxShift) continue; + if (shift < minShift) continue; + Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift, + rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h); + if (r.xsize() == 0 || r.ysize() == 0) continue; + Channel gc(r.xsize(), r.ysize()); + gc.hshift = fc.hshift; + gc.vshift = fc.vshift; + gi.channel.emplace_back(std::move(gc)); + } + if (zerofill) { + int gic = 0; + for (c = beginc; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + int shift = std::min(fc.hshift, fc.vshift); + if (shift > maxShift) continue; + if (shift < minShift) continue; + Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift, + rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h); + if (r.xsize() == 0 || r.ysize() == 0) continue; + for (size_t y = 0; y < r.ysize(); ++y) { + pixel_type* const JXL_RESTRICT row_out = r.Row(&fc.plane,
y); + memset(row_out, 0, r.xsize() * sizeof(*row_out)); + } + gic++; + } + return true; + } + ModularOptions options; + if (!ModularGenericDecompress( + reader, gi, /*header=*/nullptr, stream.ID(frame_dim), &options, + /*undo_transforms=*/-1, &tree, &code, &context_map)) + return JXL_FAILURE("Failed to decode modular group"); + int gic = 0; + for (c = beginc; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + int shift = std::min(fc.hshift, fc.vshift); + if (shift > maxShift) continue; + if (shift < minShift) continue; + Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift, + rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h); + if (r.xsize() == 0 || r.ysize() == 0) continue; + for (size_t y = 0; y < r.ysize(); ++y) { + pixel_type* const JXL_RESTRICT row_out = r.Row(&fc.plane, y); + const pixel_type* const JXL_RESTRICT row_in = gi.channel[gic].Row(y); + for (size_t x = 0; x < r.xsize(); ++x) { + row_out[x] = row_in[x]; + } + } + gic++; + } + return true; +} +/* Decodes the three-channel modular image that holds the quantized DC of DC group `group_id` and passes it to DequantDC together with a scale of 1 / (1 << extra_precision), where extra_precision is read from the first two bits of the stream. */ +Status ModularFrameDecoder::DecodeVarDCTDC(size_t group_id, BitReader* reader, + PassesDecoderState* dec_state) { + const Rect r = dec_state->shared->DCGroupRect(group_id); + // TODO(eustas): investigate if we could reduce the impact of + // EvalRationalPolynomial; generally speaking, the limit is + // 2**(128/(3*magic)), where 128 comes from IEEE 754 exponent, + // 3 comes from XybToRgb that cubes the values, and "magic" is + // the sum of all other contributions. 2**18 is known to lead + // to NaN on input found by fuzzing (see commit message). + Image image(r.xsize(), r.ysize(), full_image.bitdepth, 3); + size_t stream_id = ModularStreamId::VarDCTDC(group_id).ID(frame_dim); + reader->Refill(); + size_t extra_precision = reader->ReadFixedBits<2>(); + float mul = 1.0f / (1 << extra_precision); + ModularOptions options; + for (size_t c = 0; c < 3; c++) { + Channel& ch = image.channel[c < 2 ?
c ^ 1 : c]; + ch.w >>= dec_state->shared->frame_header.chroma_subsampling.HShift(c); + ch.h >>= dec_state->shared->frame_header.chroma_subsampling.VShift(c); + ch.shrink(); + } + if (!ModularGenericDecompress( + reader, image, /*header=*/nullptr, stream_id, &options, + /*undo_transforms=*/-1, &tree, &code, &context_map)) { + return JXL_FAILURE("Failed to decode modular DC group"); + } + DequantDC(r, &dec_state->shared_storage.dc_storage, + &dec_state->shared_storage.quant_dc, image, + dec_state->shared->quantizer.MulDC(), mul, + dec_state->shared->cmap.DCFactors(), + dec_state->shared->frame_header.chroma_subsampling, + dec_state->shared->block_ctx_map); + return true; +} + +/* Decodes the AC metadata of DC group `group_id` into dec_state->shared_storage: the ytox and ytob maps, the EPF sharpness of every block, and the AC strategy and raw quant value of every block not already covered by an earlier strategy. Fails on an out-of-range sharpness, an invalid strategy, a multi-block strategy used with chroma subsampling, a strategy crossing an AC group or the image limits, or more entries than the `count` announced in the stream. */ +Status ModularFrameDecoder::DecodeAcMetadata(size_t group_id, BitReader* reader, + PassesDecoderState* dec_state) { + const Rect r = dec_state->shared->DCGroupRect(group_id); + size_t upper_bound = r.xsize() * r.ysize(); + reader->Refill(); + size_t count = reader->ReadBits(CeilLog2Nonzero(upper_bound)) + 1; + size_t stream_id = ModularStreamId::ACMetadata(group_id).ID(frame_dim); + // YToX, YToB, ACS + QF, EPF + Image image(r.xsize(), r.ysize(), full_image.bitdepth, 4); + static_assert(kColorTileDimInBlocks == 8, "Color tile size changed"); + Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3); + image.channel[0] = Channel(cr.xsize(), cr.ysize(), 3, 3); + image.channel[1] = Channel(cr.xsize(), cr.ysize(), 3, 3); + image.channel[2] = Channel(count, 2, 0, 0); + ModularOptions options; + if (!ModularGenericDecompress( + reader, image, /*header=*/nullptr, stream_id, &options, + /*undo_transforms=*/-1, &tree, &code, &context_map)) { + return JXL_FAILURE("Failed to decode AC metadata"); + } + ConvertPlaneAndClamp(Rect(image.channel[0].plane), image.channel[0].plane, cr, + &dec_state->shared_storage.cmap.ytox_map); + ConvertPlaneAndClamp(Rect(image.channel[1].plane), image.channel[1].plane, cr, + &dec_state->shared_storage.cmap.ytob_map); + size_t num = 0; + bool is444 =
dec_state->shared->frame_header.chroma_subsampling.Is444(); + auto& ac_strategy = dec_state->shared_storage.ac_strategy; + size_t xlim = std::min(ac_strategy.xsize(), r.x0() + r.xsize()); + size_t ylim = std::min(ac_strategy.ysize(), r.y0() + r.ysize()); + uint32_t local_used_acs = 0; + for (size_t iy = 0; iy < r.ysize(); iy++) { + size_t y = r.y0() + iy; + int* row_qf = r.Row(&dec_state->shared_storage.raw_quant_field, iy); + uint8_t* row_epf = r.Row(&dec_state->shared_storage.epf_sharpness, iy); + int* row_in_1 = image.channel[2].plane.Row(0); + int* row_in_2 = image.channel[2].plane.Row(1); + int* row_in_3 = image.channel[3].plane.Row(iy); + for (size_t ix = 0; ix < r.xsize(); ix++) { + size_t x = r.x0() + ix; + int sharpness = row_in_3[ix]; + if (sharpness < 0 || sharpness >= LoopFilter::kEpfSharpEntries) { + return JXL_FAILURE("Corrupted sharpness field"); + } + row_epf[ix] = sharpness; + if (ac_strategy.IsValid(x, y)) { + continue; + } + + if (num >= count) return JXL_FAILURE("Corrupted stream"); + + if (!AcStrategy::IsRawStrategyValid(row_in_1[num])) { + return JXL_FAILURE("Invalid AC strategy"); + } + local_used_acs |= 1u << row_in_1[num]; + AcStrategy acs = AcStrategy::FromRawStrategy(row_in_1[num]); + if ((acs.covered_blocks_x() > 1 || acs.covered_blocks_y() > 1) && + !is444) { + return JXL_FAILURE( + "AC strategy not compatible with chroma subsampling"); + } + // Ensure that blocks do not overflow *AC* groups.
+ size_t next_x_ac_block = (x / kGroupDimInBlocks + 1) * kGroupDimInBlocks; + size_t next_y_ac_block = (y / kGroupDimInBlocks + 1) * kGroupDimInBlocks; + size_t next_x_dct_block = x + acs.covered_blocks_x(); + size_t next_y_dct_block = y + acs.covered_blocks_y(); + if (next_x_dct_block > next_x_ac_block || next_x_dct_block > xlim) { + return JXL_FAILURE("Invalid AC strategy, x overflow"); + } + if (next_y_dct_block > next_y_ac_block || next_y_dct_block > ylim) { + return JXL_FAILURE("Invalid AC strategy, y overflow"); + } + JXL_RETURN_IF_ERROR( + ac_strategy.SetNoBoundsCheck(x, y, AcStrategy::Type(row_in_1[num]))); + row_qf[ix] = + 1 + std::max(0, std::min(Quantizer::kQuantMax - 1, row_in_2[num])); + num++; + } + } + dec_state->used_acs |= local_used_acs; + if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) { + ComputeSigma(r, dec_state); + } + return true; +} + +/* Undoes the global transforms on `full_image`, then converts its integer channels to float: the colour channels into dec_state->decoded (only when do_color is set) and the extra channels into dec_state->extra_channels. `pool` is dropped when the image holds fewer pixels than one group. */ +Status ModularFrameDecoder::FinalizeDecoding(PassesDecoderState* dec_state, + jxl::ThreadPool* pool, + ImageBundle* output) { + Image& gi = full_image; + size_t xsize = gi.w; + size_t ysize = gi.h; + + const auto& frame_header = dec_state->shared->frame_header; + const auto* metadata = frame_header.nonserialized_metadata; + + // Don't use threads if total image size is smaller than a group + if (xsize * ysize < frame_dim.group_dim * frame_dim.group_dim) pool = nullptr; + + // Undo the global transforms + gi.undo_transforms(global_header.wp_header, -1, pool); + if (gi.error) return JXL_FAILURE("Undoing transforms failed"); + + auto& decoded = dec_state->decoded; + + int c = 0; + if (do_color) { + const bool rgb_from_gray = + metadata->m.color_encoding.IsGray() && + frame_header.color_transform == ColorTransform::kNone; + const bool fp = metadata->m.bit_depth.floating_point_sample; + + for (; c < 3; c++) { + float factor = full_image.bitdepth < 32 + ?
1.f / ((1u << full_image.bitdepth) - 1) + : 0; + int c_in = c; + if (frame_header.color_transform == ColorTransform::kXYB) { + factor = dec_state->shared->matrices.DCQuants()[c]; + // XYB is encoded as YX(B-Y) + if (c < 2) c_in = 1 - c; + } else if (rgb_from_gray) { + c_in = 0; + } + // TODO(eustas): could we detect it on earlier stage? + if (gi.channel[c_in].w == 0 || gi.channel[c_in].h == 0) { + return JXL_FAILURE("Empty image"); + } + size_t xsize_shifted = DivCeil(xsize, 1 << gi.channel[c_in].hshift); + size_t ysize_shifted = DivCeil(ysize, 1 << gi.channel[c_in].vshift); + if (ysize_shifted != gi.channel[c_in].h || + xsize_shifted != gi.channel[c_in].w) { + return JXL_FAILURE("Dimension mismatch"); + } + if (frame_header.color_transform == ColorTransform::kXYB && c == 2) { + JXL_ASSERT(!fp); + RunOnPool( + pool, 0, ysize_shifted, jxl::ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t y = task; + const pixel_type* const JXL_RESTRICT row_in = + gi.channel[c_in].Row(y); + const pixel_type* const JXL_RESTRICT row_in_Y = + gi.channel[0].Row(y); + float* const JXL_RESTRICT row_out = decoded.PlaneRow(c, y); + HWY_DYNAMIC_DISPATCH(MultiplySum) + (xsize_shifted, row_in, row_in_Y, factor, row_out); + }, + "ModularIntToFloat"); + } else if (fp) { + int bits = metadata->m.bit_depth.bits_per_sample; + int exp_bits = metadata->m.bit_depth.exponent_bits_per_sample; + RunOnPool( + pool, 0, ysize_shifted, jxl::ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t y = task; + const pixel_type* const JXL_RESTRICT row_in = + gi.channel[c_in].Row(y); + float* const JXL_RESTRICT row_out = decoded.PlaneRow(c, y); + int_to_float(row_in, row_out, xsize_shifted, bits, exp_bits); + }, + "ModularIntToFloat_losslessfloat"); + } else { + RunOnPool( + pool, 0, ysize_shifted, jxl::ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t y = task; + const pixel_type* const JXL_RESTRICT row_in = +
gi.channel[c_in].Row(y); + if (rgb_from_gray) { + HWY_DYNAMIC_DISPATCH(RgbFromSingle) + (xsize_shifted, row_in, factor, &decoded, c, y); + } else { + HWY_DYNAMIC_DISPATCH(SingleFromSingle) + (xsize_shifted, row_in, factor, &decoded, c, y); + } + }, + "ModularIntToFloat"); + } + if (rgb_from_gray) { + break; + } + } + if (rgb_from_gray) { + c = 1; + } + } + for (size_t ec = 0; ec < dec_state->extra_channels.size(); ec++, c++) { + const ExtraChannelInfo& eci = output->metadata()->extra_channel_info[ec]; + int bits = eci.bit_depth.bits_per_sample; + int exp_bits = eci.bit_depth.exponent_bits_per_sample; + bool fp = eci.bit_depth.floating_point_sample; + JXL_ASSERT(fp || bits < 32); + const float mul = fp ? 0 : (1.0f / ((1u << bits) - 1)); + size_t ecups = frame_header.extra_channel_upsampling[ec]; + const size_t ec_xsize = DivCeil(frame_dim.xsize_upsampled, ecups); + const size_t ec_ysize = DivCeil(frame_dim.ysize_upsampled, ecups); + for (size_t y = 0; y < ec_ysize; ++y) { + float* const JXL_RESTRICT row_out = dec_state->extra_channels[ec].Row(y); + const pixel_type* const JXL_RESTRICT row_in = gi.channel[c].Row(y); + if (fp) { + int_to_float(row_in, row_out, ec_xsize, bits, exp_bits); + } else { + for (size_t x = 0; x < ec_xsize; ++x) { + row_out[x] = row_in[x] * mul; + } + } + } + } + return true; +} + +static constexpr const float kAlmostZero = 1e-8f; + +Status ModularFrameDecoder::DecodeQuantTable( + size_t required_size_x, size_t required_size_y, BitReader* br, + QuantEncoding* encoding, size_t idx, + ModularFrameDecoder* modular_frame_decoder) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->qraw.qtable_den)); + if (encoding->qraw.qtable_den < kAlmostZero) { + // qtable[] values are already checked for <= 0 so the denominator may not + // be negative.
+ return JXL_FAILURE("Invalid qtable_den: value too small"); + } + Image image(required_size_x, required_size_y, 8, 3); + ModularOptions options; + if (modular_frame_decoder) { + JXL_RETURN_IF_ERROR(ModularGenericDecompress( + br, image, /*header=*/nullptr, + ModularStreamId::QuantTable(idx).ID(modular_frame_decoder->frame_dim), + &options, /*undo_transforms=*/-1, &modular_frame_decoder->tree, + &modular_frame_decoder->code, &modular_frame_decoder->context_map)); + } else { + JXL_RETURN_IF_ERROR(ModularGenericDecompress(br, image, /*header=*/nullptr, + 0, &options, + /*undo_transforms=*/-1)); + } + if (!encoding->qraw.qtable) { + encoding->qraw.qtable = new std::vector(); + } + encoding->qraw.qtable->resize(required_size_x * required_size_y * 3); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < required_size_y; y++) { + int* JXL_RESTRICT row = image.channel[c].Row(y); + for (size_t x = 0; x < required_size_x; x++) { + (*encoding->qraw.qtable)[c * required_size_x * required_size_y + + y * required_size_x + x] = row[x]; + if (row[x] <= 0) { + return JXL_FAILURE("Invalid raw quantization table"); + } + } + } + } + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.h new file mode 100644 index 0000000000..7ae2418471 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_modular.h @@ -0,0 +1,125 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+#ifndef LIB_JXL_DEC_MODULAR_H_
+#define LIB_JXL_DEC_MODULAR_H_
+
+#include
+
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_params.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+struct ModularStreamId {  // Names one modular sub-stream; ID() flattens it.
+  enum Kind {
+    kGlobalData,
+    kVarDCTDC,
+    kModularDC,
+    kACMetadata,
+    kQuantTable,
+    kModularAC
+  };
+  Kind kind;
+  size_t quant_table_id;  // Only read by ID() for kQuantTable.
+  size_t group_id;  // DC or AC group id.
+  size_t pass_id;   // Only for kModularAC.
+  size_t ID(const FrameDimensions& frame_dim) const {  // Flat stream index.
+    size_t id = 0;
+    switch (kind) {
+      case kGlobalData:  // Always id 0.
+        id = 0;
+        break;
+      case kVarDCTDC:  // Block starting at id 1.
+        id = 1 + group_id;
+        break;
+      case kModularDC:  // Block starting at 1 + num_dc_groups.
+        id = 1 + frame_dim.num_dc_groups + group_id;
+        break;
+      case kACMetadata:  // Block starting at 1 + 2 * num_dc_groups.
+        id = 1 + 2 * frame_dim.num_dc_groups + group_id;
+        break;
+      case kQuantTable:  // Block starting at 1 + 3 * num_dc_groups.
+        id = 1 + 3 * frame_dim.num_dc_groups + quant_table_id;
+        break;
+      case kModularAC:  // After kNum quant-table ids; num_groups ids per pass.
+        id = 1 + 3 * frame_dim.num_dc_groups + DequantMatrices::kNum +
+             frame_dim.num_groups * pass_id + group_id;
+        break;
+    };  // The ';' after the switch is an empty statement.
+    return id;
+  }
+  static ModularStreamId Global() {
+    return ModularStreamId{kGlobalData, 0, 0, 0};
+  }
+  static ModularStreamId VarDCTDC(size_t group_id) {
+    return ModularStreamId{kVarDCTDC, 0, group_id, 0};
+  }
+  static ModularStreamId ModularDC(size_t group_id) {
+    return ModularStreamId{kModularDC, 0, group_id, 0};
+  }
+  static ModularStreamId ACMetadata(size_t group_id) {
+    return ModularStreamId{kACMetadata, 0, group_id, 0};
+  }
+  static ModularStreamId QuantTable(size_t quant_table_id) {
+    JXL_ASSERT(quant_table_id < DequantMatrices::kNum);
+    return ModularStreamId{kQuantTable, quant_table_id, 0, 0};
+  }
+  static ModularStreamId ModularAC(size_t group_id, size_t pass_id) {
+    return ModularStreamId{kModularAC, 0, group_id, pass_id};
+  }
+  static size_t Num(const FrameDimensions& frame_dim, size_t passes) {
+    return ModularAC(0, passes).ID(frame_dim);  // First id of pass `passes`.
+  }
+};
+
+class ModularFrameDecoder {
+ public:
+  void Init(const FrameDimensions& frame_dim) { this->frame_dim = frame_dim; }
+  Status DecodeGlobalInfo(BitReader* reader, const FrameHeader& frame_header,
+                          bool allow_truncated_group = false);
+  Status DecodeGroup(const Rect& rect, BitReader* reader, int minShift,
+                     int maxShift, const ModularStreamId& stream,
+                     bool zerofill);
+  // Decodes a VarDCT DC group (`group_id`) from the given `reader`.
+  Status DecodeVarDCTDC(size_t group_id, BitReader* reader,
+                        PassesDecoderState* dec_state);
+  // Decodes a VarDCT AC Metadata group (`group_id`) from the given `reader`.
+  Status DecodeAcMetadata(size_t group_id, BitReader* reader,
+                          PassesDecoderState* dec_state);
+  // Decodes a RAW quant table from `br` into the given `encoding`, of size
+  // `required_size_x x required_size_y`. If `modular_frame_decoder` is passed,
+  // its global tree is used, otherwise no global tree is used.
+  static Status DecodeQuantTable(size_t required_size_x, size_t required_size_y,
+                                 BitReader* br, QuantEncoding* encoding,
+                                 size_t idx,
+                                 ModularFrameDecoder* modular_frame_decoder);
+  Status FinalizeDecoding(PassesDecoderState* dec_state, jxl::ThreadPool* pool,
+                          ImageBundle* output);
+  bool have_dc() const { return have_something; }  // Returns have_something.
+
+ private:
+  Image full_image;
+  FrameDimensions frame_dim;  // Set by Init().
+  bool do_color;
+  bool have_something;
+  Tree tree;
+  ANSCode code;
+  std::vector context_map;
+  GroupHeader global_header;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_MODULAR_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc
new file mode 100644
index 0000000000..240b8aff21
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.cc
@@ -0,0 +1,295 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_noise.h"
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_noise.cc"
+#include
+#include
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/xorshift128plus-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_CAPPED(float, kBlockDim);
+using DI = hwy::HWY_NAMESPACE::Rebind;
+using DI8 = hwy::HWY_NAMESPACE::Repartition;
+
+// Converts one vector's worth of random bits to floats in [1, 2).
+// NOTE: as the convolution kernel sums to 0, it doesn't matter if inputs are in
+void BitsToFloat(const uint32_t* JXL_RESTRICT random_bits, + float* JXL_RESTRICT floats) { + const HWY_FULL(float) df; + const HWY_FULL(uint32_t) du; + + const auto bits = Load(du, random_bits); + // 1.0 + 23 random mantissa bits = [1, 2) + const auto rand12 = BitCast(df, ShiftRight<9>(bits) | Set(du, 0x3F800000)); + Store(rand12, df, floats); +} + +void RandomImage(Xorshift128Plus* rng, const Rect& rect, + ImageF* JXL_RESTRICT noise) { + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + + // May exceed the vector size, hence we have two loops over x below. + constexpr size_t kFloatsPerBatch = + Xorshift128Plus::N * sizeof(uint64_t) / sizeof(float); + HWY_ALIGN uint64_t batch[Xorshift128Plus::N]; + + const HWY_FULL(float) df; + const size_t N = Lanes(df); + + for (size_t y = 0; y < ysize; ++y) { + float* JXL_RESTRICT row = rect.Row(noise, y); + + size_t x = 0; + // Only entire batches (avoids exceeding the image padding). + for (; x + kFloatsPerBatch <= xsize; x += kFloatsPerBatch) { + rng->Fill(batch); + for (size_t i = 0; i < kFloatsPerBatch; i += Lanes(df)) { + BitsToFloat(reinterpret_cast(batch) + i, row + x + i); + } + } + + // Any remaining pixels, rounded up to vectors (safe due to padding). + rng->Fill(batch); + size_t batch_pos = 0; // < kFloatsPerBatch + for (; x < xsize; x += N) { + BitsToFloat(reinterpret_cast(batch) + batch_pos, + row + x); + batch_pos += N; + } + } +} + +// [0, max_value] +template +static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) { + const auto clamped = Min(x, max_value); + return ZeroIfNegative(clamped); +} + +// x is in [0+delta, 1+delta], delta ~= 0.06 +template +typename StrengthEval::V NoiseStrength(const StrengthEval& eval, + const typename StrengthEval::V x) { + return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f)); +} + +// TODO(veluca): SIMD-fy. 
+class StrengthEvalLut {  // Linear interpolation between noise_params.lut entries.
+ public:
+  using V = Vec;
+
+  explicit StrengthEvalLut(const NoiseParams& noise_params)
+#if HWY_TARGET == HWY_SCALAR
+      : noise_params_(noise_params)
+#endif
+  {
+#if HWY_TARGET != HWY_SCALAR
+    uint32_t lut[8];
+    memcpy(lut, noise_params.lut, sizeof(lut));  // Raw bits of the LUT entries.
+    for (size_t i = 0; i < 8; i++) {  // Split each entry into four bytes.
+      low16_lut[2 * i] = (lut[i] >> 0) & 0xFF;
+      low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF;
+      high16_lut[2 * i] = (lut[i] >> 16) & 0xFF;
+      high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF;
+    }
+#endif
+  }
+
+  V operator()(const V vx) const {
+    constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2;
+    auto scaled_vx = Max(Zero(D()), vx * Set(D(), kScale));
+    auto floor_x = Floor(scaled_vx);
+    auto frac_x = scaled_vx - floor_x;
+    floor_x = IfThenElse(scaled_vx >= Set(D(), kScale), Set(D(), kScale - 1),
+                         floor_x);  // At or past kScale: use the last segment.
+    frac_x = IfThenElse(scaled_vx >= Set(D(), kScale), Set(D(), 1), frac_x);
+    auto floor_x_int = ConvertTo(DI(), floor_x);
+#if HWY_TARGET == HWY_SCALAR
+    auto low = Set(D(), noise_params_.lut[floor_x_int.raw]);
+    auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]);
+#else
+    // Set each lane's bytes to {0, 0, 2x+1, 2x}.
+    auto floorx_indices_low =
+        floor_x_int * Set(DI(), 0x0202) + Set(DI(), 0x0100);
+    // Set each lane's bytes to {2x+1, 2x, 0, 0}.
+    auto floorx_indices_hi =
+        floor_x_int * Set(DI(), 0x02020000) + Set(DI(), 0x01000000);
+    // load LUT
+    auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut));
+    auto lowm = Set(DI(), 0xFFFF);
+    auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut));
+    auto him = Set(DI(), 0xFFFF0000);
+    // low = noise_params.lut[floor_x]
+    auto low =
+        BitCast(D(), (TableLookupBytes(low16, floorx_indices_low) & lowm) |
+                         (TableLookupBytes(hi16, floorx_indices_hi) & him));
+    // hi = noise_params.lut[floor_x+1]
+    floorx_indices_low += Set(DI(), 0x0202);
+    floorx_indices_hi += Set(DI(), 0x02020000);
+    auto hi =
+        BitCast(D(), (TableLookupBytes(low16, floorx_indices_low) & lowm) |
+                         (TableLookupBytes(hi16, floorx_indices_hi) & him));
+#endif
+    return MulAdd(hi - low, frac_x, low);  // low + (hi - low) * frac_x
+  }
+
+ private:
+#if HWY_TARGET != HWY_SCALAR
+  // noise_params.lut transformed into two 16-bit lookup tables.
+  HWY_ALIGN uint8_t high16_lut[16];
+  HWY_ALIGN uint8_t low16_lut[16];
+#else
+  const NoiseParams& noise_params_;
+#endif
+};
+
+template
+void AddNoiseToRGB(const D d, const Vec rnd_noise_r,
+                   const Vec rnd_noise_g, const Vec rnd_noise_cor,
+                   const Vec noise_strength_g, const Vec noise_strength_r,
+                   float ytox, float ytob, float* JXL_RESTRICT out_x,
+                   float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) {
+  const auto kRGCorr = Set(d, 0.9921875f);   // 127/128
+  const auto kRGNCorr = Set(d, 0.0078125f);  // 1/128
+
+  const auto red_noise = kRGNCorr * rnd_noise_r * noise_strength_r +
+                         kRGCorr * rnd_noise_cor * noise_strength_r;
+  const auto green_noise = kRGNCorr * rnd_noise_g * noise_strength_g +
+                           kRGCorr * rnd_noise_cor * noise_strength_g;
+
+  auto vx = Load(d, out_x);
+  auto vy = Load(d, out_y);
+  auto vb = Load(d, out_b);
+
+  vx += red_noise - green_noise + Set(d, ytox) * (red_noise + green_noise);
+  vy += red_noise + green_noise;
+  vb += Set(d, ytob) * (red_noise + green_noise);
+
+  Store(vx, d, out_x);
+  Store(vy, d, out_y);
+  Store(vb, d, out_b);
+}
+
+void AddNoise(const NoiseParams& noise_params, const Rect& noise_rect,
+              const Image3F& noise, const Rect& opsin_rect,
+              const ColorCorrelationMap& cmap, Image3F* opsin) {
+  if (!noise_params.HasAny()) return;  // Nothing to add.
+  const StrengthEvalLut noise_model(noise_params);
+  D d;
+  const auto half = Set(d, 0.5f);
+
+  const size_t xsize = opsin_rect.xsize();
+  const size_t ysize = opsin_rect.ysize();
+
+  // With the prior subtract-random Laplacian approximation, rnd_* ranges were
+  // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the
+  // normalizer is half of what it was before (0.5).
+  const auto norm_const = Set(d, 0.22f);
+
+  float ytox = cmap.YtoXRatio(0);
+  float ytob = cmap.YtoBRatio(0);
+
+  const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+
+  for (size_t y = 0; y < ysize; ++y) {
+    float* JXL_RESTRICT row_x = opsin_rect.PlaneRow(opsin, 0, y);
+    float* JXL_RESTRICT row_y = opsin_rect.PlaneRow(opsin, 1, y);
+    float* JXL_RESTRICT row_b = opsin_rect.PlaneRow(opsin, 2, y);
+    const float* JXL_RESTRICT row_rnd_r = noise_rect.ConstPlaneRow(noise, 0, y);
+    const float* JXL_RESTRICT row_rnd_g = noise_rect.ConstPlaneRow(noise, 1, y);
+    const float* JXL_RESTRICT row_rnd_c = noise_rect.ConstPlaneRow(noise, 2, y);
+    // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and
+    // shuffles are otherwise done on the data, so this is safe.
+    msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
+    for (size_t x = 0; x < xsize; x += Lanes(d)) {
+      const auto vx = Load(d, row_x + x);
+      const auto vy = Load(d, row_y + x);
+      const auto in_g = vy - vx;
+      const auto in_r = vy + vx;
+      const auto noise_strength_g = NoiseStrength(noise_model, in_g * half);
+      const auto noise_strength_r = NoiseStrength(noise_model, in_r * half);
+      const auto addit_rnd_noise_red = Load(d, row_rnd_r + x) * norm_const;
+      const auto addit_rnd_noise_green = Load(d, row_rnd_g + x) * norm_const;
+      const auto addit_rnd_noise_correlated =
+          Load(d, row_rnd_c + x) * norm_const;
+      AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green,
+                    addit_rnd_noise_correlated, noise_strength_g,
+                    noise_strength_r, ytox, ytob, row_x + x, row_y + x,
+                    row_b + x);
+    }
+    msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float));
+  }
+}
+
+void RandomImage3(size_t seed, const Rect& rect, Image3F* JXL_RESTRICT noise) {
+  HWY_ALIGN Xorshift128Plus rng(seed);  // One generator shared by all 3 planes.
+  RandomImage(&rng, rect, &noise->Plane(0));
+  RandomImage(&rng, rect, &noise->Plane(1));
+  RandomImage(&rng, rect, &noise->Plane(2));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(AddNoise);
+void AddNoise(const NoiseParams& noise_params, const Rect& noise_rect,
+              const Image3F& noise, const Rect& opsin_rect,
+              const ColorCorrelationMap& cmap, Image3F* opsin) {
+  return HWY_DYNAMIC_DISPATCH(AddNoise)(noise_params, noise_rect, noise,
+                                        opsin_rect, cmap, opsin);
+}
+
+HWY_EXPORT(RandomImage3);
+void RandomImage3(size_t seed, const Rect& rect, Image3F* JXL_RESTRICT noise) {
+  return HWY_DYNAMIC_DISPATCH(RandomImage3)(seed, rect, noise);
+}
+
+void DecodeFloatParam(float precision, float* val, BitReader* br) {
+  const int absval_quant = br->ReadFixedBits<10>();  // 10-bit value.
+  *val = absval_quant / precision;
+}
+
+Status DecodeNoise(BitReader* br, NoiseParams* noise_params) {
+  for (float& i : noise_params->lut) {  // One 10-bit read per LUT entry.
+    DecodeFloatParam(kNoisePrecision, &i, br);
+  }
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.h
new file mode 100644
index 0000000000..f7135e7c5a
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_noise.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_NOISE_H_
+#define LIB_JXL_DEC_NOISE_H_
+
+// Noise synthesis. Currently disabled.
+
+#include
+#include
+
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/noise.h"
+
+namespace jxl {
+
+// Adds noise to the Opsin image, reading the generated random noise from
+// `noise_rect` in `noise`.
+void AddNoise(const NoiseParams& noise_params, const Rect& noise_rect,
+              const Image3F& noise, const Rect& opsin_rect,
+              const ColorCorrelationMap& cmap, Image3F* opsin);
+
+void RandomImage3(size_t seed, const Rect& rect, Image3F* JXL_RESTRICT noise);
+
+// Must only call if FrameHeader.flags.kNoise.
+Status DecodeNoise(BitReader* br, NoiseParams* noise_params); + +} // namespace jxl + +#endif // LIB_JXL_DEC_NOISE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_params.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_params.h new file mode 100644 index 0000000000..e3131e6cb9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_params.h @@ -0,0 +1,62 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_PARAMS_H_ +#define LIB_JXL_DEC_PARAMS_H_ + +// Parameters and flags that govern JXL decompression. + +#include +#include + +#include + +#include "lib/jxl/base/override.h" + +namespace jxl { + +struct DecompressParams { + // If true, checks at the end of decoding that all of the compressed data + // was consumed by the decoder. + bool check_decompressed_size = true; + + // If true, skip dequant and iDCT and decode to JPEG (only if possible) + bool keep_dct = false; + // If true, render spot colors (otherwise only returned as extra channels) + bool render_spotcolors = true; + + // These cannot be kOn because they need encoder support. + Override preview = Override::kDefault; + + // How many passes to decode at most. By default, decode everything. + uint32_t max_passes = std::numeric_limits::max(); + // Alternatively, one can specify the maximum tolerable downscaling factor + // with respect to the full size of the image. By default, nothing less than + // the full size is requested. + size_t max_downsampling = 1; + + // Try to decode as much as possible of a truncated codestream, but only whole + // sections at a time. + bool allow_partial_files = false; + // Allow even more progression. 
+ bool allow_more_progressive_steps = false; + + bool operator==(const DecompressParams other) const { + return check_decompressed_size == other.check_decompressed_size && + keep_dct == other.keep_dct && + render_spotcolors == other.render_spotcolors && + preview == other.preview && max_passes == other.max_passes && + max_downsampling == other.max_downsampling && + allow_partial_files == other.allow_partial_files && + allow_more_progressive_steps == other.allow_more_progressive_steps; + } + bool operator!=(const DecompressParams& other) const { + return !(*this == other); + } +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_PARAMS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc new file mode 100644 index 0000000000..d1b84f24db --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.cc @@ -0,0 +1,238 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/dec_patch_dictionary.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/blending.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/patch_dictionary_internal.h" + +namespace jxl { + +constexpr int kMaxPatches = 1 << 24; + +Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize, + bool* uses_extra_channels) { + positions_.clear(); + std::vector context_map; + ANSCode code; + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kNumPatchDictionaryContexts, &code, &context_map)); + ANSSymbolReader decoder(&code, br); + + auto read_num = [&](size_t context) { + size_t r = decoder.ReadHybridUint(context, br, context_map); + return r; + }; + + size_t num_ref_patch = read_num(kNumRefPatchContext); + // TODO(veluca): does this make sense? 
+ if (num_ref_patch > kMaxPatches) { + return JXL_FAILURE("Too many patches in dictionary"); + } + + for (size_t id = 0; id < num_ref_patch; id++) { + PatchReferencePosition ref_pos; + ref_pos.ref = read_num(kReferenceFrameContext); + if (ref_pos.ref >= kMaxNumReferenceFrames || + shared_->reference_frames[ref_pos.ref].frame->xsize() == 0) { + return JXL_FAILURE("Invalid reference frame ID"); + } + if (!shared_->reference_frames[ref_pos.ref].ib_is_in_xyb) { + return JXL_FAILURE( + "Patches cannot use frames saved post color transforms"); + } + const ImageBundle& ib = *shared_->reference_frames[ref_pos.ref].frame; + ref_pos.x0 = read_num(kPatchReferencePositionContext); + ref_pos.y0 = read_num(kPatchReferencePositionContext); + ref_pos.xsize = read_num(kPatchSizeContext) + 1; + ref_pos.ysize = read_num(kPatchSizeContext) + 1; + if (ref_pos.x0 + ref_pos.xsize > ib.xsize()) { + return JXL_FAILURE("Invalid position specified in reference frame"); + } + if (ref_pos.y0 + ref_pos.ysize > ib.ysize()) { + return JXL_FAILURE("Invalid position specified in reference frame"); + } + size_t id_count = read_num(kPatchCountContext) + 1; + if (id_count > kMaxPatches) { + return JXL_FAILURE("Too many patches in dictionary"); + } + positions_.reserve(positions_.size() + id_count); + for (size_t i = 0; i < id_count; i++) { + PatchPosition pos; + pos.ref_pos = ref_pos; + if (i == 0) { + pos.x = read_num(kPatchPositionContext); + pos.y = read_num(kPatchPositionContext); + } else { + pos.x = + positions_.back().x + UnpackSigned(read_num(kPatchOffsetContext)); + pos.y = + positions_.back().y + UnpackSigned(read_num(kPatchOffsetContext)); + } + if (pos.x + ref_pos.xsize > xsize) { + return JXL_FAILURE("Invalid patch x: at %zu + %zu > %zu", pos.x, + ref_pos.xsize, xsize); + } + if (pos.y + ref_pos.ysize > ysize) { + return JXL_FAILURE("Invalid patch y: at %zu + %zu > %zu", pos.y, + ref_pos.ysize, ysize); + } + for (size_t i = 0; i < shared_->metadata->m.extra_channel_info.size() + 1; + i++) 
{ + uint32_t blend_mode = read_num(kPatchBlendModeContext); + if (blend_mode >= uint32_t(PatchBlendMode::kNumBlendModes)) { + return JXL_FAILURE("Invalid patch blend mode: %u", blend_mode); + } + PatchBlending info; + info.mode = static_cast(blend_mode); + if (UsesAlpha(info.mode)) { + *uses_extra_channels = true; + } + if (info.mode != PatchBlendMode::kNone && i > 0) { + *uses_extra_channels = true; + } + if (UsesAlpha(info.mode) && + shared_->metadata->m.extra_channel_info.size() > 1) { + info.alpha_channel = read_num(kPatchAlphaChannelContext); + if (info.alpha_channel >= + shared_->metadata->m.extra_channel_info.size()) { + return JXL_FAILURE( + "Invalid alpha channel for blending: %u out of %u\n", + info.alpha_channel, + (uint32_t)shared_->metadata->m.extra_channel_info.size()); + } + } else { + info.alpha_channel = 0; + } + if (UsesClamp(info.mode)) { + info.clamp = read_num(kPatchClampContext); + } else { + info.clamp = false; + } + pos.blending.push_back(info); + } + positions_.push_back(std::move(pos)); + } + } + + if (!decoder.CheckANSFinalState()) { + return JXL_FAILURE("ANS checksum failure."); + } + if (!HasAny()) { + return JXL_FAILURE("Decoded patch dictionary but got none"); + } + + ComputePatchCache(); + return true; +} + +int PatchDictionary::GetReferences() const { + int result = 0; + for (size_t i = 0; i < positions_.size(); ++i) { + result |= (1 << static_cast(positions_[i].ref_pos.ref)); + } + return result; +} + +void PatchDictionary::ComputePatchCache() { + patch_starts_.clear(); + sorted_patches_.clear(); + if (positions_.empty()) return; + std::vector> sorted_patches_y; + for (size_t i = 0; i < positions_.size(); i++) { + const PatchPosition& pos = positions_[i]; + for (size_t y = pos.y; y < pos.y + pos.ref_pos.ysize; y++) { + sorted_patches_y.emplace_back(y, i); + } + } + // The relative order of patches that affect the same pixels is preserved. + // This is important for patches that have a blend mode different from kAdd. 
+ std::sort(sorted_patches_y.begin(), sorted_patches_y.end()); + patch_starts_.resize(sorted_patches_y.back().first + 2, + sorted_patches_y.size()); + sorted_patches_.resize(sorted_patches_y.size()); + for (size_t i = 0; i < sorted_patches_y.size(); i++) { + sorted_patches_[i] = sorted_patches_y[i].second; + patch_starts_[sorted_patches_y[i].first] = + std::min(patch_starts_[sorted_patches_y[i].first], i); + } + for (size_t i = patch_starts_.size() - 1; i > 0; i--) { + patch_starts_[i - 1] = std::min(patch_starts_[i], patch_starts_[i - 1]); + } +} + +Status PatchDictionary::AddTo(Image3F* opsin, const Rect& opsin_rect, + float* const* extra_channels, + const Rect& image_rect) const { + JXL_CHECK(SameSize(opsin_rect, image_rect)); + size_t num_ec = shared_->metadata->m.num_extra_channels; + std::vector fg_ptrs(3 + num_ec); + std::vector bg_ptrs(3 + num_ec); + for (size_t y = image_rect.y0(); y < image_rect.y0() + image_rect.ysize(); + y++) { + if (y + 1 >= patch_starts_.size()) continue; + for (size_t id = patch_starts_[y]; id < patch_starts_[y + 1]; id++) { + const PatchPosition& pos = positions_[sorted_patches_[id]]; + size_t by = pos.y; + size_t bx = pos.x; + size_t xsize = pos.ref_pos.xsize; + JXL_DASSERT(y >= by); + JXL_DASSERT(y < by + pos.ref_pos.ysize); + size_t iy = y - by; + size_t ref = pos.ref_pos.ref; + if (bx >= image_rect.x0() + image_rect.xsize()) continue; + if (bx + xsize < image_rect.x0()) continue; + size_t x0 = std::max(bx, image_rect.x0()); + size_t x1 = std::min(bx + xsize, image_rect.x0() + image_rect.xsize()); + for (size_t c = 0; c < 3; c++) { + fg_ptrs[c] = + shared_->reference_frames[ref].frame->color()->ConstPlaneRow( + c, pos.ref_pos.y0 + iy) + + pos.ref_pos.x0 + x0 - bx; + bg_ptrs[c] = opsin_rect.PlaneRow(opsin, c, y - image_rect.y0()) + x0 - + image_rect.x0(); + } + for (size_t i = 0; i < num_ec; i++) { + fg_ptrs[3 + i] = + shared_->reference_frames[ref].frame->extra_channels()[i].ConstRow( + pos.ref_pos.y0 + iy) + + pos.ref_pos.x0 + 
x0 - bx; + bg_ptrs[3 + i] = extra_channels[i] + x0 - image_rect.x0(); + } + JXL_RETURN_IF_ERROR( + PerformBlending(bg_ptrs.data(), fg_ptrs.data(), bg_ptrs.data(), + x1 - x0, pos.blending[0], pos.blending.data() + 1, + shared_->metadata->m.extra_channel_info)); + } + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.h new file mode 100644 index 0000000000..8e3c4d0349 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_patch_dictionary.h @@ -0,0 +1,200 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_PATCH_DICTIONARY_H_ +#define LIB_JXL_DEC_PATCH_DICTIONARY_H_ + +// Chooses reference patches, and avoids encoding them once per occurrence. + +#include +#include +#include + +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { + +constexpr size_t kMaxPatchSize = 32; + +enum class PatchBlendMode : uint8_t { + // The new values are the old ones. Useful to skip some channels. + kNone = 0, + // The new values (in the crop) replace the old ones: sample = new + kReplace = 1, + // The new values (in the crop) get added to the old ones: sample = old + new + kAdd = 2, + // The new values (in the crop) get multiplied by the old ones: + // sample = old * new + // This blend mode is only supported if BlendColorSpace is kEncoded. The + // range of the new value matters for multiplication purposes, and its + // nominal range of 0..1 is computed the same way as this is done for the + // alpha values in kBlend and kAlphaWeightedAdd. 
+ kMul = 3, + // The new values (in the crop) replace the old ones if alpha>0: + // For first alpha channel: + // alpha = old + new * (1 - old) + // For other channels if !alpha_associated: + // sample = ((1 - new_alpha) * old * old_alpha + new_alpha * new) / alpha + // For other channels if alpha_associated: + // sample = (1 - new_alpha) * old + new + // The alpha formula applies to the alpha used for the division in the other + // channels formula, and applies to the alpha channel itself if its + // blend_channel value matches itself. + // If using kBlendAbove, new is the patch and old is the original image; if + // using kBlendBelow, the meaning is inverted. + kBlendAbove = 4, + kBlendBelow = 5, + // The new values (in the crop) are added to the old ones if alpha>0: + // For first alpha channel: sample = old + new * (1 - old) + // For other channels: sample = old + alpha * new + kAlphaWeightedAddAbove = 6, + kAlphaWeightedAddBelow = 7, + kNumBlendModes, +}; + +inline bool UsesAlpha(PatchBlendMode mode) { + return mode == PatchBlendMode::kBlendAbove || + mode == PatchBlendMode::kBlendBelow || + mode == PatchBlendMode::kAlphaWeightedAddAbove || + mode == PatchBlendMode::kAlphaWeightedAddBelow; +} +inline bool UsesClamp(PatchBlendMode mode) { + return UsesAlpha(mode) || mode == PatchBlendMode::kMul; +} + +struct PatchBlending { + PatchBlendMode mode; + uint32_t alpha_channel; + bool clamp; +}; + +struct QuantizedPatch { + size_t xsize; + size_t ysize; + QuantizedPatch() { + for (size_t i = 0; i < 3; i++) { + pixels[i].resize(kMaxPatchSize * kMaxPatchSize); + fpixels[i].resize(kMaxPatchSize * kMaxPatchSize); + } + } + std::vector pixels[3] = {}; + // Not compared. Used only to retrieve original pixels to construct the + // reference image. 
+ std::vector fpixels[3] = {}; + bool operator==(const QuantizedPatch& other) const { + if (xsize != other.xsize) return false; + if (ysize != other.ysize) return false; + for (size_t c = 0; c < 3; c++) { + if (memcmp(pixels[c].data(), other.pixels[c].data(), + sizeof(int8_t) * xsize * ysize) != 0) + return false; + } + return true; + } + + bool operator<(const QuantizedPatch& other) const { + if (xsize != other.xsize) return xsize < other.xsize; + if (ysize != other.ysize) return ysize < other.ysize; + for (size_t c = 0; c < 3; c++) { + int cmp = memcmp(pixels[c].data(), other.pixels[c].data(), + sizeof(int8_t) * xsize * ysize); + if (cmp > 0) return false; + if (cmp < 0) return true; + } + return false; + } +}; + +// Pair (patch, vector of occurrences). +using PatchInfo = + std::pair>>; + +// Position and size of the patch in the reference frame. +struct PatchReferencePosition { + size_t ref, x0, y0, xsize, ysize; + bool operator<(const PatchReferencePosition& oth) const { + return std::make_tuple(ref, x0, y0, xsize, ysize) < + std::make_tuple(oth.ref, oth.x0, oth.y0, oth.xsize, oth.ysize); + } + bool operator==(const PatchReferencePosition& oth) const { + return !(*this < oth) && !(oth < *this); + } +}; + +struct PatchPosition { + // Position of top-left corner of the patch in the image. + size_t x, y; + // Different blend mode for color and extra channels. + std::vector blending; + PatchReferencePosition ref_pos; + bool operator<(const PatchPosition& oth) const { + return std::make_tuple(ref_pos, x, y) < + std::make_tuple(oth.ref_pos, oth.x, oth.y); + } +}; + +struct PassesSharedState; + +// Encoder-side helper class to encode the PatchesDictionary. 
+class PatchDictionaryEncoder; + +class PatchDictionary { + public: + PatchDictionary() = default; + + void SetPassesSharedState(const PassesSharedState* shared) { + shared_ = shared; + } + + bool HasAny() const { return !positions_.empty(); } + + Status Decode(BitReader* br, size_t xsize, size_t ysize, + bool* uses_extra_channels); + + void Clear() { + positions_.clear(); + ComputePatchCache(); + } + + // Only adds patches that belong to the `image_rect` area of the decoded + // image, writing them to the `opsin_rect` area of `opsin`. + Status AddTo(Image3F* opsin, const Rect& opsin_rect, + float* const* extra_channels, const Rect& image_rect) const; + + // Returns dependencies of this patch dictionary on reference frame ids as a + // bit mask: bits 0-3 indicate reference frame 0-3. + int GetReferences() const; + + private: + friend class PatchDictionaryEncoder; + + const PassesSharedState* shared_; + std::vector positions_; + + // Patch occurrences sorted by y. + std::vector sorted_patches_; + // Index of the first patch for each y value. + std::vector patch_starts_; + + // Patch IDs in position [patch_starts_[y], patch_starts_[y+1]) of + // sorted_patches_ are all the patches that intersect the horizontal line at + // y. + // The relative order of patches that affect the same pixels is the same - + // important when applying patches is noncommutative. + + // Compute sorted_patches_ and patch_starts_ after updating positions_. + void ComputePatchCache(); +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_PATCH_DICTIONARY_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc new file mode 100644 index 0000000000..b295728ba4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.cc @@ -0,0 +1,1242 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/dec_reconstruct.h" + +#include +#include + +#include "lib/jxl/filters.h" +#include "lib/jxl/image_ops.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_reconstruct.cc" +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/blending.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_noise.h" +#include "lib/jxl/dec_upsample.h" +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/sanitizers.h" +#include "lib/jxl/transfer_functions-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +template +void DoUndoXYBInPlace(Image3F* idct, const Rect& rect, Op op, + const OutputEncodingInfo& output_encoding_info) { + // TODO(eustas): should it still be capped? + const HWY_CAPPED(float, GroupBorderAssigner::kPaddingXRound) d; + const size_t xsize = rect.xsize(); + const size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + // The size of `rect` might not be a multiple of Lanes(d), but is guaranteed + // to be a multiple of kBlockDim or at the margin of the image. + for (size_t y = 0; y < rect.ysize(); y++) { + float* JXL_RESTRICT row0 = rect.PlaneRow(idct, 0, y); + float* JXL_RESTRICT row1 = rect.PlaneRow(idct, 1, y); + float* JXL_RESTRICT row2 = rect.PlaneRow(idct, 2, y); + // All calculations are lane-wise, still some might require value-dependent + // behaviour (e.g. NearestInt). Temporarily unpoison the last vector tail. 
+ msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) { + const auto in_opsin_x = Load(d, row0 + x); + const auto in_opsin_y = Load(d, row1 + x); + const auto in_opsin_b = Load(d, row2 + x); + JXL_COMPILER_FENCE; + auto linear_r = Undefined(d); + auto linear_g = Undefined(d); + auto linear_b = Undefined(d); + XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, + output_encoding_info.opsin_params, &linear_r, &linear_g, + &linear_b); + Store(op.Transform(d, linear_r), d, row0 + x); + Store(op.Transform(d, linear_g), d, row1 + x); + Store(op.Transform(d, linear_b), d, row2 + x); + } + msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize)); + } +} + +struct OpLinear { + template + T Transform(D d, const T& linear) { + return linear; + } +}; + +struct OpRgb { + template + T Transform(D d, const T& linear) { +#if JXL_HIGH_PRECISION + return TF_SRGB().EncodedFromDisplay(d, linear); +#else + return FastLinearToSRGB(d, linear); +#endif + } +}; + +struct OpPq { + template + T Transform(D d, const T& linear) { + return TF_PQ().EncodedFromDisplay(d, linear); + } +}; + +struct OpHlg { + template + T Transform(D d, const T& linear) { + return TF_HLG().EncodedFromDisplay(d, linear); + } +}; + +struct Op709 { + template + T Transform(D d, const T& linear) { + return TF_709().EncodedFromDisplay(d, linear); + } +}; + +struct OpGamma { + const float inverse_gamma; + template + T Transform(D d, const T& linear) { + return IfThenZeroElse(linear <= Set(d, 1e-5f), + FastPowf(d, linear, Set(d, inverse_gamma))); + } +}; + +Status UndoXYBInPlace(Image3F* idct, const Rect& rect, + const OutputEncodingInfo& output_encoding_info) 
{ + PROFILER_ZONE("UndoXYB"); + + if (output_encoding_info.color_encoding.tf.IsLinear()) { + DoUndoXYBInPlace(idct, rect, OpLinear(), output_encoding_info); + } else if (output_encoding_info.color_encoding.tf.IsSRGB()) { + DoUndoXYBInPlace(idct, rect, OpRgb(), output_encoding_info); + } else if (output_encoding_info.color_encoding.tf.IsPQ()) { + DoUndoXYBInPlace(idct, rect, OpPq(), output_encoding_info); + } else if (output_encoding_info.color_encoding.tf.IsHLG()) { + DoUndoXYBInPlace(idct, rect, OpHlg(), output_encoding_info); + } else if (output_encoding_info.color_encoding.tf.Is709()) { + DoUndoXYBInPlace(idct, rect, Op709(), output_encoding_info); + } else if (output_encoding_info.color_encoding.tf.IsGamma() || + output_encoding_info.color_encoding.tf.IsDCI()) { + OpGamma op = {output_encoding_info.inverse_gamma}; + DoUndoXYBInPlace(idct, rect, op, output_encoding_info); + } else { + // This is a programming error. + JXL_ABORT("Invalid target encoding"); + } + return true; +} + +template +void StoreRGBA(D d, V r, V g, V b, V a, bool alpha, size_t n, size_t extra, + uint8_t* buf) { +#if HWY_TARGET == HWY_SCALAR + buf[0] = r.raw; + buf[1] = g.raw; + buf[2] = b.raw; + if (alpha) { + buf[3] = a.raw; + } +#elif HWY_TARGET == HWY_NEON + if (alpha) { + uint8x8x4_t data = {r.raw, g.raw, b.raw, a.raw}; + if (extra >= 8) { + vst4_u8(buf, data); + } else { + uint8_t tmp[8 * 4]; + vst4_u8(tmp, data); + memcpy(buf, tmp, n * 4); + } + } else { + uint8x8x3_t data = {r.raw, g.raw, b.raw}; + if (extra >= 8) { + vst3_u8(buf, data); + } else { + uint8_t tmp[8 * 3]; + vst3_u8(tmp, data); + memcpy(buf, tmp, n * 3); + } + } +#else + // TODO(veluca): implement this for x86. + size_t mul = alpha ? 
4 : 3; + HWY_ALIGN uint8_t bytes[16]; + Store(r, d, bytes); + for (size_t i = 0; i < n; i++) { + buf[mul * i] = bytes[i]; + } + Store(g, d, bytes); + for (size_t i = 0; i < n; i++) { + buf[mul * i + 1] = bytes[i]; + } + Store(b, d, bytes); + for (size_t i = 0; i < n; i++) { + buf[mul * i + 2] = bytes[i]; + } + if (alpha) { + Store(a, d, bytes); + for (size_t i = 0; i < n; i++) { + buf[4 * i + 3] = bytes[i]; + } + } +#endif +} + +// Outputs floating point image to RGB(A) 8-bit buffer. If `alpha_in` is null, +// an opaque alpha channel is written instead, for the case where the +// output buffer to write to is in the 4-byte per pixel RGBA format. +void FloatToRGBA8(const Image3F& input, const Rect& input_rect, bool is_rgba, + const ImageF* alpha_in, const Rect& alpha_rect, + const Rect& output_buf_rect, uint8_t* JXL_RESTRICT output_buf, + size_t stride) { + size_t bytes = is_rgba ? 4 : 3; + for (size_t y = 0; y < output_buf_rect.ysize(); y++) { + const float* JXL_RESTRICT row_in_r = input_rect.ConstPlaneRow(input, 0, y); + const float* JXL_RESTRICT row_in_g = input_rect.ConstPlaneRow(input, 1, y); + const float* JXL_RESTRICT row_in_b = input_rect.ConstPlaneRow(input, 2, y); + const float* JXL_RESTRICT row_in_a = + alpha_in ? alpha_rect.ConstRow(*alpha_in, y) : nullptr; + size_t base_ptr = + (y + output_buf_rect.y0()) * stride + bytes * output_buf_rect.x0(); + using D = HWY_CAPPED(float, 4); + const D d; + D::Rebind du; + auto zero = Zero(d); + auto one = Set(d, 1.0f); + auto mul = Set(d, 255.0f); + + // All calculations are lane-wise, still some might require value-dependent + // behaviour (e.g. NearestInt). Temporarily unpoison the last vector tail. 
+ size_t xsize = output_buf_rect.xsize(); + size_t xsize_v = RoundUpTo(xsize, Lanes(d)); + msan::UnpoisonMemory(row_in_r + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row_in_g + xsize, sizeof(float) * (xsize_v - xsize)); + msan::UnpoisonMemory(row_in_b + xsize, sizeof(float) * (xsize_v - xsize)); + if (row_in_a) + msan::UnpoisonMemory(row_in_a + xsize, sizeof(float) * (xsize_v - xsize)); + for (size_t x = 0; x < xsize; x += Lanes(d)) { + auto rf = Clamp(zero, Load(d, row_in_r + x), one) * mul; + auto gf = Clamp(zero, Load(d, row_in_g + x), one) * mul; + auto bf = Clamp(zero, Load(d, row_in_b + x), one) * mul; + auto af = row_in_a ? Clamp(zero, Load(d, row_in_a + x), one) * mul + : Set(d, 255.0f); + auto r8 = U8FromU32(BitCast(du, NearestInt(rf))); + auto g8 = U8FromU32(BitCast(du, NearestInt(gf))); + auto b8 = U8FromU32(BitCast(du, NearestInt(bf))); + auto a8 = U8FromU32(BitCast(du, NearestInt(af))); + size_t n = output_buf_rect.xsize() - x; + if (JXL_LIKELY(n >= Lanes(d))) { + StoreRGBA(D::Rebind(), r8, g8, b8, a8, is_rgba, Lanes(d), n, + output_buf + base_ptr + bytes * x); + } else { + StoreRGBA(D::Rebind(), r8, g8, b8, a8, is_rgba, n, n, + output_buf + base_ptr + bytes * x); + } + } + msan::PoisonMemory(row_in_r + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row_in_g + xsize, sizeof(float) * (xsize_v - xsize)); + msan::PoisonMemory(row_in_b + xsize, sizeof(float) * (xsize_v - xsize)); + if (row_in_a) + msan::PoisonMemory(row_in_a + xsize, sizeof(float) * (xsize_v - xsize)); + } +} + +// Upsample in horizontal (if hs=1) and vertical (if vs=1) the plane_in image +// to the output plane_out image. +// The output region "rect" in plane_out and a border around it of lf.Padding() +// will be generated, as long as those pixels fall inside the image frame. +// Otherwise the border pixels that fall outside the image frame in plane_out +// are undefined. 
+// "rect" is an area inside the plane_out image which corresponds to the +// "frame_rect" area in the frame. plane_in and plane_out both are expected to +// have a padding of kGroupDataXBorder and kGroupDataYBorder on either side of +// X and Y coordinates. This means that when upsampling vertically the plane_out +// row `kGroupDataYBorder + N` will be generated from the plane_in row +// `kGroupDataYBorder + N / 2` (and a previous or next row). +void DoYCbCrUpsampling(size_t hs, size_t vs, ImageF* plane_in, const Rect& rect, + const Rect& frame_rect, const FrameDimensions& frame_dim, + ImageF* plane_out, const LoopFilter& lf, ImageF* temp) { + JXL_DASSERT(SameSize(rect, frame_rect)); + JXL_DASSERT(hs <= 1 && vs <= 1); + // The pixel in (xoff, yoff) is the origin of the downsampling coordinate + // system. + size_t xoff = PassesDecoderState::kGroupDataXBorder; + size_t yoff = PassesDecoderState::kGroupDataYBorder; + + // This X,Y range is the intersection between the requested "rect" expanded + // with a lf.Padding() all around and the image frame translated to the + // coordinate system used by plane_out. + // All the pixels in the [x0, x1) x [y0, y1) range must be defined in the + // plane_out output at the end. + const size_t y0 = rect.y0() - std::min(lf.Padding(), frame_rect.y0()); + const size_t y1 = rect.y0() + + std::min(frame_rect.y0() + rect.ysize() + lf.Padding(), + frame_dim.ysize_padded) - + frame_rect.y0(); + + const size_t x0 = rect.x0() - std::min(lf.Padding(), frame_rect.x0()); + const size_t x1 = rect.x0() + + std::min(frame_rect.x0() + rect.xsize() + lf.Padding(), + frame_dim.xsize_padded) - + frame_rect.x0(); + + if (hs == 0 && vs == 0) { + Rect r(x0, y0, x1 - x0, y1 - y0); + JXL_CHECK_IMAGE_INITIALIZED(*plane_in, r); + CopyImageTo(r, *plane_in, r, plane_out); + return; + } + // Prepare padding if we are on a border. + // Copy the whole row/column here: it is likely similarly fast and ensures + // that we don't forget some parts of padding. 
+ if (frame_rect.x0() == 0) { + for (size_t y = 0; y < plane_in->ysize(); y++) { + plane_in->Row(y)[rect.x0() - 1] = plane_in->Row(y)[rect.x0()]; + } + } + if (frame_rect.x0() + x1 - rect.x0() >= frame_dim.xsize_padded) { + ssize_t borderx = static_cast(x1 - xoff + hs) / (1 << hs) + xoff; + for (size_t y = 0; y < plane_in->ysize(); y++) { + plane_in->Row(y)[borderx] = plane_in->Row(y)[borderx - 1]; + } + } + if (frame_rect.y0() == 0) { + memcpy(plane_in->Row(rect.y0() - 1), plane_in->Row(rect.y0()), + plane_in->xsize() * sizeof(float)); + } + if (frame_rect.y0() + y1 - rect.y0() >= frame_dim.ysize_padded) { + ssize_t bordery = static_cast(y1 - yoff + vs) / (1 << vs) + yoff; + memcpy(plane_in->Row(bordery), plane_in->Row(bordery - 1), + plane_in->xsize() * sizeof(float)); + } + if (hs == 1) { + // Limited to 4 for Interleave*. + HWY_CAPPED(float, 4) d; + auto threefour = Set(d, 0.75f); + auto onefour = Set(d, 0.25f); + size_t orig_y0 = y0; + size_t orig_y1 = y1; + if (vs != 0) { + orig_y0 = (y0 >> 1) + (yoff >> 1) - 1; + orig_y1 = (y1 >> 1) + (yoff >> 1) + 1; + } + for (size_t y = orig_y0; y < orig_y1; y++) { + const float* in = plane_in->Row(y); + float* out = temp->Row(y); + for (size_t x = x0 / (2 * Lanes(d)) * 2 * Lanes(d); + x < RoundUpTo(x1, 2 * Lanes(d)); x += 2 * Lanes(d)) { + size_t ox = (x >> 1) + (xoff >> 1); + auto current = Load(d, in + ox) * threefour; + auto prev = LoadU(d, in + ox - 1); + auto next = LoadU(d, in + ox + 1); + auto left = MulAdd(onefour, prev, current); + auto right = MulAdd(onefour, next, current); +#if HWY_TARGET == HWY_SCALAR + Store(left, d, out + x); + Store(right, d, out + x + 1); +#else + Store(InterleaveLower(left, right), d, out + x); + Store(InterleaveUpper(left, right), d, out + x + Lanes(d)); +#endif + } + } + } else { + CopyImageTo(*plane_in, temp); + } + if (vs == 1) { + HWY_FULL(float) d; + auto threefour = Set(d, 0.75f); + auto onefour = Set(d, 0.25f); + for (size_t y = y0; y < y1; y++) { + size_t oy1 = (y >> 1) + (yoff 
>> 1); + if ((y & 1) == 1) oy1++; + size_t oy0 = oy1 - 1; + const float* in0 = temp->Row(oy0); + const float* in1 = temp->Row(oy1); + float* out = plane_out->Row(y); + if ((y & 1) == 1) { + for (size_t x = x0 / Lanes(d) * Lanes(d); x < RoundUpTo(x1, Lanes(d)); + x += Lanes(d)) { + auto i0 = Load(d, in0 + x); + auto i1 = Load(d, in1 + x); + auto o = MulAdd(i0, threefour, i1 * onefour); + Store(o, d, out + x); + } + } else { + for (size_t x = x0 / Lanes(d) * Lanes(d); x < RoundUpTo(x1, Lanes(d)); + x += Lanes(d)) { + auto i0 = Load(d, in0 + x); + auto i1 = Load(d, in1 + x); + auto o = MulAdd(i0, onefour, i1 * threefour); + Store(o, d, out + x); + } + } + } + } else { + CopyImageTo(*temp, plane_out); + } + + // The output must be initialized including the lf.Padding() around the image + // for all the pixels that fall inside the image frame. + JXL_CHECK_IMAGE_INITIALIZED(*plane_out, Rect(x0, y0, x1 - x0, y1 - y0)); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(UndoXYBInPlace); +HWY_EXPORT(FloatToRGBA8); +HWY_EXPORT(DoYCbCrUpsampling); + +void UndoXYB(const Image3F& src, Image3F* dst, + const OutputEncodingInfo& output_info, ThreadPool* pool) { + CopyImageTo(src, dst); + pool->Run(0, src.ysize(), ThreadPool::SkipInit(), [&](int y, int /*thread*/) { + JXL_CHECK(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(dst, Rect(*dst).Line(y), + output_info)); + }); +} + +namespace { +Rect ScaleRectForEC(Rect in, const FrameHeader& frame_header, size_t ec) { + auto s = [&](size_t x) { + return DivCeil(x * frame_header.upsampling, + frame_header.extra_channel_upsampling[ec]); + }; + return Rect(s(in.x0()), s(in.y0()), s(in.xsize()), s(in.ysize())); +} + +// Implements EnsurePaddingInPlace, but allows processing data one row at a +// time. 
+class EnsurePaddingInPlaceRowByRow { + void Init(const Rect& rect, const Rect& image_rect, size_t image_xsize, + size_t image_ysize, size_t xpadding, size_t ypadding, ssize_t* y0, + ssize_t* y1) { + // coordinates relative to rect. + JXL_DASSERT(SameSize(rect, image_rect)); + *y0 = -std::min(image_rect.y0(), ypadding); + *y1 = rect.ysize() + std::min(ypadding, image_ysize - image_rect.ysize() - + image_rect.y0()); + if (image_rect.x0() >= xpadding && + image_rect.x0() + image_rect.xsize() + xpadding <= image_xsize) { + // Nothing to do. + strategy_ = kSkip; + } else if (image_xsize >= 2 * xpadding) { + strategy_ = kFast; + } else { + strategy_ = kSlow; + } + y0_ = rect.y0(); + JXL_DASSERT(rect.x0() >= xpadding); + x0_ = x1_ = rect.x0() - xpadding; + // If close to the left border - do mirroring. + if (image_rect.x0() < xpadding) x1_ = rect.x0() - image_rect.x0(); + x2_ = x3_ = rect.x0() + rect.xsize() + xpadding; + // If close to the right border - do mirroring. + if (image_rect.x0() + image_rect.xsize() + xpadding > image_xsize) { + x2_ = rect.x0() + image_xsize - image_rect.x0(); + } + JXL_DASSERT(image_xsize == (x2_ - x1_) || + (x1_ - x0_ <= x2_ - x1_ && x3_ - x2_ <= x2_ - x1_)); + } + + public: + void Init(Image3F* img, const Rect& rect, const Rect& image_rect, + size_t image_xsize, size_t image_ysize, size_t xpadding, + size_t ypadding, ssize_t* y0, ssize_t* y1) { + Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0, + y1); + img3_ = img; + JXL_DASSERT(x3_ <= img->xsize()); + } + void Init(ImageF* img, const Rect& rect, const Rect& image_rect, + size_t image_xsize, size_t image_ysize, size_t xpadding, + size_t ypadding, ssize_t* y0, ssize_t* y1) { + Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0, + y1); + img_ = img; + JXL_DASSERT(x3_ <= img->xsize()); + } + // To be called when row `y` of the input is available, for all the values in + // [*y0, *y1). 
+ void Process3(ssize_t y) { + JXL_DASSERT(img3_); + for (size_t c = 0; c < 3; c++) { + img_ = &img3_->Plane(c); + Process(y); + } + } + void Process(ssize_t y) { + JXL_DASSERT(img_); + switch (strategy_) { + case kSkip: + break; + case kFast: { + // Image is wide enough that a single Mirror() step is sufficient. + float* JXL_RESTRICT row = img_->Row(y + y0_); + for (size_t x = x0_; x < x1_; x++) { + row[x] = row[2 * x1_ - x - 1]; + } + for (size_t x = x2_; x < x3_; x++) { + row[x] = row[2 * x2_ - x - 1]; + } + break; + } + case kSlow: { + // Slow case for small images. + float* JXL_RESTRICT row = img_->Row(y + y0_) + x1_; + for (ssize_t x = x0_ - x1_; x < 0; x++) { + *(row + x) = row[Mirror(x, x2_ - x1_)]; + } + for (size_t x = x2_ - x1_; x < x3_ - x1_; x++) { + *(row + x) = row[Mirror(x, x2_ - x1_)]; + } + break; + } + } + } + + private: + // Initialized to silence spurious compiler warnings. + Image3F* img3_ = nullptr; + ImageF* img_ = nullptr; + // Will fill [x0_, x1_) and [x2_, x3_) on every row. + // The [x1_, x2_) range contains valid image pixels. We guarantee that either + // x1_ - x0_ <= x2_ - x1_, (and similarly for x2_, x3_), or that the [x1_, + // x2_) contains a full horizontal line of the original image. + size_t x0_ = 0, x1_ = 0, x2_ = 0, x3_ = 0; + size_t y0_ = 0; + // kSlow: use calls to Mirror(), for the case where the border might be larger + // than the image. + // kFast: directly use the result of Mirror() when it can be computed in a + // single iteration. + // kSkip: do nothing. 
+ enum Strategy { kFast, kSlow, kSkip }; + Strategy strategy_ = kSkip; +}; +} // namespace + +void EnsurePaddingInPlace(Image3F* img, const Rect& rect, + const Rect& image_rect, size_t image_xsize, + size_t image_ysize, size_t xpadding, + size_t ypadding) { + ssize_t y0, y1; + EnsurePaddingInPlaceRowByRow impl; + impl.Init(img, rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, + &y0, &y1); + for (ssize_t y = y0; y < y1; y++) { + impl.Process3(y); + } +} + +Status FinalizeImageRect( + Image3F* input_image, const Rect& input_rect, + const std::vector>& extra_channels, + PassesDecoderState* dec_state, size_t thread, + ImageBundle* JXL_RESTRICT output_image, const Rect& frame_rect) { + const ImageFeatures& image_features = dec_state->shared->image_features; + const FrameHeader& frame_header = dec_state->shared->frame_header; + const ImageMetadata& metadata = frame_header.nonserialized_metadata->m; + const LoopFilter& lf = frame_header.loop_filter; + const FrameDimensions& frame_dim = dec_state->shared->frame_dim; + JXL_DASSERT(frame_rect.xsize() <= kApplyImageFeaturesTileDim); + JXL_DASSERT(frame_rect.ysize() <= kApplyImageFeaturesTileDim); + JXL_DASSERT(input_rect.xsize() == frame_rect.xsize()); + JXL_DASSERT(input_rect.ysize() == frame_rect.ysize()); + JXL_DASSERT(frame_rect.x0() % GroupBorderAssigner::kPaddingXRound == 0); + JXL_DASSERT(frame_rect.xsize() % GroupBorderAssigner::kPaddingXRound == 0 || + frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize || + frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize_padded); + + // +----------------------------- STEP 1 ------------------------------+ + // | Compute the rects on which patches and splines will be applied. | + // | In case we are applying upsampling, we need to apply patches on a | + // | slightly larger image. | + // +-------------------------------------------------------------------+ + + // If we are applying upsampling, we need 2 more pixels around the actual rect + // for border. 
Thus, we also need to apply patches and splines to those + // pixels. We compute here + // - The portion of image that corresponds to the area we are applying IF. + // (rect_for_if) + // - The rect where that pixel data is stored in upsampling_input_storage. + // (rect_for_if_storage) + // - The rect where the pixel data that we need to upsample is stored. + // (rect_for_upsampling) + // - The source rect for the pixel data in `input_image`. It is assumed that, + // if `frame_rect` is not on an image border, `input_image:input_rect` has + // enough border available. (rect_for_if_input) + + Image3F* output_color = + dec_state->rgb_output == nullptr && dec_state->pixel_callback == nullptr + ? output_image->color() + : nullptr; + + Image3F* storage_for_if = output_color; + Rect rect_for_if = frame_rect; + Rect rect_for_if_storage = frame_rect; + Rect rect_for_upsampling = frame_rect; + Rect rect_for_if_input = input_rect; + // The same as rect_for_if_input but in the frame coordinates. + Rect frame_rect_for_ycbcr_upsampling = frame_rect; + size_t extra_rows_t = 0; + size_t extra_rows_b = 0; + if (frame_header.upsampling != 1) { + size_t ifbx0 = 0; + size_t ifbx1 = 0; + size_t ifby0 = 0; + size_t ifby1 = 0; + if (frame_rect.x0() >= 2) { + JXL_DASSERT(input_rect.x0() >= 2); + ifbx0 = 2; + } + if (frame_rect.y0() >= 2) { + JXL_DASSERT(input_rect.y0() >= 2); + extra_rows_t = ifby0 = 2; + } + for (size_t extra : {1, 2}) { + if (frame_rect.x0() + frame_rect.xsize() + extra <= + dec_state->shared->frame_dim.xsize_padded) { + JXL_DASSERT(input_rect.x0() + input_rect.xsize() + extra <= + input_image->xsize()); + ifbx1 = extra; + } + if (frame_rect.y0() + frame_rect.ysize() + extra <= + dec_state->shared->frame_dim.ysize_padded) { + JXL_DASSERT(input_rect.y0() + input_rect.ysize() + extra <= + input_image->ysize()); + extra_rows_b = ifby1 = extra; + } + } + rect_for_if = Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0, + frame_rect.xsize() + ifbx0 + ifbx1, + 
frame_rect.ysize() + ifby0 + ifby1); + // Storage for pixel data does not necessarily start at (0, 0) as we need to + // have the left border of upsampling_rect aligned to a multiple of + // GroupBorderAssigner::kPaddingXRound. + rect_for_if_storage = + Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound) - + ifbx0, + kBlockDim, rect_for_if.xsize(), rect_for_if.ysize()); + rect_for_upsampling = + Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound), + kBlockDim + ifby0, frame_rect.xsize(), frame_rect.ysize()); + rect_for_if_input = + Rect(input_rect.x0() - ifbx0, input_rect.y0() - ifby0, + rect_for_if_storage.xsize(), rect_for_if_storage.ysize()); + frame_rect_for_ycbcr_upsampling = + Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0, + rect_for_if_input.xsize(), rect_for_if_input.ysize()); + storage_for_if = &dec_state->upsampling_input_storage[thread]; + } + + // +--------------------------- STEP 1.5 ------------------------------+ + // | Perform YCbCr upsampling if needed. | + // +-------------------------------------------------------------------+ + + Image3F* input = input_image; + if (!frame_header.chroma_subsampling.Is444()) { + for (size_t c = 0; c < 3; c++) { + size_t vs = frame_header.chroma_subsampling.VShift(c); + size_t hs = frame_header.chroma_subsampling.HShift(c); + // The per-thread output is used for the first time here. Poison the temp + // image on this thread to prevent leaking initialized data from a + // previous run in this thread in msan builds. + msan::PoisonImage(dec_state->ycbcr_out_images[thread].Plane(c)); + HWY_DYNAMIC_DISPATCH(DoYCbCrUpsampling) + (hs, vs, &input_image->Plane(c), rect_for_if_input, + frame_rect_for_ycbcr_upsampling, frame_dim, + &dec_state->ycbcr_out_images[thread].Plane(c), lf, + &dec_state->ycbcr_temp_images[thread]); + } + input = &dec_state->ycbcr_out_images[thread]; + } + + // Variables for upsampling and filtering. 
+ Rect upsampled_frame_rect(frame_rect.x0() * frame_header.upsampling, + frame_rect.y0() * frame_header.upsampling, + frame_rect.xsize() * frame_header.upsampling, + frame_rect.ysize() * frame_header.upsampling); + Rect full_frame_rect(0, 0, frame_dim.xsize_upsampled, + frame_dim.ysize_upsampled); + upsampled_frame_rect = upsampled_frame_rect.Crop(full_frame_rect); + EnsurePaddingInPlaceRowByRow ensure_padding_upsampling; + ssize_t ensure_padding_upsampling_y0 = 0; + ssize_t ensure_padding_upsampling_y1 = 0; + + EnsurePaddingInPlaceRowByRow ensure_padding_filter; + FilterPipeline* fp = nullptr; + ssize_t ensure_padding_filter_y0 = 0; + ssize_t ensure_padding_filter_y1 = 0; + if (lf.epf_iters != 0 || lf.gab) { + fp = &dec_state->filter_pipelines[thread]; + } + + // +----------------------------- STEP 2 ------------------------------+ + // | Change rects and buffer to not use `output_image` if direct | + // | output to rgb8 is requested. | + // +-------------------------------------------------------------------+ + Image3F* output_pixel_data_storage = output_color; + Rect upsampled_frame_rect_for_storage = upsampled_frame_rect; + if (dec_state->rgb_output || dec_state->pixel_callback) { + size_t log2_upsampling = CeilLog2Nonzero(frame_header.upsampling); + if (storage_for_if == output_color) { + storage_for_if = + &dec_state->output_pixel_data_storage[log2_upsampling][thread]; + rect_for_if_storage = + Rect(0, 0, rect_for_if_storage.xsize(), rect_for_if_storage.ysize()); + } + output_pixel_data_storage = + &dec_state->output_pixel_data_storage[log2_upsampling][thread]; + upsampled_frame_rect_for_storage = + Rect(0, 0, upsampled_frame_rect.xsize(), upsampled_frame_rect.ysize()); + if (frame_header.upsampling == 1 && fp == nullptr) { + upsampled_frame_rect_for_storage = rect_for_if_storage = + rect_for_if_input; + output_pixel_data_storage = storage_for_if = input; + } + } + // Set up alpha channel. 
+ const size_t ec = + metadata.Find(ExtraChannel::kAlpha) - metadata.extra_channel_info.data(); + const ImageF* alpha = nullptr; + Rect alpha_rect = upsampled_frame_rect; + if (ec < metadata.extra_channel_info.size()) { + JXL_ASSERT(ec < extra_channels.size()); + if (frame_header.extra_channel_upsampling[ec] == 1) { + alpha = extra_channels[ec].first; + alpha_rect = extra_channels[ec].second; + } else { + alpha = &output_image->extra_channels()[ec]; + alpha_rect = upsampled_frame_rect; + } + } + + // +----------------------------- STEP 3 ------------------------------+ + // | Set up upsampling and upsample extra channels. | + // +-------------------------------------------------------------------+ + Upsampler* color_upsampler = nullptr; + if (frame_header.upsampling != 1) { + color_upsampler = + &dec_state->upsamplers[CeilLog2Nonzero(frame_header.upsampling) - 1]; + ensure_padding_upsampling.Init( + storage_for_if, rect_for_upsampling, frame_rect, frame_dim.xsize_padded, + frame_dim.ysize_padded, 2, 2, &ensure_padding_upsampling_y0, + &ensure_padding_upsampling_y1); + } + + std::vector> extra_channels_for_patches; + std::vector ec_padding; + + bool late_ec_upsample = frame_header.upsampling != 1; + for (auto ecups : frame_header.extra_channel_upsampling) { + if (ecups != frame_header.upsampling) { + // If patches are applied, either frame_header.upsampling == 1 or + // late_ec_upsample is true. + late_ec_upsample = false; + } + } + + ssize_t ensure_padding_upsampling_ec_y0 = 0; + ssize_t ensure_padding_upsampling_ec_y1 = 0; + + // TODO(veluca) do not upsample extra channels to a full-image-sized buffer if + // we are not outputting to an ImageBundle. + if (!late_ec_upsample) { + // Upsample extra channels first if not all channels have the same + // upsampling factor. 
+ for (size_t ec = 0; ec < extra_channels.size(); ec++) { + size_t ecups = frame_header.extra_channel_upsampling[ec]; + if (ecups == 1) { + extra_channels_for_patches.push_back(extra_channels[ec]); + continue; + } + ssize_t ensure_padding_y0, ensure_padding_y1; + EnsurePaddingInPlaceRowByRow ensure_padding; + Rect ec_image_rect = ScaleRectForEC(frame_rect, frame_header, ec); + size_t ecxs = DivCeil(frame_dim.xsize_upsampled, + frame_header.extra_channel_upsampling[ec]); + size_t ecys = DivCeil(frame_dim.ysize_upsampled, + frame_header.extra_channel_upsampling[ec]); + ensure_padding.Init(extra_channels[ec].first, extra_channels[ec].second, + ec_image_rect, ecxs, ecys, 2, 2, &ensure_padding_y0, + &ensure_padding_y1); + for (ssize_t y = ensure_padding_y0; y < ensure_padding_y1; y++) { + ensure_padding.Process(y); + } + Upsampler& upsampler = + dec_state->upsamplers[CeilLog2Nonzero( + frame_header.extra_channel_upsampling[ec]) - + 1]; + upsampler.UpsampleRect( + *extra_channels[ec].first, extra_channels[ec].second, + &output_image->extra_channels()[ec], upsampled_frame_rect, + static_cast(ec_image_rect.y0()) - + static_cast(extra_channels[ec].second.y0()), + ecys, dec_state->upsampler_storage[thread].get()); + extra_channels_for_patches.emplace_back( + &output_image->extra_channels()[ec], upsampled_frame_rect); + } + } else { + // Upsample extra channels last if color channels are upsampled and all the + // extra channels have the same upsampling as them. + ec_padding.resize(extra_channels.size()); + for (size_t ec = 0; ec < extra_channels.size(); ec++) { + // Add a border to the extra channel rect for when patches are applied. + // This ensures that the correct row is accessed (y values for patches are + // relative to rect_for_if, not to input_rect). + // As the rect is extended by 0 or 2 pixels, and the patches input has, + // accordingly, the same padding, this is safe. 
+ Rect r(extra_channels[ec].second.x0() + rect_for_upsampling.x0() - + rect_for_if_storage.x0(), + extra_channels[ec].second.y0() + rect_for_upsampling.y0() - + rect_for_if_storage.y0(), + extra_channels[ec].second.xsize() + rect_for_if_storage.xsize() - + rect_for_upsampling.xsize(), + extra_channels[ec].second.ysize() + rect_for_if_storage.ysize() - + rect_for_upsampling.ysize()); + extra_channels_for_patches.emplace_back(extra_channels[ec].first, r); + ec_padding[ec].Init(extra_channels[ec].first, extra_channels[ec].second, + frame_rect, frame_dim.xsize, frame_dim.ysize, 2, 2, + &ensure_padding_upsampling_ec_y0, + &ensure_padding_upsampling_ec_y1); + } + } + + // Initialized to a valid non-null ptr to avoid UB if arithmetic is done with + // the pointer value (which would then not be used). + std::vector ec_ptrs_for_patches(extra_channels.size(), + input->PlaneRow(0, 0)); + + // +----------------------------- STEP 4 ------------------------------+ + // | Set up the filter pipeline. | + // +-------------------------------------------------------------------+ + if (fp) { + ensure_padding_filter.Init( + input, rect_for_if_input, rect_for_if, frame_dim.xsize_padded, + frame_dim.ysize_padded, lf.Padding(), lf.Padding(), + &ensure_padding_filter_y0, &ensure_padding_filter_y1); + + fp = PrepareFilterPipeline(dec_state, rect_for_if, *input, + rect_for_if_input, frame_dim.ysize_padded, + thread, storage_for_if, rect_for_if_storage); + } + + // +----------------------------- STEP 5 ------------------------------+ + // | Run the prepared pipeline of operations. | + // +-------------------------------------------------------------------+ + + // y values are relative to rect_for_if. + // Automatic mirroring in fp->ApplyFiltersRow() implies that we should ensure + // that padding for the first lines of the image is already present before + // calling ApplyFiltersRow() with "virtual" rows. 
+ // Here we rely on the fact that virtual rows at the beginning of the image + // are only present if input_rect.y0() == 0. + ssize_t first_ensure_padding_y = ensure_padding_filter_y0; + if (frame_rect.y0() == 0) { + JXL_DASSERT(ensure_padding_filter_y0 == 0); + first_ensure_padding_y = + std::min(lf.Padding(), ensure_padding_filter_y1); + for (ssize_t y = 0; y < first_ensure_padding_y; y++) { + ensure_padding_filter.Process3(y); + } + } + + for (ssize_t y = -lf.Padding(); + y < static_cast(lf.Padding() + rect_for_if.ysize()); y++) { + if (fp) { + if (y >= first_ensure_padding_y && y < ensure_padding_filter_y1) { + ensure_padding_filter.Process3(y); + } + fp->ApplyFiltersRow(lf, dec_state->filter_weights, y); + } else if (output_pixel_data_storage != input) { + for (size_t c = 0; c < 3; c++) { + memcpy(rect_for_if_storage.PlaneRow(storage_for_if, c, y), + rect_for_if_input.ConstPlaneRow(*input, c, y), + rect_for_if_input.xsize() * sizeof(float)); + } + } + if (y < static_cast(lf.Padding())) continue; + // At this point, row `y - lf.Padding()` of `rect_for_if` has been produced + // by the filters. + ssize_t available_y = y - lf.Padding(); + if (frame_header.upsampling == 1) { + for (size_t i = 0; i < extra_channels.size(); i++) { + ec_ptrs_for_patches[i] = extra_channels_for_patches[i].second.Row( + extra_channels_for_patches[i].first, available_y); + } + } + JXL_RETURN_IF_ERROR(image_features.patches.AddTo( + storage_for_if, rect_for_if_storage.Line(available_y), + ec_ptrs_for_patches.data(), rect_for_if.Line(available_y))); + JXL_RETURN_IF_ERROR(image_features.splines.AddTo( + storage_for_if, rect_for_if_storage.Line(available_y), + rect_for_if.Line(available_y), dec_state->shared->cmap)); + size_t num_ys = 1; + if (frame_header.upsampling != 1) { + // Upsampling `y` values are relative to `rect_for_upsampling`, not to + // `rect_for_if`. 
+ ssize_t shifted_y = available_y - extra_rows_t; + if (shifted_y >= ensure_padding_upsampling_y0 && + shifted_y < ensure_padding_upsampling_y1) { + ensure_padding_upsampling.Process3(shifted_y); + } + if (late_ec_upsample && shifted_y >= ensure_padding_upsampling_ec_y0 && + shifted_y < ensure_padding_upsampling_ec_y1) { + for (size_t ec = 0; ec < extra_channels.size(); ec++) { + ec_padding[ec].Process(shifted_y); + } + } + // Upsampling will access two rows of border, so the first upsampling + // output will be available after shifted_y is at least 2, *unless* image + // height is <= 2. + if (shifted_y < 2 && + shifted_y + 1 != static_cast(frame_rect.ysize())) { + continue; + } + // Value relative to upsampled_frame_rect. + size_t input_y = std::max(shifted_y - 2, 0); + size_t upsampled_available_y = frame_header.upsampling * input_y; + size_t num_input_rows = 1; + // If we are going to mirror the last output rows, then we already have 3 + // input lines ready. This happens iff we did not extend rect_for_if on + // the bottom *and* we are at the last `y` value. + if (extra_rows_b != 2 && + static_cast(y) + 1 == lf.Padding() + rect_for_if.ysize()) { + num_input_rows = 3; + } + num_input_rows = std::min(num_input_rows, frame_dim.ysize_padded); + num_ys = num_input_rows * frame_header.upsampling; + + if (static_cast(upsampled_available_y) >= + upsampled_frame_rect.ysize()) { + continue; + } + + if (upsampled_available_y + num_ys >= upsampled_frame_rect.ysize()) { + num_ys = upsampled_frame_rect.ysize() - upsampled_available_y; + } + + // Upsampler takes care of mirroring, and checks "physical" boundaries. 
+ Rect upsample_input_rect = rect_for_upsampling.Lines(input_y, 1); + color_upsampler->UpsampleRect( + *storage_for_if, upsample_input_rect, output_pixel_data_storage, + upsampled_frame_rect_for_storage.Lines(upsampled_available_y, num_ys), + static_cast(frame_rect.y0()) - + static_cast(rect_for_upsampling.y0()), + frame_dim.ysize_padded, dec_state->upsampler_storage[thread].get()); + if (late_ec_upsample) { + for (size_t ec = 0; ec < extra_channels.size(); ec++) { + // Upsampler takes care of mirroring, and checks "physical" + // boundaries. + Rect upsample_ec_input_rect = + extra_channels[ec].second.Lines(input_y, 1); + color_upsampler->UpsampleRect( + *extra_channels[ec].first, upsample_ec_input_rect, + &output_image->extra_channels()[ec], + upsampled_frame_rect.Lines(upsampled_available_y, num_ys), + static_cast(frame_rect.y0()) - + static_cast(extra_channels[ec].second.y0()), + frame_dim.ysize, dec_state->upsampler_storage[thread].get()); + } + } + available_y = upsampled_available_y; + } + + if (static_cast(available_y) >= upsampled_frame_rect.ysize()) { + continue; + } + + // The image data is now unconditionally in + // `output_image_storage:upsampled_frame_rect_for_storage`. 
+ if (frame_header.flags & FrameHeader::kNoise) { + PROFILER_ZONE("AddNoise"); + AddNoise(image_features.noise_params, + upsampled_frame_rect.Lines(available_y, num_ys), + dec_state->noise, + upsampled_frame_rect_for_storage.Lines(available_y, num_ys), + dec_state->shared_storage.cmap, output_pixel_data_storage); + } + + if (dec_state->pre_color_transform_frame.xsize() != 0) { + for (size_t c = 0; c < 3; c++) { + for (size_t y = available_y; y < available_y + num_ys; y++) { + float* JXL_RESTRICT row_out = upsampled_frame_rect.PlaneRow( + &dec_state->pre_color_transform_frame, c, y); + const float* JXL_RESTRICT row_in = + upsampled_frame_rect_for_storage.ConstPlaneRow( + *output_pixel_data_storage, c, y); + memcpy(row_out, row_in, + upsampled_frame_rect.xsize() * sizeof(*row_in)); + } + } + } + + // We skip the color transform entirely if save_before_color_transform and + // the frame is not supposed to be displayed. + + if (dec_state->fast_xyb_srgb8_conversion) { + FastXYBTosRGB8( + *output_pixel_data_storage, + upsampled_frame_rect_for_storage.Lines(available_y, num_ys), + upsampled_frame_rect.Lines(available_y, num_ys) + .Crop(Rect(0, 0, frame_dim.xsize, frame_dim.ysize)), + alpha, alpha_rect.Lines(available_y, num_ys), + dec_state->rgb_output_is_rgba, dec_state->rgb_output, frame_dim.xsize, + dec_state->rgb_stride); + } else { + if (frame_header.needs_color_transform()) { + if (frame_header.color_transform == ColorTransform::kXYB) { + JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)( + output_pixel_data_storage, + upsampled_frame_rect_for_storage.Lines(available_y, num_ys), + dec_state->output_encoding_info)); + } else if (frame_header.color_transform == ColorTransform::kYCbCr) { + YcbcrToRgb( + *output_pixel_data_storage, output_pixel_data_storage, + upsampled_frame_rect_for_storage.Lines(available_y, num_ys)); + } + } + + // TODO(veluca): all blending should happen here. 
+ + if (dec_state->rgb_output != nullptr) { + HWY_DYNAMIC_DISPATCH(FloatToRGBA8) + (*output_pixel_data_storage, + upsampled_frame_rect_for_storage.Lines(available_y, num_ys), + dec_state->rgb_output_is_rgba, alpha, + alpha_rect.Lines(available_y, num_ys), + upsampled_frame_rect.Lines(available_y, num_ys) + .Crop(Rect(0, 0, frame_dim.xsize, frame_dim.ysize)), + dec_state->rgb_output, dec_state->rgb_stride); + } + if (dec_state->pixel_callback != nullptr) { + Rect alpha_line_rect = alpha_rect.Lines(available_y, num_ys); + Rect color_input_line_rect = + upsampled_frame_rect_for_storage.Lines(available_y, num_ys); + Rect image_line_rect = + upsampled_frame_rect.Lines(available_y, num_ys) + .Crop(Rect(0, 0, frame_dim.xsize, frame_dim.ysize)); + const float* line_buffers[4]; + for (size_t iy = 0; iy < image_line_rect.ysize(); iy++) { + for (size_t c = 0; c < 3; c++) { + line_buffers[c] = color_input_line_rect.ConstPlaneRow( + *output_pixel_data_storage, c, iy); + } + if (alpha) { + line_buffers[3] = alpha_line_rect.ConstRow(*alpha, iy); + } else { + line_buffers[3] = dec_state->opaque_alpha.data(); + } + std::vector& interleaved = + dec_state->pixel_callback_rows[thread]; + size_t j = 0; + for (size_t i = 0; i < image_line_rect.xsize(); i++) { + interleaved[j++] = line_buffers[0][i]; + interleaved[j++] = line_buffers[1][i]; + interleaved[j++] = line_buffers[2][i]; + if (dec_state->rgb_output_is_rgba) { + interleaved[j++] = line_buffers[3][i]; + } + } + dec_state->pixel_callback(interleaved.data(), image_line_rect.x0(), + image_line_rect.y0() + iy, + image_line_rect.xsize()); + } + } + } + } + + return true; +} + +Status FinalizeFrameDecoding(ImageBundle* decoded, + PassesDecoderState* dec_state, ThreadPool* pool, + bool force_fir, bool skip_blending) { + const FrameHeader& frame_header = dec_state->shared->frame_header; + const FrameDimensions& frame_dim = dec_state->shared->frame_dim; + + // FinalizeImageRect was not yet run, or we are forcing a run. 
+ if (!dec_state->EagerFinalizeImageRect() || force_fir) { + std::vector rects_to_process; + for (size_t y = 0; y < frame_dim.ysize_padded; y += kGroupDim) { + for (size_t x = 0; x < frame_dim.xsize_padded; x += kGroupDim) { + Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize_padded, + frame_dim.ysize_padded); + if (rect.xsize() == 0 || rect.ysize() == 0) continue; + rects_to_process.push_back(rect); + } + } + const auto allocate_storage = [&](size_t num_threads) { + dec_state->EnsureStorage(num_threads); + return true; + }; + + { + std::vector ecs; + const ImageMetadata& metadata = frame_header.nonserialized_metadata->m; + for (size_t i = 0; i < metadata.num_extra_channels; i++) { + if (frame_header.extra_channel_upsampling[i] == 1) { + ecs.push_back(std::move(dec_state->extra_channels[i])); + } else { + ecs.emplace_back(frame_dim.xsize_upsampled_padded, + frame_dim.ysize_upsampled_padded); + } + } + decoded->SetExtraChannels(std::move(ecs)); + } + + std::atomic apply_features_ok{true}; + auto run_apply_features = [&](size_t rect_id, size_t thread) { + size_t xstart = PassesDecoderState::kGroupDataXBorder; + size_t ystart = PassesDecoderState::kGroupDataYBorder; + for (size_t c = 0; c < 3; c++) { + Rect rh(rects_to_process[rect_id].x0() >> + frame_header.chroma_subsampling.HShift(c), + rects_to_process[rect_id].y0() >> + frame_header.chroma_subsampling.VShift(c), + rects_to_process[rect_id].xsize() >> + frame_header.chroma_subsampling.HShift(c), + rects_to_process[rect_id].ysize() >> + frame_header.chroma_subsampling.VShift(c)); + Rect group_data_rect(xstart, ystart, rh.xsize(), rh.ysize()); + // Poison the image in this thread to prevent leaking initialized data + // from a previous run in this thread in msan builds. 
+ msan::PoisonImage(dec_state->group_data[thread].Plane(c)); + CopyImageToWithPadding( + rh, dec_state->decoded.Plane(c), dec_state->FinalizeRectPadding(), + group_data_rect, &dec_state->group_data[thread].Plane(c)); + } + Rect group_data_rect(xstart, ystart, rects_to_process[rect_id].xsize(), + rects_to_process[rect_id].ysize()); + std::vector> ec_rects; + ec_rects.reserve(decoded->extra_channels().size()); + for (size_t i = 0; i < decoded->extra_channels().size(); i++) { + Rect r = ScaleRectForEC(rects_to_process[rect_id], frame_header, i); + if (frame_header.extra_channel_upsampling[i] != 1) { + Rect ec_input_rect(kBlockDim, 2, r.xsize(), r.ysize()); + auto eti = + &dec_state + ->ec_temp_images[thread * decoded->extra_channels().size() + + i]; + // Poison the temp image on this thread to prevent leaking initialized + // data from a previous run in this thread in msan builds. + msan::PoisonImage(*eti); + CopyImageToWithPadding(r, dec_state->extra_channels[i], + /*padding=*/2, ec_input_rect, eti); + ec_rects.emplace_back(eti, ec_input_rect); + } else { + ec_rects.emplace_back(&decoded->extra_channels()[i], r); + } + } + if (!FinalizeImageRect(&dec_state->group_data[thread], group_data_rect, + ec_rects, dec_state, thread, decoded, + rects_to_process[rect_id])) { + apply_features_ok = false; + } + }; + + RunOnPool(pool, 0, rects_to_process.size(), allocate_storage, + run_apply_features, "ApplyFeatures"); + + if (!apply_features_ok) { + return JXL_FAILURE("FinalizeImageRect failed"); + } + } + + const size_t xsize = frame_dim.xsize_upsampled; + const size_t ysize = frame_dim.ysize_upsampled; + + decoded->ShrinkTo(xsize, ysize); + if (dec_state->pre_color_transform_frame.xsize() != 0) { + dec_state->pre_color_transform_frame.ShrinkTo(xsize, ysize); + } + + if (!skip_blending && ImageBlender::NeedsBlending(dec_state)) { + if (dec_state->pre_color_transform_frame.xsize() != 0) { + // Extra channels are going to be modified. Make a copy. 
+ dec_state->pre_color_transform_ec.clear(); + for (const auto& ec : decoded->extra_channels()) { + dec_state->pre_color_transform_ec.emplace_back(CopyImage(ec)); + } + } + ImageBlender blender; + ImageBundle foreground = std::move(*decoded); + decoded->SetFromImage(Image3F(frame_header.nonserialized_metadata->xsize(), + frame_header.nonserialized_metadata->ysize()), + foreground.c_current()); + std::vector extra_channels_rects; + decoded->extra_channels().reserve(foreground.extra_channels().size()); + extra_channels_rects.reserve(foreground.extra_channels().size()); + for (size_t i = 0; i < foreground.extra_channels().size(); ++i) { + decoded->extra_channels().emplace_back( + frame_header.nonserialized_metadata->xsize(), + frame_header.nonserialized_metadata->ysize()); + extra_channels_rects.emplace_back(decoded->extra_channels().back()); + } + JXL_RETURN_IF_ERROR(blender.PrepareBlending( + dec_state, foreground.origin, foreground.xsize(), foreground.ysize(), + &frame_header.nonserialized_metadata->m.extra_channel_info, + foreground.c_current(), Rect(*decoded->color()), + /*output=*/decoded->color(), Rect(*decoded->color()), + &decoded->extra_channels(), std::move(extra_channels_rects))); + + std::vector rects_to_process; + for (size_t y = 0; y < frame_dim.ysize; y += kGroupDim) { + for (size_t x = 0; x < frame_dim.xsize; x += kGroupDim) { + Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize, frame_dim.ysize); + if (rect.xsize() == 0 || rect.ysize() == 0) continue; + rects_to_process.push_back(rect); + } + } + + std::atomic blending_ok{true}; + JXL_RETURN_IF_ERROR(RunOnPool( + pool, 0, rects_to_process.size(), ThreadPool::SkipInit(), + [&](size_t i, size_t /*thread*/) { + const Rect& rect = rects_to_process[i]; + auto rect_blender = blender.PrepareRect( + rect, *foreground.color(), foreground.extra_channels(), rect); + for (size_t y = 0; y < rect.ysize(); ++y) { + if (!rect_blender.DoBlending(y)) { + blending_ok = false; + return; + } + } + }, + "Blend")); + 
JXL_RETURN_IF_ERROR(blending_ok.load()); + } + + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.h new file mode 100644 index 0000000000..4fa9179b37 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_reconstruct.h @@ -0,0 +1,69 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_RECONSTRUCT_H_ +#define LIB_JXL_DEC_RECONSTRUCT_H_ + +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" + +namespace jxl { + +// Finalizes the decoding of a frame by applying image features if necessary, +// doing color transforms (unless the frame header specifies +// `SaveBeforeColorTransform()`) and applying upsampling. +// +// Writes pixels in the appropriate colorspace to `idct`, shrinking it if +// necessary. +// `skip_blending` is necessary because the encoder butteraugli loop does not +// (yet) handle blending. +// TODO(veluca): remove the "force_fir" parameter, and call EPF directly in +// those use cases where this is needed. +Status FinalizeFrameDecoding(ImageBundle* JXL_RESTRICT decoded, + PassesDecoderState* dec_state, ThreadPool* pool, + bool force_fir, bool skip_blending); + +// Renders the `frame_rect` portion of the final image to `output_image` +// (unless the frame is upsampled - in which case, `frame_rect` is scaled +// accordingly). `input_rect` should have the same shape. `input_rect` always +// refers to the non-padded pixels. 
`frame_rect.x0()` is guaranteed to be a +// multiple of GroupBorderAssigner::kPaddingRoundX. `frame_rect.xsize()` is +// either a multiple of GroupBorderAssigner::kPaddingRoundX, or is such that +// `frame_rect.x0() + frame_rect.xsize() == frame_dim.xsize`. `input_image` +// may be mutated by adding padding. If `frame_rect` is on an image border, the +// input will be padded. Otherwise, appropriate padding must already be present. +Status FinalizeImageRect( + Image3F* input_image, const Rect& input_rect, + const std::vector>& extra_channels, + PassesDecoderState* dec_state, size_t thread, + ImageBundle* JXL_RESTRICT output_image, const Rect& frame_rect); + +// Fills padding around `img:rect` in the x direction by mirroring. Padding is +// applied so that a full border of xpadding and ypadding is available, except +// if `image_rect` points to an area of the full image that touches the top or +// the bottom. It is expected that padding is already in place for inputs such +// that the corresponding image_rect is not at an image border. +void EnsurePaddingInPlace(Image3F* img, const Rect& rect, + const Rect& image_rect, size_t image_xsize, + size_t image_ysize, size_t xpadding, size_t ypadding); + +// For DC in the API. +void UndoXYB(const Image3F& src, Image3F* dst, + const OutputEncodingInfo& output_info, ThreadPool* pool); + +} // namespace jxl + +#endif // LIB_JXL_DEC_RECONSTRUCT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_render_pipeline.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_render_pipeline.h new file mode 100644 index 0000000000..9496770a6d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_render_pipeline.h @@ -0,0 +1,91 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_DEC_RENDER_PIPELINE_H_ +#define LIB_JXL_DEC_RENDER_PIPELINE_H_ + +#include + +#include "lib/jxl/filters.h" + +namespace jxl { + +// The first pixel in the input to RenderPipelineStage will be located at +// this position. Pixels before this position may be accessed as padding. +constexpr size_t kRenderPipelineXOffset = 16; + +enum class RenderPipelineChannelMode { + kIgnored = 0, + kInPlace = 1, + kInOut = 2, +}; + +class RenderPipelineStage { + public: + // `input` points to `2*MaxPaddingY() + 1` pointers, each of which points to + // `3+num_non_color_channels` pointer-to-row. So, `input[MaxPaddingY()][0]` is + // the pointer to the center row of the first color channel. + // `MaxPaddingY()` is the maximum value returned by `GetPaddingX()`; + // typically, this is a constant. + // `output` points to `1<>& channel_shifts) { + JXL_ABORT("Not implemented"); + } + + // Adds a stage to the pipeline. The shifts for all the channels that are not + // kIgnored by the stage must be identical at this point. + void AddStage(std::unique_ptr stage) { + JXL_ABORT("Not implemented"); + } + + // Finalizes setup of the pipeline. Shifts for all channels should be 0 at + // this point. + void Finalize() { JXL_ABORT("Not implemented"); } + + // Allocates storage to run with `num` threads. + void PrepareForThreads(size_t num) { JXL_ABORT("Not implemented"); } + + // TBD: run the pipeline for a given input, on a given thread. + // void Run(Image3F* color_data, ImageF* ec_data, const Rect& input_rect, + // size_t thread, size_t xpos, size_t ypos) {} +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_RENDER_PIPELINE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms-inl.h new file mode 100644 index 0000000000..c9aebc6b99 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms-inl.h @@ -0,0 +1,867 @@ +// Copyright (c) the JPEG XL Project Authors. 
All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_DEC_TRANSFORMS_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DEC_TRANSFORMS_INL_H_ +#undef LIB_JXL_DEC_TRANSFORMS_INL_H_ +#else +#define LIB_JXL_DEC_TRANSFORMS_INL_H_ +#endif + +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dct-inl.h" +#include "lib/jxl/dct_scales.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +template +struct DoDCT { + template + void operator()(const From& from, float* JXL_RESTRICT to, + float* JXL_RESTRICT scratch_space) { + ComputeScaledDCT()(from, to, scratch_space); + } +}; + +template +struct DoDCT { + template + void operator()(const From& from, float* JXL_RESTRICT to, + float* JXL_RESTRICT scratch_space) { + ComputeTransposedScaledDCT()(from, to, scratch_space); + } +}; + +// Computes the lowest-frequency LF_ROWSxLF_COLS-sized square in output, which +// is a DCT_ROWS*DCT_COLS-sized DCT block, by doing a ROWS*COLS DCT on the +// input block. +template +JXL_INLINE void ReinterpretingDCT(const float* input, const size_t input_stride, + float* output, const size_t output_stride) { + static_assert(LF_ROWS == ROWS, + "ReinterpretingDCT should only be called with LF == N"); + static_assert(LF_COLS == COLS, + "ReinterpretingDCT should only be called with LF == N"); + HWY_ALIGN float block[ROWS * COLS]; + + // ROWS, COLS <= 8, so we can put scratch space on the stack. 
+ HWY_ALIGN float scratch_space[ROWS * COLS]; + DoDCT()(DCTFrom(input, input_stride), block, scratch_space); + if (ROWS < COLS) { + for (size_t y = 0; y < LF_ROWS; y++) { + for (size_t x = 0; x < LF_COLS; x++) { + output[y * output_stride + x] = + block[y * COLS + x] * DCTTotalResampleScale(y) * + DCTTotalResampleScale(x); + } + } + } else { + for (size_t y = 0; y < LF_COLS; y++) { + for (size_t x = 0; x < LF_ROWS; x++) { + output[y * output_stride + x] = + block[y * ROWS + x] * DCTTotalResampleScale(y) * + DCTTotalResampleScale(x); + } + } + } +} + +template +void IDCT2TopBlock(const float* block, size_t stride_out, float* out) { + static_assert(kBlockDim % S == 0, "S should be a divisor of kBlockDim"); + static_assert(S % 2 == 0, "S should be even"); + float temp[kDCTBlockSize]; + constexpr size_t num_2x2 = S / 2; + for (size_t y = 0; y < num_2x2; y++) { + for (size_t x = 0; x < num_2x2; x++) { + float c00 = block[y * kBlockDim + x]; + float c01 = block[y * kBlockDim + num_2x2 + x]; + float c10 = block[(y + num_2x2) * kBlockDim + x]; + float c11 = block[(y + num_2x2) * kBlockDim + num_2x2 + x]; + float r00 = c00 + c01 + c10 + c11; + float r01 = c00 + c01 - c10 - c11; + float r10 = c00 - c01 + c10 - c11; + float r11 = c00 - c01 - c10 + c11; + temp[y * 2 * kBlockDim + x * 2] = r00; + temp[y * 2 * kBlockDim + x * 2 + 1] = r01; + temp[(y * 2 + 1) * kBlockDim + x * 2] = r10; + temp[(y * 2 + 1) * kBlockDim + x * 2 + 1] = r11; + } + } + for (size_t y = 0; y < S; y++) { + for (size_t x = 0; x < S; x++) { + out[y * stride_out + x] = temp[y * kBlockDim + x]; + } + } +} + +void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) { + HWY_ALIGN static constexpr float k4x4AFVBasis[16][16] = { + { + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + }, + { + 0.876902929799142f, + 0.2206518106944235f, + -0.10140050393753763f, + -0.1014005039375375f, + 0.2206518106944236f, + 
-0.10140050393753777f, + -0.10140050393753772f, + -0.10140050393753763f, + -0.10140050393753758f, + -0.10140050393753769f, + -0.1014005039375375f, + -0.10140050393753768f, + -0.10140050393753768f, + -0.10140050393753759f, + -0.10140050393753763f, + -0.10140050393753741f, + }, + { + 0.0, + 0.0, + 0.40670075830260755f, + 0.44444816619734445f, + 0.0, + 0.0, + 0.19574399372042936f, + 0.2929100136981264f, + -0.40670075830260716f, + -0.19574399372042872f, + 0.0, + 0.11379074460448091f, + -0.44444816619734384f, + -0.29291001369812636f, + -0.1137907446044814f, + 0.0, + }, + { + 0.0, + 0.0, + -0.21255748058288748f, + 0.3085497062849767f, + 0.0, + 0.4706702258572536f, + -0.1621205195722993f, + 0.0, + -0.21255748058287047f, + -0.16212051957228327f, + -0.47067022585725277f, + -0.1464291867126764f, + 0.3085497062849487f, + 0.0, + -0.14642918671266536f, + 0.4251149611657548f, + }, + { + 0.0, + -0.7071067811865474f, + 0.0, + 0.0, + 0.7071067811865476f, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + }, + { + -0.4105377591765233f, + 0.6235485373547691f, + -0.06435071657946274f, + -0.06435071657946266f, + 0.6235485373547694f, + -0.06435071657946284f, + -0.0643507165794628f, + -0.06435071657946274f, + -0.06435071657946272f, + -0.06435071657946279f, + -0.06435071657946266f, + -0.06435071657946277f, + -0.06435071657946277f, + -0.06435071657946273f, + -0.06435071657946274f, + -0.0643507165794626f, + }, + { + 0.0, + 0.0, + -0.4517556589999482f, + 0.15854503551840063f, + 0.0, + -0.04038515160822202f, + 0.0074182263792423875f, + 0.39351034269210167f, + -0.45175565899994635f, + 0.007418226379244351f, + 0.1107416575309343f, + 0.08298163094882051f, + 0.15854503551839705f, + 0.3935103426921022f, + 0.0829816309488214f, + -0.45175565899994796f, + }, + { + 0.0, + 0.0, + -0.304684750724869f, + 0.5112616136591823f, + 0.0, + 0.0, + -0.290480129728998f, + -0.06578701549142804f, + 0.304684750724884f, + 0.2904801297290076f, + 0.0, + -0.23889773523344604f, + 
-0.5112616136592012f, + 0.06578701549142545f, + 0.23889773523345467f, + 0.0, + }, + { + 0.0, + 0.0, + 0.3017929516615495f, + 0.25792362796341184f, + 0.0, + 0.16272340142866204f, + 0.09520022653475037f, + 0.0, + 0.3017929516615503f, + 0.09520022653475055f, + -0.16272340142866173f, + -0.35312385449816297f, + 0.25792362796341295f, + 0.0, + -0.3531238544981624f, + -0.6035859033230976f, + }, + { + 0.0, + 0.0, + 0.40824829046386274f, + 0.0, + 0.0, + 0.0, + 0.0, + -0.4082482904638628f, + -0.4082482904638635f, + 0.0, + 0.0, + -0.40824829046386296f, + 0.0, + 0.4082482904638634f, + 0.408248290463863f, + 0.0, + }, + { + 0.0, + 0.0, + 0.1747866975480809f, + 0.0812611176717539f, + 0.0, + 0.0, + -0.3675398009862027f, + -0.307882213957909f, + -0.17478669754808135f, + 0.3675398009862011f, + 0.0, + 0.4826689115059883f, + -0.08126111767175039f, + 0.30788221395790305f, + -0.48266891150598584f, + 0.0, + }, + { + 0.0, + 0.0, + -0.21105601049335784f, + 0.18567180916109802f, + 0.0, + 0.0, + 0.49215859013738733f, + -0.38525013709251915f, + 0.21105601049335806f, + -0.49215859013738905f, + 0.0, + 0.17419412659916217f, + -0.18567180916109904f, + 0.3852501370925211f, + -0.1741941265991621f, + 0.0, + }, + { + 0.0, + 0.0, + -0.14266084808807264f, + -0.3416446842253372f, + 0.0, + 0.7367497537172237f, + 0.24627107722075148f, + -0.08574019035519306f, + -0.14266084808807344f, + 0.24627107722075137f, + 0.14883399227113567f, + -0.04768680350229251f, + -0.3416446842253373f, + -0.08574019035519267f, + -0.047686803502292804f, + -0.14266084808807242f, + }, + { + 0.0, + 0.0, + -0.13813540350758585f, + 0.3302282550303788f, + 0.0, + 0.08755115000587084f, + -0.07946706605909573f, + -0.4613374887461511f, + -0.13813540350758294f, + -0.07946706605910261f, + 0.49724647109535086f, + 0.12538059448563663f, + 0.3302282550303805f, + -0.4613374887461554f, + 0.12538059448564315f, + -0.13813540350758452f, + }, + { + 0.0, + 0.0, + -0.17437602599651067f, + 0.0702790691196284f, + 0.0, + -0.2921026642334881f, + 
0.3623817333531167f, + 0.0, + -0.1743760259965108f, + 0.36238173335311646f, + 0.29210266423348785f, + -0.4326608024727445f, + 0.07027906911962818f, + 0.0, + -0.4326608024727457f, + 0.34875205199302267f, + }, + { + 0.0, + 0.0, + 0.11354987314994337f, + -0.07417504595810355f, + 0.0, + 0.19402893032594343f, + -0.435190496523228f, + 0.21918684838857466f, + 0.11354987314994257f, + -0.4351904965232251f, + 0.5550443808910661f, + -0.25468277124066463f, + -0.07417504595810233f, + 0.2191868483885728f, + -0.25468277124066413f, + 0.1135498731499429f, + }, + }; + + const HWY_CAPPED(float, 16) d; + for (size_t i = 0; i < 16; i += Lanes(d)) { + auto pixel = Zero(d); + for (size_t j = 0; j < 16; j++) { + auto cf = Set(d, coeffs[j]); + auto basis = Load(d, k4x4AFVBasis[j] + i); + pixel = MulAdd(cf, basis, pixel); + } + Store(pixel, d, pixels + i); + } +} + +template +void AFVTransformToPixels(const float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT pixels, size_t pixels_stride) { + HWY_ALIGN float scratch_space[4 * 8]; + size_t afv_x = afv_kind & 1; + size_t afv_y = afv_kind / 2; + float dcs[3] = {}; + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + dcs[0] = (block00 + block10 + block01) * 4.0f; + dcs[1] = (block00 + block10 - block01); + dcs[2] = block00 - block10; + // IAFV: (even, even) positions. + HWY_ALIGN float coeff[4 * 4]; + coeff[0] = dcs[0]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 0 && iy == 0) continue; + coeff[iy * 4 + ix] = coefficients[iy * 2 * 8 + ix * 2]; + } + } + HWY_ALIGN float block[4 * 8]; + AFVIDCT4x4(coeff, block); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + pixels[(iy + afv_y * 4) * pixels_stride + afv_x * 4 + ix] = + block[(afv_y == 1 ? 3 - iy : iy) * 4 + (afv_x == 1 ? 3 - ix : ix)]; + } + } + // IDCT4x4 in (odd, even) positions. 
+ block[0] = dcs[1]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 4 + ix] = coefficients[iy * 2 * 8 + ix * 2 + 1]; + } + } + ComputeTransposedScaledIDCT<4>()( + block, + DCTTo(pixels + afv_y * 4 * pixels_stride + (afv_x == 1 ? 0 : 4), + pixels_stride), + scratch_space); + // IDCT4x8. + block[0] = dcs[2]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 8 + ix] = coefficients[(1 + iy * 2) * 8 + ix]; + } + } + ComputeScaledIDCT<4, 8>()( + block, + DCTTo(pixels + (afv_y == 1 ? 0 : 4) * pixels_stride, pixels_stride), + scratch_space); +} + +HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT pixels, + size_t pixels_stride, + float* scratch_space) { + using Type = AcStrategy::Type; + switch (strategy) { + case Type::IDENTITY: { + PROFILER_ZONE("IDCT Identity"); + float dcs[4] = {}; + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + float block11 = coefficients[9]; + dcs[0] = block00 + block01 + block10 + block11; + dcs[1] = block00 + block01 - block10 - block11; + dcs[2] = block00 - block01 + block10 - block11; + dcs[3] = block00 - block01 - block10 + block11; + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + float block_dc = dcs[y * 2 + x]; + float residual_sum = 0; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 0 && iy == 0) continue; + residual_sum += coefficients[(y + iy * 2) * 8 + x + ix * 2]; + } + } + pixels[(4 * y + 1) * pixels_stride + 4 * x + 1] = + block_dc - residual_sum * (1.0f / 16); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 1 && iy == 1) continue; + pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix] = + coefficients[(y + iy * 2) * 8 + x + ix * 2] + + pixels[(4 * y 
+ 1) * pixels_stride + 4 * x + 1]; + } + } + pixels[y * 4 * pixels_stride + x * 4] = + coefficients[(y + 2) * 8 + x + 2] + + pixels[(4 * y + 1) * pixels_stride + 4 * x + 1]; + } + } + break; + } + case Type::DCT8X4: { + PROFILER_ZONE("IDCT 8x4"); + float dcs[2] = {}; + float block0 = coefficients[0]; + float block1 = coefficients[8]; + dcs[0] = block0 + block1; + dcs[1] = block0 - block1; + for (size_t x = 0; x < 2; x++) { + HWY_ALIGN float block[4 * 8]; + block[0] = dcs[x]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 8 + ix] = coefficients[(x + iy * 2) * 8 + ix]; + } + } + ComputeScaledIDCT<8, 4>()(block, DCTTo(pixels + x * 4, pixels_stride), + scratch_space); + } + break; + } + case Type::DCT4X8: { + PROFILER_ZONE("IDCT 4x8"); + float dcs[2] = {}; + float block0 = coefficients[0]; + float block1 = coefficients[8]; + dcs[0] = block0 + block1; + dcs[1] = block0 - block1; + for (size_t y = 0; y < 2; y++) { + HWY_ALIGN float block[4 * 8]; + block[0] = dcs[y]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + if (ix == 0 && iy == 0) continue; + block[iy * 8 + ix] = coefficients[(y + iy * 2) * 8 + ix]; + } + } + ComputeScaledIDCT<4, 8>()( + block, DCTTo(pixels + y * 4 * pixels_stride, pixels_stride), + scratch_space); + } + break; + } + case Type::DCT4X4: { + PROFILER_ZONE("IDCT 4"); + float dcs[4] = {}; + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + float block11 = coefficients[9]; + dcs[0] = block00 + block01 + block10 + block11; + dcs[1] = block00 + block01 - block10 - block11; + dcs[2] = block00 - block01 + block10 - block11; + dcs[3] = block00 - block01 - block10 + block11; + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + HWY_ALIGN float block[4 * 4]; + block[0] = dcs[y * 2 + x]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 0 && iy 
== 0) continue; + block[iy * 4 + ix] = coefficients[(y + iy * 2) * 8 + x + ix * 2]; + } + } + ComputeTransposedScaledIDCT<4>()( + block, + DCTTo(pixels + y * 4 * pixels_stride + x * 4, pixels_stride), + scratch_space); + } + } + break; + } + case Type::DCT2X2: { + PROFILER_ZONE("IDCT 2"); + HWY_ALIGN float coeffs[kDCTBlockSize]; + memcpy(coeffs, coefficients, sizeof(float) * kDCTBlockSize); + IDCT2TopBlock<2>(coeffs, kBlockDim, coeffs); + IDCT2TopBlock<4>(coeffs, kBlockDim, coeffs); + IDCT2TopBlock<8>(coeffs, kBlockDim, coeffs); + for (size_t y = 0; y < kBlockDim; y++) { + for (size_t x = 0; x < kBlockDim; x++) { + pixels[y * pixels_stride + x] = coeffs[y * kBlockDim + x]; + } + } + break; + } + case Type::DCT16X16: { + PROFILER_ZONE("IDCT 16"); + ComputeTransposedScaledIDCT<16>()( + coefficients, DCTTo(pixels, pixels_stride), scratch_space); + break; + } + case Type::DCT16X8: { + PROFILER_ZONE("IDCT 16x8"); + ComputeScaledIDCT<16, 8>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT8X16: { + PROFILER_ZONE("IDCT 8x16"); + ComputeScaledIDCT<8, 16>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT32X8: { + PROFILER_ZONE("IDCT 32x8"); + ComputeScaledIDCT<32, 8>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT8X32: { + PROFILER_ZONE("IDCT 8x32"); + ComputeScaledIDCT<8, 32>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT32X16: { + PROFILER_ZONE("IDCT 32x16"); + ComputeScaledIDCT<32, 16>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT16X32: { + PROFILER_ZONE("IDCT 16x32"); + ComputeScaledIDCT<16, 32>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT32X32: { + PROFILER_ZONE("IDCT 32"); + ComputeTransposedScaledIDCT<32>()( + coefficients, DCTTo(pixels, pixels_stride), scratch_space); + break; + } + case 
Type::DCT: { + PROFILER_ZONE("IDCT 8"); + ComputeTransposedScaledIDCT<8>()( + coefficients, DCTTo(pixels, pixels_stride), scratch_space); + break; + } + case Type::AFV0: { + PROFILER_ZONE("IAFV0"); + AFVTransformToPixels<0>(coefficients, pixels, pixels_stride); + break; + } + case Type::AFV1: { + PROFILER_ZONE("IAFV1"); + AFVTransformToPixels<1>(coefficients, pixels, pixels_stride); + break; + } + case Type::AFV2: { + PROFILER_ZONE("IAFV2"); + AFVTransformToPixels<2>(coefficients, pixels, pixels_stride); + break; + } + case Type::AFV3: { + PROFILER_ZONE("IAFV3"); + AFVTransformToPixels<3>(coefficients, pixels, pixels_stride); + break; + } + case Type::DCT64X32: { + PROFILER_ZONE("IDCT 64x32"); + ComputeScaledIDCT<64, 32>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT32X64: { + PROFILER_ZONE("IDCT 32x64"); + ComputeScaledIDCT<32, 64>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT64X64: { + PROFILER_ZONE("IDCT 64"); + ComputeTransposedScaledIDCT<64>()( + coefficients, DCTTo(pixels, pixels_stride), scratch_space); + break; + } + case Type::DCT128X64: { + PROFILER_ZONE("IDCT 128x64"); + ComputeScaledIDCT<128, 64>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT64X128: { + PROFILER_ZONE("IDCT 64x128"); + ComputeScaledIDCT<64, 128>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT128X128: { + PROFILER_ZONE("IDCT 128"); + ComputeTransposedScaledIDCT<128>()( + coefficients, DCTTo(pixels, pixels_stride), scratch_space); + break; + } + case Type::DCT256X128: { + PROFILER_ZONE("IDCT 256x128"); + ComputeScaledIDCT<256, 128>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT128X256: { + PROFILER_ZONE("IDCT 128x256"); + ComputeScaledIDCT<128, 256>()(coefficients, DCTTo(pixels, pixels_stride), + scratch_space); + break; + } + case Type::DCT256X256: { + 
PROFILER_ZONE("IDCT 256"); + ComputeTransposedScaledIDCT<256>()( + coefficients, DCTTo(pixels, pixels_stride), scratch_space); + break; + } + case Type::kNumValidStrategies: + JXL_ABORT("Invalid strategy"); + } +} + +HWY_MAYBE_UNUSED void LowestFrequenciesFromDC(const AcStrategy::Type strategy, + const float* dc, size_t dc_stride, + float* llf) { + using Type = AcStrategy::Type; + switch (strategy) { + case Type::DCT16X8: { + ReinterpretingDCT( + dc, dc_stride, llf, 2 * kBlockDim); + break; + } + case Type::DCT8X16: { + ReinterpretingDCT( + dc, dc_stride, llf, 2 * kBlockDim); + break; + } + case Type::DCT16X16: { + ReinterpretingDCT( + dc, dc_stride, llf, 2 * kBlockDim); + break; + } + case Type::DCT32X8: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT8X32: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT32X16: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT16X32: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT32X32: { + ReinterpretingDCT( + dc, dc_stride, llf, 4 * kBlockDim); + break; + } + case Type::DCT64X32: { + ReinterpretingDCT( + dc, dc_stride, llf, 8 * kBlockDim); + break; + } + case Type::DCT32X64: { + ReinterpretingDCT( + dc, dc_stride, llf, 8 * kBlockDim); + break; + } + case Type::DCT64X64: { + ReinterpretingDCT( + dc, dc_stride, llf, 8 * kBlockDim); + break; + } + case Type::DCT128X64: { + ReinterpretingDCT( + dc, dc_stride, llf, 16 * kBlockDim); + break; + } + case Type::DCT64X128: { + ReinterpretingDCT( + dc, dc_stride, llf, 16 * kBlockDim); + break; + } + case Type::DCT128X128: { + ReinterpretingDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>( + dc, dc_stride, llf, 16 * kBlockDim); + break; + } + case Type::DCT256X128: { + ReinterpretingDCT< + /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + 
/*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>( + dc, dc_stride, llf, 32 * kBlockDim); + break; + } + case Type::DCT128X256: { + ReinterpretingDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>( + dc, dc_stride, llf, 32 * kBlockDim); + break; + } + case Type::DCT256X256: { + ReinterpretingDCT< + /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim, + /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>( + dc, dc_stride, llf, 32 * kBlockDim); + break; + } + case Type::DCT: + case Type::DCT2X2: + case Type::DCT4X4: + case Type::DCT4X8: + case Type::DCT8X4: + case Type::AFV0: + case Type::AFV1: + case Type::AFV2: + case Type::AFV3: + case Type::IDENTITY: + llf[0] = dc[0]; + break; + case Type::kNumValidStrategies: + JXL_ABORT("Invalid strategy"); + }; +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_DEC_TRANSFORMS_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms_testonly.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms_testonly.cc new file mode 100644 index 0000000000..9ee80c59dc --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms_testonly.cc @@ -0,0 +1,41 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/dec_transforms_testonly.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_transforms_testonly.cc" +#include +#include + +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dec_transforms-inl.h" + +namespace jxl { + +#if HWY_ONCE +HWY_EXPORT(TransformToPixels); +void TransformToPixels(AcStrategy::Type strategy, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT pixels, size_t pixels_stride, + float* scratch_space) { + return HWY_DYNAMIC_DISPATCH(TransformToPixels)(strategy, coefficients, pixels, + pixels_stride, scratch_space); +} + +HWY_EXPORT(LowestFrequenciesFromDC); +void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy, + const float* dc, size_t dc_stride, float* llf) { + return HWY_DYNAMIC_DISPATCH(LowestFrequenciesFromDC)(strategy, dc, dc_stride, + llf); +} + +HWY_EXPORT(AFVIDCT4x4); +void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) { + return HWY_DYNAMIC_DISPATCH(AFVIDCT4x4)(coeffs, pixels); +} +#endif // HWY_ONCE + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms_testonly.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms_testonly.h new file mode 100644 index 0000000000..97c4ca543d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_transforms_testonly.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_ +#define LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_ + +// Facade for (non-inlined) inverse integral transforms. 
+ +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +void TransformToPixels(AcStrategy::Type strategy, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT pixels, size_t pixels_stride, + float* JXL_RESTRICT scratch_space); + +// Equivalent of the above for DC image. +void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy, + const float* dc, size_t dc_stride, float* llf); + +void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels); + +} // namespace jxl + +#endif // LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc new file mode 100644 index 0000000000..9c7a5e5a92 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.cc @@ -0,0 +1,375 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/dec_upsample.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_upsample.cc" +#include +#include + +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/image_ops.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +void InitKernel(const float* weights, CacheAlignedUniquePtr* kernel_storage, + size_t N, size_t x_repeat) { + const size_t NX = N * x_repeat; + const size_t N2 = N / 2; + HWY_FULL(float) df; + const size_t V = Lanes(df); + const size_t num_kernels = N * NX; + + constexpr const size_t M = 2 * Upsampler::filter_radius() + 1; + const size_t MX = M + x_repeat - 1; + const size_t num_coeffs = M * MX; + + // Pad kernel slices to vector size. 
+ const size_t stride = RoundUpTo(num_kernels, V); + *kernel_storage = AllocateArray(stride * sizeof(float) * num_coeffs); + float* kernels = reinterpret_cast(kernel_storage->get()); + memset(kernels, 0, stride * sizeof(float) * num_coeffs); + + for (size_t offset = 0; offset < num_coeffs; ++offset) { + size_t iy = offset / MX; + size_t ix = offset % MX; + for (size_t kernel = 0; kernel < num_kernels; ++kernel) { + size_t ky = kernel / NX; + size_t kx_ = kernel % NX; + size_t kx = kx_ % N; + size_t shift = kx_ / N; + if ((ix < shift) || (ix - shift >= M)) continue; // 0 weight from memset. + // Only weights for top-left 1 / 4 of kernels are specified; other 3 / 4 + // kernels are produced by vertical and horizontal mirroring. + size_t j = (ky < N2) ? (iy + M * ky) : ((M - 1 - iy) + M * (N - 1 - ky)); + size_t i = (kx < N2) ? (ix - shift + M * kx) + : ((M - 1 - (ix - shift)) + M * (N - 1 - kx)); + // (y, x) = sorted(i, j) + // the matrix built of kernel matrices as blocks is symmetric. + size_t y = std::min(i, j); + size_t x = std::max(i, j); + // Take the weight from "triangle" coordinates. + float weight = weights[M * N2 * y - y * (y - 1) / 2 + x - y]; + kernels[offset * stride + kernel] = weight; + } + } +} + +template +void Upsample(const ImageF& src, const Rect& src_rect, ImageF* dst, + const Rect& dst_rect, const float* kernels, + ssize_t image_y_offset, size_t image_ysize, float* arena) { + constexpr const size_t M = 2 * Upsampler::filter_radius() + 1; + constexpr const size_t M2 = M / 2; + JXL_DASSERT(src_rect.x0() >= M2); + const size_t src_x_limit = src_rect.x0() + src_rect.xsize() + M2; + JXL_DASSERT(src_x_limit <= src.xsize()); + JXL_ASSERT(DivCeil(dst_rect.xsize(), N) <= src_rect.xsize()); + // TODO(eustas): add proper (src|dst) ysize check that accounts for mirroring. 
+ + constexpr const size_t MX = M + x_repeat - 1; + constexpr const size_t num_coeffs = M * MX; + + constexpr const size_t NX = N * x_repeat; + + HWY_FULL(float) df; + const size_t V = Lanes(df); + const size_t num_kernels = N * NX; + const size_t stride = RoundUpTo(num_kernels, V); + + const size_t rsx = DivCeil(dst_rect.xsize(), N); + const size_t dsx = rsx + 2 * M2; + // Round-down to complete vectors. + const size_t dsx_v = V * (dsx / V); + + float* JXL_RESTRICT in = arena; + arena += RoundUpTo(num_coeffs, V); + float* JXL_RESTRICT out = arena; + arena += stride; + float* JXL_RESTRICT raw_min_row = arena; + arena += RoundUpTo(dsx + V, V); + float* JXL_RESTRICT raw_max_row = arena; + arena += RoundUpTo(dsx + V, V); + float* JXL_RESTRICT min_row = arena; + arena += RoundUpTo(rsx * N + V, V); + float* JXL_RESTRICT max_row = arena; + arena += RoundUpTo(rsx * N + V, V); + + memset(raw_min_row + dsx_v, 0, sizeof(float) * (V + dsx - dsx_v)); + memset(raw_max_row + dsx_v, 0, sizeof(float) * (V + dsx - dsx_v)); + memset(min_row + dst_rect.xsize(), 0, sizeof(float) * V); + memset(max_row + dst_rect.xsize(), 0, sizeof(float) * V); + + // For min/max reduction. + const size_t span_tail_len = M % V; + const bool has_span_tail = (span_tail_len != 0); + JXL_ASSERT(has_span_tail || V <= M); + const size_t span_start = has_span_tail ? 0 : V; + const size_t span_tail_start = M - span_tail_len; + const auto span_tail_mask = Iota(df, 0) < Set(df, span_tail_len); + + // sx and sy correspond to offset in source image. + // x and y correspond to top-left pixel offset in upsampled output image. 
+ for (size_t y = 0; y < dst_rect.ysize(); y += N) { + const float* src_rows[M]; + const size_t sy = y / N; + const ssize_t top = static_cast(sy + src_rect.y0() - M2); + for (size_t iy = 0; iy < M; iy++) { + const ssize_t image_y = top + iy + image_y_offset; + src_rows[iy] = src.Row(Mirror(image_y, image_ysize) - image_y_offset); + } + const size_t sx0 = src_rect.x0() - M2; + for (size_t sx = 0; sx < dsx_v; sx += V) { + static_assert(M == 5, "Filter diameter is expected to be 5"); + const auto r0 = LoadU(df, src_rows[0] + sx0 + sx); + const auto r1 = LoadU(df, src_rows[1] + sx0 + sx); + const auto r2 = LoadU(df, src_rows[2] + sx0 + sx); + const auto r3 = LoadU(df, src_rows[3] + sx0 + sx); + const auto r4 = LoadU(df, src_rows[4] + sx0 + sx); + const auto min0 = Min(r0, r1); + const auto max0 = Max(r0, r1); + const auto min1 = Min(r2, r3); + const auto max1 = Max(r2, r3); + const auto min2 = Min(min0, r4); + const auto max2 = Max(max0, r4); + Store(Min(min1, min2), df, raw_min_row + sx); + Store(Max(max1, max2), df, raw_max_row + sx); + } + for (size_t sx = dsx_v; sx < dsx; sx++) { + static_assert(M == 5, "Filter diameter is expected to be 5"); + const auto r0 = src_rows[0][sx0 + sx]; + const auto r1 = src_rows[1][sx0 + sx]; + const auto r2 = src_rows[2][sx0 + sx]; + const auto r3 = src_rows[3][sx0 + sx]; + const auto r4 = src_rows[4][sx0 + sx]; + const auto min0 = std::min(r0, r1); + const auto max0 = std::max(r0, r1); + const auto min1 = std::min(r2, r3); + const auto max1 = std::max(r2, r3); + const auto min2 = std::min(min0, r4); + const auto max2 = std::max(max0, r4); + raw_min_row[sx] = std::min(min1, min2); + raw_max_row[sx] = std::max(max1, max2); + } + + for (size_t sx = 0; sx < rsx; sx++) { + decltype(Zero(df)) min, max; + if (has_span_tail) { + auto dummy = Set(df, raw_min_row[sx]); + min = IfThenElse(span_tail_mask, + LoadU(df, raw_min_row + sx + span_tail_start), dummy); + max = IfThenElse(span_tail_mask, + LoadU(df, raw_max_row + sx + span_tail_start), 
dummy); + } else { + min = LoadU(df, raw_min_row + sx); + max = LoadU(df, raw_max_row + sx); + } + for (size_t fx = span_start; fx < span_tail_start; fx += V) { + min = Min(LoadU(df, raw_min_row + sx + fx), min); + max = Max(LoadU(df, raw_max_row + sx + fx), max); + } + min = MinOfLanes(min); + max = MaxOfLanes(max); + for (size_t lx = 0; lx < N; lx += V) { + StoreU(min, df, min_row + N * sx + lx); + StoreU(max, df, max_row + N * sx + lx); + } + } + + for (size_t x = 0; x < dst_rect.xsize(); x += NX) { + const size_t sx = x / N; + const size_t xbase = sx + sx0; + // Copy input pixels for "linearization". + for (size_t iy = 0; iy < M; iy++) { + memcpy(in + MX * iy, src_rows[iy] + xbase, MX * sizeof(float)); + } + if (x_repeat > 1) { + // Even if filter coeffs contain 0 at "undefined" values, the result + // might be undefined, because NaN will poison the sum. + if (JXL_UNLIKELY(xbase + MX > src_x_limit)) { + for (size_t iy = 0; iy < M; iy++) { + for (size_t ix = src_x_limit - xbase; ix < MX; ++ix) { + in[MX * iy + ix] = 0.0f; + } + } + } + } + constexpr size_t U = 4; // Unroll factor. 
+ constexpr size_t tail = num_coeffs & ~(U - 1); + constexpr size_t tail_length = num_coeffs - tail; + for (size_t kernel_idx = 0; kernel_idx < num_kernels; kernel_idx += V) { + const float* JXL_RESTRICT kernel_base = kernels + kernel_idx; + decltype(Zero(df)) results[U]; + for (size_t i = 0; i < U; i++) { + results[i] = Set(df, in[i]) * Load(df, kernel_base + i * stride); + } + for (size_t i = U; i < tail; i += U) { + for (size_t j = 0; j < U; ++j) { + results[j] = + MulAdd(Set(df, in[i + j]), + Load(df, kernel_base + (i + j) * stride), results[j]); + } + } + for (size_t i = 0; i < tail_length; ++i) { + results[i] = + MulAdd(Set(df, in[tail + i]), + Load(df, kernel_base + (tail + i) * stride), results[i]); + } + auto result = results[0]; + for (size_t i = 1; i < U; ++i) result += results[i]; + Store(result, df, out + kernel_idx); + } + const size_t oy_max = std::min(dst_rect.ysize(), y + N); + const size_t ox_max = std::min(dst_rect.xsize(), x + NX); + const size_t copy_len = ox_max - x; + const size_t copy_last = RoundUpTo(copy_len, V); + if (JXL_LIKELY(x + copy_last <= dst_rect.xsize())) { + for (size_t dx = 0; dx < copy_len; dx += V) { + auto min = LoadU(df, min_row + x + dx); + auto max = LoadU(df, max_row + x + dx); + float* pixels = out; + for (size_t oy = sy * N; oy < oy_max; ++oy, pixels += NX) { + StoreU(Clamp(LoadU(df, pixels + dx), min, max), df, + dst_rect.Row(dst, oy) + x + dx); + } + } + } else { + for (size_t dx = 0; dx < copy_len; dx++) { + auto min = min_row[x + dx]; + auto max = max_row[x + dx]; + float* pixels = out; + for (size_t oy = sy * N; oy < oy_max; ++oy, pixels += NX) { + dst_rect.Row(dst, oy)[x + dx] = Clamp1(pixels[dx], min, max); + } + } + } + } + } +} + +} // namespace + +void UpsampleRect(size_t upsampling, const float* kernels, const ImageF& src, + const Rect& src_rect, ImageF* dst, const Rect& dst_rect, + ssize_t image_y_offset, size_t image_ysize, float* arena, + size_t x_repeat) { + if (upsampling == 1) return; + if (upsampling 
== 2) { + if (x_repeat == 1) { + Upsample(src, src_rect, dst, dst_rect, kernels, + image_y_offset, image_ysize, arena); + } else if (x_repeat == 2) { + Upsample(src, src_rect, dst, dst_rect, kernels, + image_y_offset, image_ysize, arena); + } else if (x_repeat == 4) { + Upsample(src, src_rect, dst, dst_rect, kernels, + image_y_offset, image_ysize, arena); + } else { + JXL_ABORT("Not implemented"); + } + } else if (upsampling == 4) { + JXL_ASSERT(x_repeat == 1); + Upsample(src, src_rect, dst, dst_rect, kernels, + image_y_offset, image_ysize, arena); + } else if (upsampling == 8) { + JXL_ASSERT(x_repeat == 1); + Upsample(src, src_rect, dst, dst_rect, kernels, + image_y_offset, image_ysize, arena); + } else { + JXL_ABORT("Not implemented"); + } +} + +size_t NumLanes() { + HWY_FULL(float) df; + return Lanes(df); +} + +void Init(size_t upsampling, CacheAlignedUniquePtr* kernel_storage, + const CustomTransformData& data, size_t x_repeat) { + if ((upsampling & (upsampling - 1)) != 0 || + upsampling > Upsampler::max_upsampling()) { + JXL_ABORT("Invalid upsample"); + } + if ((x_repeat & (x_repeat - 1)) != 0 || + x_repeat > Upsampler::max_x_repeat()) { + JXL_ABORT("Invalid x_repeat"); + } + + // No-op upsampling. + if (upsampling == 1) return; + const float* weights = (upsampling == 2) ? data.upsampling2_weights + : (upsampling == 4) ? 
data.upsampling4_weights + : data.upsampling8_weights; + InitKernel(weights, kernel_storage, upsampling, x_repeat); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +namespace { +HWY_EXPORT(NumLanes); +HWY_EXPORT(Init); +HWY_EXPORT(UpsampleRect); +} // namespace + +void Upsampler::Init(size_t upsampling, const CustomTransformData& data) { + upsampling_ = upsampling; + size_t V = HWY_DYNAMIC_DISPATCH(NumLanes)(); + x_repeat_ = 1; + if (upsampling_ == 2) { + // 2 * 2 = 4 kernels; repeat cell, if there is more lanes available + if (V >= 8) x_repeat_ = 2; + if (V >= 16) x_repeat_ = 4; + } + HWY_DYNAMIC_DISPATCH(Init)(upsampling, &kernel_storage_, data, x_repeat_); +} + +size_t Upsampler::GetArenaSize(size_t max_dst_xsize) { + size_t V = HWY_DYNAMIC_DISPATCH(NumLanes)(); + constexpr const size_t M2 = Upsampler::filter_radius(); + constexpr const size_t M = 2 * M2 + 1; + constexpr size_t X = max_x_repeat(); + constexpr const size_t MX = M + X - 1; + constexpr const size_t N = max_upsampling(); + // TODO(eustas): raw_(min|max)_row and (min|max)_row could overlap almost + // completely. 
+ return RoundUpTo(N * N * X, V) + RoundUpTo(M * MX, V) + + 2 * RoundUpTo(DivCeil(max_dst_xsize, 8) * 4 + 2 * M2 + V, V) + + 2 * RoundUpTo(max_dst_xsize + V, V); +} + +void Upsampler::UpsampleRect(const ImageF& src, const Rect& src_rect, + ImageF* dst, const Rect& dst_rect, + ssize_t image_y_offset, size_t image_ysize, + float* arena) const { + JXL_CHECK(arena); + HWY_DYNAMIC_DISPATCH(UpsampleRect) + (upsampling_, reinterpret_cast(kernel_storage_.get()), src, src_rect, + dst, dst_rect, image_y_offset, image_ysize, arena, x_repeat_); +} + +void Upsampler::UpsampleRect(const Image3F& src, const Rect& src_rect, + Image3F* dst, const Rect& dst_rect, + ssize_t image_y_offset, size_t image_ysize, + float* arena) const { + PROFILER_FUNC; + for (size_t c = 0; c < 3; c++) { + UpsampleRect(src.Plane(c), src_rect, &dst->Plane(c), dst_rect, + image_y_offset, image_ysize, arena); + } +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.h new file mode 100644 index 0000000000..036acdfcba --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_upsample.h @@ -0,0 +1,57 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_UPSAMPLE_H_ +#define LIB_JXL_DEC_UPSAMPLE_H_ + +#include "lib/jxl/image.h" +#include "lib/jxl/image_metadata.h" + +namespace jxl { + +struct Upsampler { + void Init(size_t upsampling, const CustomTransformData& data); + + // Only 1x, 2x, 4x and 8x upsampling is supported. + static constexpr size_t max_upsampling() { return 8; } + + // To produce N x N upsampled pixels the [-2..2]x[-2..2] neighborhood of + // input pixel is taken and dot-multiplied with N x N corresponding "kernels". + // Thus the "kernel" is a 5 x 5 matrix of weights. 
+ static constexpr size_t filter_radius() { return 2; } + + // Calculate multiple upsampled cells at the same time. + // Kernels are transposed - several kernels are multiplied by input + // at the same time. In case of 2x upsampling there are only 4 kernels. + // If current target supports SIMD vectors longer than 4 floats, to reduce + // the wasted multiplications we increase the effective kernel count. + static constexpr size_t max_x_repeat() { return 4; } + + // Get the size of "arena" required for UpsampleRect; + // "arena" should be an aligned piece of memory with at least `GetArenaSize()` + // float values accessible. + static size_t GetArenaSize(size_t max_dst_xsize); + + // The caller must guarantee that `src:src_rect` has two pixels of padding + // available on each side of the x dimension. `image_ysize` is the total + // height of the frame that the source area belongs to (not the buffer); + // `image_y_offset` is the difference between `src.y0()` and the corresponding + // y value in the full frame. + void UpsampleRect(const Image3F& src, const Rect& src_rect, Image3F* dst, + const Rect& dst_rect, ssize_t image_y_offset, + size_t image_ysize, float* arena) const; + void UpsampleRect(const ImageF& src, const Rect& src_rect, ImageF* dst, + const Rect& dst_rect, ssize_t image_y_offset, + size_t image_ysize, float* arena) const; + + private: + size_t upsampling_ = 1; + size_t x_repeat_ = 1; + CacheAlignedUniquePtr kernel_storage_ = {nullptr}; +}; + +} // namespace jxl + +#endif // LIB_JXL_DEC_UPSAMPLE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb-inl.h new file mode 100644 index 0000000000..df16ce897a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb-inl.h @@ -0,0 +1,351 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// XYB -> linear sRGB helper function. + +#if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_DEC_XYB_INL_H_ +#undef LIB_JXL_DEC_XYB_INL_H_ +#else +#define LIB_JXL_DEC_XYB_INL_H_ +#endif + +#include + +#include "lib/jxl/dec_xyb.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Broadcast; + +// Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including +// the gamma mixing and simple gamma). Avoids clamping to [0, 1] - out of (sRGB) +// gamut values may be in-gamut after transforming to a wider space. +// "inverse_matrix" points to 9 broadcasted vectors, which are the 3x3 entries +// of the (row-major) opsin absorbance matrix inverse. Pre-multiplying its +// entries by c is equivalent to multiplying linear_* by c afterwards. +template +HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y, + const V opsin_b, + const OpsinParams& opsin_params, + V* const HWY_RESTRICT linear_r, + V* const HWY_RESTRICT linear_g, + V* const HWY_RESTRICT linear_b) { +#if HWY_TARGET == HWY_SCALAR + const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]); + const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]); + const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]); +#else + const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases); + const auto neg_bias_r = Broadcast<0>(neg_bias_rgb); + const auto neg_bias_g = Broadcast<1>(neg_bias_rgb); + const auto neg_bias_b = Broadcast<2>(neg_bias_rgb); +#endif + + // Color space: XYB -> RGB + auto gamma_r = opsin_y + opsin_x; + auto gamma_g = opsin_y - opsin_x; + auto gamma_b = opsin_b; + + gamma_r -= Set(d, opsin_params.opsin_biases_cbrt[0]); + gamma_g -= Set(d, opsin_params.opsin_biases_cbrt[1]); + gamma_b -= Set(d, opsin_params.opsin_biases_cbrt[2]); + + // Undo gamma compression: linear = gamma^3 for efficiency. 
+ const auto gamma_r2 = gamma_r * gamma_r; + const auto gamma_g2 = gamma_g * gamma_g; + const auto gamma_b2 = gamma_b * gamma_b; + const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r); + const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g); + const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b); + + const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix; + + // Unmix (multiply by 3x3 inverse_matrix) + *linear_r = LoadDup128(d, &inverse_matrix[0 * 4]) * mixed_r; + *linear_g = LoadDup128(d, &inverse_matrix[3 * 4]) * mixed_r; + *linear_b = LoadDup128(d, &inverse_matrix[6 * 4]) * mixed_r; + *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r); + *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g); + *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b); + *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r); + *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g); + *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b); +} + +static inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() { +#if HWY_TARGET == HWY_NEON + return true; +#else + return false; +#endif +} + +static inline HWY_MAYBE_UNUSED void FastXYBTosRGB8( + const Image3F& input, const Rect& input_rect, const Rect& output_buf_rect, + const ImageF* alpha, const Rect& alpha_rect, bool is_rgba, + uint8_t* JXL_RESTRICT output_buf, size_t xsize, size_t output_stride) { + // This function is very NEON-specific. As such, it uses intrinsics directly. +#if HWY_TARGET == HWY_NEON + // WARNING: doing fixed point arithmetic correctly is very complicated. + // Changes to this function should be thoroughly tested. + + // Note that the input is assumed to have 13 bits of mantissa, and the output + // will have 14 bits. + auto srgb_tf = [&](int16x8_t v16) { + int16x8_t clz = vclzq_s16(v16); + // Convert to [0.25, 0.5) range. 
+ int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2))); + + // third degree polynomial approximation between 0.25 and 0.5 + // of 1.055/2^(7/2.4) * x^(1/2.4) / 32. + // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29 + // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72 + // overflow our fixed point representation. + + int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16); + + // 0.47 * x + int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706); + // - 0.87 + int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546)); + // * x + int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16); + // + 0.86 + int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302)); + // * 2x + int16x8_t step5 = vqrdmulhq_s16(step4, twov); + // + 0.29 + int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485)); + + int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz); + // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are + // capped to 1. + // Generated with the following Python script: + // a = [] + // b = [] + // + // for i in range(0, 16): + // v = 2**(5/12.*i) + // v /= 16 + // v *= 256 * 128 + // v = int(v) + // a.append(v // 256) + // b.append(v % 256) + // + // print(", ".join("0x%02x" % x for x in a)) + // + // print(", ".join("0x%02x" % x for x in b)) + + HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = { + 0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c, + 0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, + }; + HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = { + 0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68, + 0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }; + // Using the highway implementation here since vqtbl1q is aarch64-only. 
+ using hwy::HWY_NAMESPACE::Vec128; + uint8x16_t pow_low = + TableLookupBytes( + Vec128(vld1q_u8(k2to512powersm1div32_low)), + Vec128(vreinterpretq_u8_s16(exp16))) + .raw; + uint8x16_t pow_high = + TableLookupBytes( + Vec128(vld1q_u8(k2to512powersm1div32_high)), + Vec128(vreinterpretq_u8_s16(exp16))) + .raw; + int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16( + vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8)); + + // approximation of v * 12.92, divided by 2 + // Note that our input is using 13 mantissa bits instead of 15. + int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5); + // 1.055*pow(v, 1/2.4) - 0.055, divided by 2 + auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901)); + // > 0.0031308f (note that v16 has 13 mantissa bits) + return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear); + }; + for (size_t y = 0; y < output_buf_rect.ysize(); y++) { + const float* JXL_RESTRICT row_in_x = input_rect.ConstPlaneRow(input, 0, y); + const float* JXL_RESTRICT row_in_y = input_rect.ConstPlaneRow(input, 1, y); + const float* JXL_RESTRICT row_in_b = input_rect.ConstPlaneRow(input, 2, y); + const float* JXL_RESTRICT row_in_a = + alpha == nullptr ? nullptr : alpha_rect.ConstRow(*alpha, y); + size_t cnt = !is_rgba ? 3 : 4; + size_t base_ptr = + (y + output_buf_rect.y0()) * output_stride + output_buf_rect.x0() * cnt; + for (size_t x = 0; x < output_buf_rect.xsize(); x += 8) { + // Normal ranges for xyb for in-gamut sRGB colors: + // x: -0.015386 0.028100 + // y: 0.000000 0.845308 + // b: 0.000000 0.845308 + + // We actually want x * 8 to have some extra precision. + // TODO(veluca): consider different approaches here, like vld1q_f32_x2. + float32x4_t opsin_x_left = vld1q_f32(row_in_x + x); + int16x4_t opsin_x16_times8_left = + vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18)); + float32x4_t opsin_x_right = + vld1q_f32(row_in_x + x + (x + 4 < output_buf_rect.xsize() ? 
4 : 0)); + int16x4_t opsin_x16_times8_right = + vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18)); + int16x8_t opsin_x16_times8 = + vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right); + + float32x4_t opsin_y_left = vld1q_f32(row_in_y + x); + int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15)); + float32x4_t opsin_y_right = + vld1q_f32(row_in_y + x + (x + 4 < output_buf_rect.xsize() ? 4 : 0)); + int16x4_t opsin_y16_right = + vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15)); + int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right); + + float32x4_t opsin_b_left = vld1q_f32(row_in_b + x); + int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15)); + float32x4_t opsin_b_right = + vld1q_f32(row_in_b + x + (x + 4 < output_buf_rect.xsize() ? 4 : 0)); + int16x4_t opsin_b16_right = + vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15)); + int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right); + + int16x8_t neg_bias16 = vdupq_n_s16(-124); // -0.0037930732552754493 + int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110); // -0.155954201 + int16x8_t neg_bias_half16 = vdupq_n_s16(-62); + + // Color space: XYB -> RGB + // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2, + // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias. + // Note that ignoring x2 in the formulas below (as x << y) results in + // errors of at least 3 in the final sRGB values. + int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16); + int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16); + int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3); + int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16); + int16x8_t threexsq16 = + vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6); + + // We can ignore x^3 here. Note that this is multiplied by 8. 
+ int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16); + + int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16); + int16x8_t mixed_rpg16 = vhaddq_s16( + vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16); + + int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16); + int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16); + int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16); + int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16); + // mixed_rpg and mixed_b are in 0-1 range. + // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note + // that at this point it is already multiplied by 8. + + // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit + // fixed point) to ensure intermediate quantities are in range. Note that + // r-g is not shifted, and was x8 before here; this corresponds to a x32 + // overall multiplicative factor and ensures that all the matrix constants + // are in 0-1 range. + // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two + // vhadd + using neg_bias_half. + mixed_b16 = vshrq_n_s16(mixed_b16, 2); + + // Unmix (multiply by 3x3 inverse_matrix) + // For increased precision, we use a matrix for converting from + // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This + // avoids cancellation effects when computing (y+x)^3-(y-x)^3. + // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c * + // mixed_b pattern is repeated frequently in the code below. This allows + // us to save a multiply per channel, and removes the presence of + // some constants above 1. Moreover, mixed_rpg - mixed_b is in (-1, 1) + // range, so the subtraction is safe. + // All the magic-looking constants here are derived by computing the + // inverse opsin matrix for the transformation modified as described + // above. + + // Precomputation common to multiple color values. 
+ int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16); + int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394); + int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16); + + // R + int16x8_t linear_r16 = + vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400)); + + // G + int16x8_t linear_g16 = + vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857)); + + // B + int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996); + linear_b16 = vqaddq_s16(linear_b16, mixed_b16); + linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525)); + + // Apply SRGB transfer function. + int16x8_t r = srgb_tf(linear_r16); + int16x8_t g = srgb_tf(linear_g16); + int16x8_t b = srgb_tf(linear_b16); + + uint8x8_t r8 = + vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6)); + uint8x8_t g8 = + vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6)); + uint8x8_t b8 = + vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6)); + + size_t n = output_buf_rect.xsize() - x; + if (is_rgba) { + float32x4_t a_f32_left = + row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f); + float32x4_t a_f32_right = + row_in_a ? vld1q_f32(row_in_a + x + + (x + 4 < output_buf_rect.xsize() ? 
4 : 0)) + : vdupq_n_f32(1.0f); + int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8)); + int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8)); + uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right)); + uint8_t* buf = output_buf + base_ptr + 4 * x; + uint8x8x4_t data = {r8, g8, b8, a8}; + if (n >= 8) { + vst4_u8(buf, data); + } else { + uint8_t tmp[8 * 4]; + vst4_u8(tmp, data); + memcpy(buf, tmp, n * 4); + } + } else { + uint8_t* buf = output_buf + base_ptr + 3 * x; + uint8x8x3_t data = {r8, g8, b8}; + if (n >= 8) { + vst3_u8(buf, data); + } else { + uint8_t tmp[8 * 3]; + vst3_u8(tmp, data); + memcpy(buf, tmp, n * 3); + } + } + } + } +#else + (void)input; + (void)input_rect; + (void)output_buf_rect; + (void)output_buf; + (void)xsize; + JXL_ABORT("Unreachable"); +#endif +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_DEC_XYB_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc new file mode 100644 index 0000000000..26e10037e6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.cc @@ -0,0 +1,290 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/dec_xyb.h" + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/dec_xyb.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_group_border.h" +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/sanitizers.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Broadcast; + +// Converts every row of `inout` from XYB (opsin) to linear RGB in place. +// One RunOnPool task is issued per row (task index == y); each task walks the +// row in steps of Lanes(d), loads the three planes, calls XybToRgb and stores +// the three results back into the same rows they were loaded from. +// NOTE(review): the loop bound is the unpadded xsize but whole vectors are +// loaded and stored, so the last step may touch floats past xsize; presumably +// row storage is padded enough for that - confirm against the image allocator. +void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool, + const OpsinParams& opsin_params) { + PROFILER_FUNC; + JXL_CHECK_IMAGE_INITIALIZED(*inout, Rect(*inout)); + + const size_t xsize = inout->xsize(); // not padded + RunOnPool( + pool, 0, inout->ysize(), ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t y = task; + + // Faster than adding via ByteOffset at end of loop. + float* JXL_RESTRICT row0 = inout->PlaneRow(0, y); + float* JXL_RESTRICT row1 = inout->PlaneRow(1, y); + float* JXL_RESTRICT row2 = inout->PlaneRow(2, y); + + const HWY_FULL(float) d; + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto in_opsin_x = Load(d, row0 + x); + const auto in_opsin_y = Load(d, row1 + x); + const auto in_opsin_b = Load(d, row2 + x); + JXL_COMPILER_FENCE; + auto linear_r = Undefined(d); + auto linear_g = Undefined(d); + auto linear_b = Undefined(d); + XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params, + &linear_r, &linear_g, &linear_b); + + Store(linear_r, d, row0 + x); + Store(linear_g, d, row1 + x); + Store(linear_b, d, row2 + x); + } + }, + "OpsinToLinear"); +} + +// Same, but not in-place. 
+void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool, + Image3F* JXL_RESTRICT linear, + const OpsinParams& opsin_params) { + PROFILER_FUNC; + + JXL_ASSERT(SameSize(rect, *linear)); + JXL_CHECK_IMAGE_INITIALIZED(opsin, rect); + + RunOnPool( + pool, 0, static_cast(rect.ysize()), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const size_t y = static_cast(task); + + // Faster than adding via ByteOffset at end of loop. + const float* JXL_RESTRICT row_opsin_0 = rect.ConstPlaneRow(opsin, 0, y); + const float* JXL_RESTRICT row_opsin_1 = rect.ConstPlaneRow(opsin, 1, y); + const float* JXL_RESTRICT row_opsin_2 = rect.ConstPlaneRow(opsin, 2, y); + float* JXL_RESTRICT row_linear_0 = linear->PlaneRow(0, y); + float* JXL_RESTRICT row_linear_1 = linear->PlaneRow(1, y); + float* JXL_RESTRICT row_linear_2 = linear->PlaneRow(2, y); + + const HWY_FULL(float) d; + + for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) { + const auto in_opsin_x = Load(d, row_opsin_0 + x); + const auto in_opsin_y = Load(d, row_opsin_1 + x); + const auto in_opsin_b = Load(d, row_opsin_2 + x); + JXL_COMPILER_FENCE; + auto linear_r = Undefined(d); + auto linear_g = Undefined(d); + auto linear_b = Undefined(d); + XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params, + &linear_r, &linear_g, &linear_b); + + Store(linear_r, d, row_linear_0 + x); + Store(linear_g, d, row_linear_1 + x); + Store(linear_b, d, row_linear_2 + x); + } + }, + "OpsinToLinear(Rect)"); + JXL_CHECK_IMAGE_INITIALIZED(*linear, rect); +} + +// Transform YCbCr to RGB. +// Could be performed in-place (i.e. Y, Cb and Cr could alias R, G and B). +void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect) { + JXL_CHECK_IMAGE_INITIALIZED(ycbcr, rect); + const HWY_CAPPED(float, GroupBorderAssigner::kPaddingXRound) df; + const size_t S = Lanes(df); // Step. 
+ + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + if ((xsize == 0) || (ysize == 0)) return; + + // Full-range BT.601 as defined by JFIF Clause 7: + // https://www.itu.int/rec/T-REC-T.871-201105-I/en + const auto c128 = Set(df, 128.0f / 255); + const auto crcr = Set(df, 1.402f); + const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f); + const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f); + const auto cbcb = Set(df, 1.772f); + + for (size_t y = 0; y < ysize; y++) { + const float* y_row = rect.ConstPlaneRow(ycbcr, 1, y); + const float* cb_row = rect.ConstPlaneRow(ycbcr, 0, y); + const float* cr_row = rect.ConstPlaneRow(ycbcr, 2, y); + float* r_row = rect.PlaneRow(rgb, 0, y); + float* g_row = rect.PlaneRow(rgb, 1, y); + float* b_row = rect.PlaneRow(rgb, 2, y); + for (size_t x = 0; x < xsize; x += S) { + const auto y_vec = Load(df, y_row + x) + c128; + const auto cb_vec = Load(df, cb_row + x); + const auto cr_vec = Load(df, cr_row + x); + const auto r_vec = crcr * cr_vec + y_vec; + const auto g_vec = cgcr * cr_vec + cgcb * cb_vec + y_vec; + const auto b_vec = cbcb * cb_vec + y_vec; + Store(r_vec, df, r_row + x); + Store(g_vec, df, g_row + x); + Store(b_vec, df, b_row + x); + } + } + JXL_CHECK_IMAGE_INITIALIZED(*rgb, rect); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(OpsinToLinearInplace); +void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool, + const OpsinParams& opsin_params) { + return HWY_DYNAMIC_DISPATCH(OpsinToLinearInplace)(inout, pool, opsin_params); +} + +HWY_EXPORT(OpsinToLinear); +void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool, + Image3F* JXL_RESTRICT linear, + const OpsinParams& opsin_params) { + return HWY_DYNAMIC_DISPATCH(OpsinToLinear)(opsin, rect, pool, linear, + opsin_params); +} + +HWY_EXPORT(YcbcrToRgb); +void YcbcrToRgb(const Image3F& 
ycbcr, Image3F* rgb, const Rect& rect) { + return HWY_DYNAMIC_DISPATCH(YcbcrToRgb)(ycbcr, rgb, rect); +} + +HWY_EXPORT(HasFastXYBTosRGB8); +bool HasFastXYBTosRGB8() { return HWY_DYNAMIC_DISPATCH(HasFastXYBTosRGB8)(); } + +HWY_EXPORT(FastXYBTosRGB8); +void FastXYBTosRGB8(const Image3F& input, const Rect& input_rect, + const Rect& output_buf_rect, const ImageF* alpha, + const Rect& alpha_rect, bool is_rgba, + uint8_t* JXL_RESTRICT output_buf, size_t xsize, + size_t output_stride) { + return HWY_DYNAMIC_DISPATCH(FastXYBTosRGB8)( + input, input_rect, output_buf_rect, alpha, alpha_rect, is_rgba, + output_buf, xsize, output_stride); +} + +void OpsinParams::Init(float intensity_target) { + InitSIMDInverseMatrix(GetOpsinAbsorbanceInverseMatrix(), inverse_opsin_matrix, + intensity_target); + memcpy(opsin_biases, kNegOpsinAbsorbanceBiasRGB, + sizeof(kNegOpsinAbsorbanceBiasRGB)); + memcpy(quant_biases, kDefaultQuantBias, sizeof(kDefaultQuantBias)); + for (size_t c = 0; c < 4; c++) { + opsin_biases_cbrt[c] = cbrtf(opsin_biases[c]); + } +} + +Status OutputEncodingInfo::Set(const CodecMetadata& metadata, + const ColorEncoding& default_enc) { + const auto& im = metadata.transform_data.opsin_inverse_matrix; + float inverse_matrix[9]; + memcpy(inverse_matrix, im.inverse_matrix, sizeof(inverse_matrix)); + float intensity_target = metadata.m.IntensityTarget(); + if (metadata.m.xyb_encoded) { + const auto& orig_color_encoding = metadata.m.color_encoding; + color_encoding = default_enc; + // Figure out if we can output to this color encoding. 
+ do { + if (!orig_color_encoding.HaveFields()) break; + // TODO(veluca): keep in sync with dec_reconstruct.cc + if (!orig_color_encoding.tf.IsPQ() && !orig_color_encoding.tf.IsSRGB() && + !orig_color_encoding.tf.IsGamma() && + !orig_color_encoding.tf.IsLinear() && + !orig_color_encoding.tf.IsHLG() && !orig_color_encoding.tf.IsDCI() && + !orig_color_encoding.tf.Is709()) { + break; + } + if (orig_color_encoding.tf.IsGamma()) { + inverse_gamma = orig_color_encoding.tf.GetGamma(); + } + if (orig_color_encoding.tf.IsDCI()) { + inverse_gamma = 1.0f / 2.6f; + } + if (orig_color_encoding.IsGray() && + orig_color_encoding.white_point != WhitePoint::kD65) { + // TODO(veluca): figure out what should happen here. + break; + } + + if ((orig_color_encoding.primaries != Primaries::kSRGB || + orig_color_encoding.white_point != WhitePoint::kD65) && + !orig_color_encoding.IsGray()) { + all_default_opsin = false; + float srgb_to_xyzd50[9]; + const auto& srgb = ColorEncoding::SRGB(/*is_gray=*/false); + JXL_CHECK(PrimariesToXYZD50( + srgb.GetPrimaries().r.x, srgb.GetPrimaries().r.y, + srgb.GetPrimaries().g.x, srgb.GetPrimaries().g.y, + srgb.GetPrimaries().b.x, srgb.GetPrimaries().b.y, + srgb.GetWhitePoint().x, srgb.GetWhitePoint().y, srgb_to_xyzd50)); + float xyzd50_to_original[9]; + JXL_RETURN_IF_ERROR(PrimariesToXYZD50( + orig_color_encoding.GetPrimaries().r.x, + orig_color_encoding.GetPrimaries().r.y, + orig_color_encoding.GetPrimaries().g.x, + orig_color_encoding.GetPrimaries().g.y, + orig_color_encoding.GetPrimaries().b.x, + orig_color_encoding.GetPrimaries().b.y, + orig_color_encoding.GetWhitePoint().x, + orig_color_encoding.GetWhitePoint().y, xyzd50_to_original)); + JXL_RETURN_IF_ERROR(Inv3x3Matrix(xyzd50_to_original)); + float srgb_to_original[9]; + MatMul(xyzd50_to_original, srgb_to_xyzd50, 3, 3, 3, srgb_to_original); + MatMul(srgb_to_original, im.inverse_matrix, 3, 3, 3, inverse_matrix); + } + color_encoding = orig_color_encoding; + color_encoding_is_original = true; + if 
(color_encoding.tf.IsPQ()) { + intensity_target = 10000; + } + } while (false); + } else { + color_encoding = metadata.m.color_encoding; + } + if (std::abs(intensity_target - 255.0) > 0.1f || !im.all_default) { + all_default_opsin = false; + } + InitSIMDInverseMatrix(inverse_matrix, opsin_params.inverse_opsin_matrix, + intensity_target); + std::copy(std::begin(im.opsin_biases), std::end(im.opsin_biases), + opsin_params.opsin_biases); + for (int i = 0; i < 3; ++i) { + opsin_params.opsin_biases_cbrt[i] = cbrtf(opsin_params.opsin_biases[i]); + } + opsin_params.opsin_biases_cbrt[3] = opsin_params.opsin_biases[3] = 1; + std::copy(std::begin(im.quant_biases), std::end(im.quant_biases), + opsin_params.quant_biases); + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.h new file mode 100644 index 0000000000..affdef11c1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/dec_xyb.h @@ -0,0 +1,71 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DEC_XYB_H_ +#define LIB_JXL_DEC_XYB_H_ + +// XYB -> linear sRGB. + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { + +// Parameters for XYB->sRGB conversion. +struct OpsinParams { + float inverse_opsin_matrix[9 * 4]; + float opsin_biases[4]; + float opsin_biases_cbrt[4]; + float quant_biases[4]; + void Init(float intensity_target); +}; + +struct OutputEncodingInfo { + ColorEncoding color_encoding; + // Used for Gamma and DCI transfer functions. 
+ float inverse_gamma; + // Contains an opsin matrix that converts to the primaries of the output + // encoding. + OpsinParams opsin_params; + // default_enc is used for xyb encoded image with ICC profile, in other + // cases it has no effect. Use linear sRGB or grayscale if ICC profile is + // not matched (not parsed or no matching ColorEncoding exists) + Status Set(const CodecMetadata& metadata, const ColorEncoding& default_enc); + bool all_default_opsin = true; + bool color_encoding_is_original = false; +}; + +// Converts `inout` (not padded) from opsin to linear sRGB in-place. Called from +// per-pass postprocessing, hence parallelized. +void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool, + const OpsinParams& opsin_params); + +// Converts `opsin:rect` (opsin may be padded, rect.x0 must be vector-aligned) +// to linear sRGB. Called from whole-frame encoder, hence parallelized. +void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool, + Image3F* JXL_RESTRICT linear, + const OpsinParams& opsin_params); + +// Bt.601 to match JPEG/JFIF. Inputs are _signed_ YCbCr values suitable for DCT, +// see F.1.1.3 of T.81 (because our data type is float, there is no need to add +// a bias to make the values unsigned). +void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect); + +bool HasFastXYBTosRGB8(); +void FastXYBTosRGB8(const Image3F& input, const Rect& input_rect, + const Rect& output_buf_rect, const ImageF* alpha, + const Rect& alpha_rect, bool is_rgba, + uint8_t* JXL_RESTRICT output_buf, size_t xsize, + size_t output_stride); + +} // namespace jxl + +#endif // LIB_JXL_DEC_XYB_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode.cc new file mode 100644 index 0000000000..78c7d8d8e8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode.cc @@ -0,0 +1,2217 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "jxl/decode.h" + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/dec_reconstruct.h" +#include "lib/jxl/decode_to_jpeg.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/icc_codec.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/memory_manager_internal.h" +#include "lib/jxl/toc.h" + +#ifndef JPEGXL_MAJOR_VERSION +#define JPEGXL_MAJOR_VERSION 0 +#define JPEGXL_MINOR_VERSION 5 +#define JPEGXL_PATCH_VERSION 0 +#endif + +namespace { + +// If set (by fuzzer) then some operations will fail, if those would require +// allocating large objects. Actual memory usage might be two orders of +// magnitude bigger. +// TODO(eustas): this is a poor man's replacement for memory-manager approach; +// remove, once memory-manager actually works. +size_t memory_limit_base_ = 0; +size_t cpu_limit_base_ = 0; +size_t used_cpu_base_ = 0; + +// Returns true if an image of xsize x ysize pixels is allowed. With no limit +// configured (memory_limit_base_ == 0) or an empty image this is always true; +// otherwise it is false when xsize * ysize overflows size_t or exceeds +// memory_limit_base_. +bool CheckSizeLimit(size_t xsize, size_t ysize) { + if (!memory_limit_base_) return true; + if (xsize == 0 || ysize == 0) return true; + size_t num_pixels = xsize * ysize; + if (num_pixels / xsize != ysize) return false; // overflow + if (num_pixels > memory_limit_base_) return false; + return true; +} + +// Checks if a + b > size, taking possible integer overflow into account. +bool OutOfBounds(size_t a, size_t b, size_t size) { + size_t pos = a + b; + if (pos > size) return true; + if (pos < a) return true; // overflow happened + return false; +} + +// Checks if a + b + c > size, taking possible integer overflow into account. 
+bool OutOfBounds(size_t a, size_t b, size_t c, size_t size) { + size_t pos = a + b; + if (pos < b) return true; // overflow happened + pos += c; + if (pos < c) return true; // overflow happened + if (pos > size) return true; + return false; +} + +bool SumOverflows(size_t a, size_t b, size_t c) { + size_t sum = a + b; + if (sum < b) return true; + sum += c; + if (sum < c) return true; + return false; +} + +JXL_INLINE size_t InitialBasicInfoSizeHint() { + // Amount of bytes before the start of the codestream in the container format, + // assuming that the codestream is the first box after the signature and + // filetype boxes. 12 bytes signature box + 20 bytes filetype box + 16 bytes + // codestream box length + name + optional XLBox length. + const size_t container_header_size = 48; + + // Worst-case amount of bytes for basic info of the JPEG XL codestream header, + // that is all information up to and including extra_channel_bits. Up to + // around 2 bytes signature + 8 bytes SizeHeader + 31 bytes ColorEncoding + 4 + // bytes rest of ImageMetadata + 5 bytes part of ImageMetadata2. + // TODO(lode): recompute and update this value when alpha_bits is moved to + // extra channels info. + const size_t max_codestream_basic_info_size = 50; + + return container_header_size + max_codestream_basic_info_size; +} + +// Debug-printing failure macro similar to JXL_FAILURE, but for the status code +// JXL_DEC_ERROR +#ifdef JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(format, ...) \ + (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort(), JXL_DEC_ERROR) +#else // JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(format, ...) \ + (((JXL_DEBUG_ON_ERROR) && \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \ + JXL_DEC_ERROR) +#endif // JXL_CRASH_ON_ERROR + +JxlDecoderStatus ConvertStatus(JxlDecoderStatus status) { return status; } + +JxlDecoderStatus ConvertStatus(jxl::Status status) { + return status ? 
JXL_DEC_SUCCESS : JXL_DEC_ERROR; +} + +JxlSignature ReadSignature(const uint8_t* buf, size_t len, size_t* pos) { + if (*pos >= len) return JXL_SIG_NOT_ENOUGH_BYTES; + + buf += *pos; + len -= *pos; + + // JPEG XL codestream: 0xff 0x0a + if (len >= 1 && buf[0] == 0xff) { + if (len < 2) { + return JXL_SIG_NOT_ENOUGH_BYTES; + } else if (buf[1] == jxl::kCodestreamMarker) { + *pos += 2; + return JXL_SIG_CODESTREAM; + } else { + return JXL_SIG_INVALID; + } + } + + // JPEG XL container + if (len >= 1 && buf[0] == 0) { + if (len < 12) { + return JXL_SIG_NOT_ENOUGH_BYTES; + } else if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0xC && buf[4] == 'J' && + buf[5] == 'X' && buf[6] == 'L' && buf[7] == ' ' && + buf[8] == 0xD && buf[9] == 0xA && buf[10] == 0x87 && + buf[11] == 0xA) { + *pos += 12; + return JXL_SIG_CONTAINER; + } else { + return JXL_SIG_INVALID; + } + } + + return JXL_SIG_INVALID; +} + +} // namespace + +uint32_t JxlDecoderVersion(void) { + return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 + + JPEGXL_PATCH_VERSION; +} + +JxlSignature JxlSignatureCheck(const uint8_t* buf, size_t len) { + size_t pos = 0; + return ReadSignature(buf, len, &pos); +} + +namespace { + +size_t BitsPerChannel(JxlDataType data_type) { + switch (data_type) { + case JXL_TYPE_BOOLEAN: + return 1; + case JXL_TYPE_UINT8: + return 8; + case JXL_TYPE_UINT16: + return 16; + case JXL_TYPE_UINT32: + return 32; + case JXL_TYPE_FLOAT: + return 32; + case JXL_TYPE_FLOAT16: + return 16; + // No default, give compiler error if new type not handled. + } + return 0; // Indicate invalid data type. +} + +enum class DecoderStage : uint32_t { + kInited, // Decoder created, no JxlDecoderProcessInput called yet + kStarted, // Running JxlDecoderProcessInput calls + kFinished, // Everything done, nothing left to process + kError, // Error occurred, decoder object no longer usable +}; + +enum class FrameStage : uint32_t { + kHeader, // Must parse frame header. 
dec->frame_start must be set up + // correctly already. + kTOC, // Must parse TOC + kFull, // Must parse full pixels + kFullOutput, // Must output full pixels +}; + +// Manages the sections for the FrameDecoder based on input bytes received. +struct Sections { + // sections_begin = position in the frame where the sections begin, after + // the frame header and TOC, so sections_begin = sum of frame header size and + // TOC size. + Sections(jxl::FrameDecoder* frame_dec, size_t frame_size, + size_t sections_begin) + : frame_dec_(frame_dec), + frame_size_(frame_size), + sections_begin_(sections_begin) {} + + Sections(const Sections&) = delete; + Sections& operator=(const Sections&) = delete; + Sections(Sections&&) = delete; + Sections& operator=(Sections&&) = delete; + + ~Sections() { + // Avoid memory leaks if the JXL decoder quits early and doesn't end up + // calling CloseInput(). + CloseInput(); + } + + // frame_dec_ must have been Inited already, but not yet done ProcessSections. + JxlDecoderStatus Init() { + section_received.resize(frame_dec_->NumSections(), 0); + + const auto& offsets = frame_dec_->SectionOffsets(); + const auto& sizes = frame_dec_->SectionSizes(); + + // Ensure none of the sums of section offset and size overflow. + for (size_t i = 0; i < frame_dec_->NumSections(); i++) { + if (OutOfBounds(sections_begin_, offsets[i], sizes[i], frame_size_)) { + return JXL_API_ERROR("section out of bounds"); + } + } + + return JXL_DEC_SUCCESS; + } + + // Sets the input data for the frame. The frame pointer must point to the + // beginning of the frame, size is the amount of bytes gotten so far and + // should increase with next calls until the full frame is loaded. + // TODO(lode): allow caller to provide only later chunks of memory when + // earlier sections are fully processed already. 
+ void SetInput(const uint8_t* frame, size_t size) { + const auto& offsets = frame_dec_->SectionOffsets(); + const auto& sizes = frame_dec_->SectionSizes(); + + for (size_t i = 0; i < frame_dec_->NumSections(); i++) { + if (section_received[i]) continue; + if (!OutOfBounds(sections_begin_, offsets[i], sizes[i], size)) { + section_received[i] = 1; + section_info.emplace_back(jxl::FrameDecoder::SectionInfo{nullptr, i}); + section_status.emplace_back(); + } + } + // Reset all the bitreaders, because the address of the frame pointer may + // change, even if it always represents the same frame start. + for (size_t i = 0; i < section_info.size(); i++) { + size_t id = section_info[i].id; + JXL_ASSERT(section_info[i].br == nullptr); + section_info[i].br = new jxl::BitReader(jxl::Span( + frame + sections_begin_ + offsets[id], sizes[id])); + } + } + + JxlDecoderStatus CloseInput() { + bool out_of_bounds = false; + for (size_t i = 0; i < section_info.size(); i++) { + if (!section_info[i].br) continue; + if (!section_info[i].br->AllReadsWithinBounds()) { + // Mark out of bounds section, but keep closing and deleting the next + // ones as well. + out_of_bounds = true; + } + JXL_ASSERT(section_info[i].br->Close()); + delete section_info[i].br; + section_info[i].br = nullptr; + } + if (out_of_bounds) { + // If any bit reader indicates out of bounds, it's an error, not just + // needing more input, since we ensure only bit readers containing + // a complete section are provided to the FrameDecoder. + return JXL_API_ERROR("frame out of bounds"); + } + return JXL_DEC_SUCCESS; + } + + // Not managed by us. + jxl::FrameDecoder* frame_dec_; + + size_t frame_size_; + size_t sections_begin_; + + std::vector section_info; + std::vector section_status; + std::vector section_received; +}; + +/* +Given list of frame references to storage slots, and storage slots in which this +frame is saved, computes which frames are required to decode the frame at the +given index and any frames after it. 
The frames on which this depends are +returned as a vector of their indices, in no particular order. The given index +must be smaller than saved_as.size(), and references.size() must equal +saved_as.size(). Any frames beyond saved_as and references are considered +unknown future frames and must be treated as if something depends on them. +*/ +std::vector GetFrameDependencies(size_t index, + const std::vector& saved_as, + const std::vector& references) { + JXL_ASSERT(references.size() == saved_as.size()); + JXL_ASSERT(index < references.size()); + + std::vector result; + + constexpr size_t kNumStorage = 8; + + // value which indicates nothing is stored in this storage slot + const size_t invalid = references.size(); + // for each of the 8 storage slots, a vector that translates frame index to + // frame stored in this storage slot at this point, that is, the last + // frame that was stored in this slot before or at this index. + std::array, kNumStorage> storage; + for (size_t s = 0; s < kNumStorage; ++s) { + storage[s].resize(saved_as.size()); + int mask = 1 << s; + size_t id = invalid; + for (size_t i = 0; i < saved_as.size(); ++i) { + if (saved_as[i] & mask) { + id = i; + } + storage[s][i] = id; + } + } + + std::vector seen(index + 1, 0); + std::vector stack; + stack.push_back(index); + seen[index] = 1; + + // For frames after index, assume they can depend on any of the 8 storage + // slots, so push the frame for each stored reference to the stack and result. + // All frames after index are treated as having unknown references and with + // the possibility that there are more frames after the last known. + // TODO(lode): take values of saved_as and references after index, and a + // input flag indicating if they are all frames of the image, to further + // optimize this. 
+ for (size_t s = 0; s < kNumStorage; ++s) { + size_t frame_ref = storage[s][index]; + if (frame_ref == invalid) continue; + if (seen[frame_ref]) continue; + stack.push_back(frame_ref); + seen[frame_ref] = 1; + result.push_back(frame_ref); + } + + while (!stack.empty()) { + size_t frame_index = stack.back(); + stack.pop_back(); + if (frame_index == 0) continue; // first frame cannot have references + for (size_t s = 0; s < kNumStorage; ++s) { + int mask = 1 << s; + if (!(references[frame_index] & mask)) continue; + size_t frame_ref = storage[s][frame_index - 1]; + if (frame_ref == invalid) continue; + if (seen[frame_ref]) continue; + stack.push_back(frame_ref); + seen[frame_ref] = 1; + result.push_back(frame_ref); + } + } + + return result; +} + +} // namespace + +// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) +struct JxlDecoderStruct { + JxlDecoderStruct() = default; + + JxlMemoryManager memory_manager; + std::unique_ptr thread_pool; + + DecoderStage stage; + + // Status of progression, internal. + bool got_signature; + bool first_codestream_seen; + // Indicates we know that we've seen the last codestream, however this is not + // guaranteed to be true for the last box because a jxl file may have multiple + // "jxlp" boxes and it is possible (and permitted) that the last one is not a + // final box that uses size 0 to indicate the end. + bool last_codestream_seen; + bool got_basic_info; + size_t header_except_icc_bits = 0; // To skip everything before ICC. + bool got_all_headers; // Codestream metadata headers. + bool post_headers; // Already decoding pixels. + jxl::ICCReader icc_reader; + + // This means either we actually got the preview image, or determined we + // cannot get it or there is none. 
+ bool got_preview_image; + + // Position of next_in in the original file including box format if present + // (as opposed to position in the codestream) + size_t file_pos; + size_t box_begin; + size_t box_end; + bool skip_box; + // Begin and end of the content of the current codestream box. This could be + // a partial codestream box. + // codestream_begin 0 is used to indicate the begin is not yet known. + // codestream_end 0 is used to indicate uncapped (until end of file, for the + // last box if this box doesn't indicate its actual size). + // Not used if the file is a direct codestream. + size_t codestream_begin; + size_t codestream_end; + + // Settings + bool keep_orientation; + + // Bitfield, for which informative events (JXL_DEC_BASIC_INFO, etc...) the + // decoder returns a status. By default, do not return for any of the events, + // only return when the decoder cannot continue because it needs more input or + // output data. + int events_wanted; + int orig_events_wanted; + + // Fields for reading the basic info from the header. + size_t basic_info_size_hint; + bool have_container; + + // Whether the preview out buffer was set. It is possible for the buffer to + // be nullptr and buffer_set to be true, indicating it was deliberately + // set to nullptr. + bool preview_out_buffer_set; + // Idem for the image buffer. + bool image_out_buffer_set; + + // Owned by the caller, buffers for DC image and full resolution images + void* preview_out_buffer; + void* image_out_buffer; + JxlImageOutCallback image_out_callback; + void* image_out_opaque; + + size_t preview_out_size; + size_t image_out_size; + + // TODO(lode): merge these? + JxlPixelFormat preview_out_format; + JxlPixelFormat image_out_format; + + jxl::CodecMetadata metadata; + std::unique_ptr ib; + // ColorEncoding to use for xyb encoded image with ICC profile. 
+ jxl::ColorEncoding default_enc; + + std::unique_ptr passes_state; + std::unique_ptr frame_dec; + std::unique_ptr sections; + // The FrameDecoder is initialized, and not yet finalized + bool frame_dec_in_progress; + + // headers and TOC for the current frame. When got_toc is true, this is + // always the frame header of the last frame of the current still series, + // that is, the displayed frame. + std::unique_ptr frame_header; + + // Start of the current frame being processed, as offset from the beginning of + // the codestream. + size_t frame_start; + size_t frame_size; + FrameStage frame_stage; + // The currently processed frame is the last of the current composite still, + // and so must be returned as pixels + bool is_last_of_still; + // The currently processed frame is the last of the codestream + bool is_last_total; + // How many frames to skip. + size_t skip_frames; + // Skipping the current frame. May be false if skip_frames was just set to + // a positive value while already processing a current frame, then + // skipping_frame will be enabled only for the next frame. + bool skipping_frame; + + // Amount of internal frames and external frames started. External frames are + // user-visible frames, internal frames includes all external frames and + // also invisible frames such as patches, blending-only and dc_level frames. + size_t internal_frames; + size_t external_frames; + + // For each internal frame, which storage locations it references, and which + // storage locations it is stored in, using the bit mask as defined in + // FrameDecoder::References and FrameDecoder::SaveAs. + std::vector frame_references; + std::vector frame_saved_as; + + // Translates external frame index to internal frame index. The external + // index is the index of user-visible frames. The internal index can be larger + // since non-visible frames (such as frames with patches, ...) are included. 
+ std::vector frame_external_to_internal; + + // Whether the frame with internal index is required to decode the frame + // being skipped to or any frames after that. If no skipping is active, + // this vector is ignored. If the current internal frame index is beyond this + // vector, it must be treated as a required frame. + std::vector frame_required; + + // Codestream input data is stored here, when the decoder takes in and stores + // the user input bytes. If the decoder does not do that (e.g. in one-shot + // case), this field is unused. + // TODO(lode): avoid needing this field once the C++ decoder doesn't need + // all bytes at once, to save memory. Find alternative to std::vector doubling + // strategy to prevent some memory usage. + std::vector codestream; + + jxl::JxlToJpegDecoder jpeg_decoder; + + // Position in the actual codestream, which codestream.begin() points to. + // Non-zero once earlier parts of the codestream vector have been erased. + size_t codestream_pos; + + // Statistics which CodecInOut can keep + uint64_t dec_pixels; + + const uint8_t* next_in; + size_t avail_in; +}; + +// TODO(zond): Make this depend on the data loaded into the decoder. 
+JxlDecoderStatus JxlDecoderDefaultPixelFormat(const JxlDecoder* dec, + JxlPixelFormat* format) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + *format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + return JXL_DEC_SUCCESS; +} + +void JxlDecoderReset(JxlDecoder* dec) { + dec->thread_pool.reset(); + dec->stage = DecoderStage::kInited; + dec->got_signature = false; + dec->first_codestream_seen = false; + dec->last_codestream_seen = false; + dec->got_basic_info = false; + dec->header_except_icc_bits = 0; + dec->got_all_headers = false; + dec->post_headers = false; + dec->icc_reader.Reset(); + dec->got_preview_image = false; + dec->file_pos = 0; + dec->box_begin = 0; + dec->box_end = 0; + dec->skip_box = false; + dec->codestream_pos = 0; + dec->codestream_begin = 0; + dec->codestream_end = 0; + dec->keep_orientation = false; + dec->events_wanted = 0; + dec->orig_events_wanted = 0; + dec->basic_info_size_hint = InitialBasicInfoSizeHint(); + dec->have_container = 0; + dec->preview_out_buffer_set = false; + dec->image_out_buffer_set = false; + dec->preview_out_buffer = nullptr; + dec->image_out_buffer = nullptr; + dec->image_out_callback = nullptr; + dec->image_out_opaque = nullptr; + dec->preview_out_size = 0; + dec->image_out_size = 0; + dec->dec_pixels = 0; + dec->next_in = 0; + dec->avail_in = 0; + + dec->passes_state.reset(nullptr); + dec->frame_dec.reset(nullptr); + dec->sections.reset(nullptr); + dec->frame_dec_in_progress = false; + + dec->ib.reset(); + dec->metadata = jxl::CodecMetadata(); + dec->frame_header.reset(new jxl::FrameHeader(&dec->metadata)); + dec->codestream.clear(); + + dec->frame_stage = FrameStage::kHeader; + dec->frame_start = 0; + dec->frame_size = 0; + dec->is_last_of_still = false; + dec->is_last_total = false; + dec->skip_frames = 0; + dec->skipping_frame = false; + dec->internal_frames = 0; + dec->external_frames = 0; + dec->frame_references.clear(); + dec->frame_saved_as.clear(); + dec->frame_external_to_internal.clear(); + 
dec->frame_required.clear(); +} + +JxlDecoder* JxlDecoderCreate(const JxlMemoryManager* memory_manager) { + JxlMemoryManager local_memory_manager; + if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) + return nullptr; + + void* alloc = + jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlDecoder)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + JxlDecoder* dec = new (alloc) JxlDecoder(); + dec->memory_manager = local_memory_manager; + + JxlDecoderReset(dec); + + return dec; +} + +void JxlDecoderDestroy(JxlDecoder* dec) { + if (dec) { + // Call destructor directly since custom free function is used. + dec->~JxlDecoder(); + jxl::MemoryManagerFree(&dec->memory_manager, dec); + } +} + +void JxlDecoderRewind(JxlDecoder* dec) { + int keep_orientation = dec->keep_orientation; + int events_wanted = dec->orig_events_wanted; + std::vector frame_references; + std::vector frame_saved_as; + std::vector frame_external_to_internal; + std::vector frame_required; + frame_references.swap(dec->frame_references); + frame_saved_as.swap(dec->frame_saved_as); + frame_external_to_internal.swap(dec->frame_external_to_internal); + frame_required.swap(dec->frame_required); + + JxlDecoderReset(dec); + dec->keep_orientation = keep_orientation; + dec->events_wanted = events_wanted; + dec->orig_events_wanted = events_wanted; + frame_references.swap(dec->frame_references); + frame_saved_as.swap(dec->frame_saved_as); + frame_external_to_internal.swap(dec->frame_external_to_internal); + frame_required.swap(dec->frame_required); +} + +void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount) { + // Increment amount, rather than set it: making the amount smaller is + // impossible because the decoder may already have skipped frames required to + // decode earlier frames, and making the amount larger compared to an existing + // amount is impossible because if JxlDecoderSkipFrames is called in the + // middle of already skipping frames, the 
user cannot know how many frames + // have already been skipped internally so far so an absolute value cannot + // be defined. + dec->skip_frames += amount; + + dec->frame_required.clear(); + size_t next_frame = dec->external_frames + dec->skip_frames; + + // A frame that has been seen before a rewind + if (next_frame < dec->frame_external_to_internal.size()) { + size_t internal_index = dec->frame_external_to_internal[next_frame]; + if (internal_index < dec->frame_saved_as.size()) { + std::vector deps = GetFrameDependencies( + internal_index, dec->frame_saved_as, dec->frame_references); + + dec->frame_required.resize(internal_index + 1, 0); + for (size_t i = 0; i < deps.size(); i++) { + JXL_ASSERT(deps[i] < dec->frame_required.size()); + dec->frame_required[deps[i]] = 1; + } + } + } +} + +JXL_EXPORT JxlDecoderStatus +JxlDecoderSetParallelRunner(JxlDecoder* dec, JxlParallelRunner parallel_runner, + void* parallel_runner_opaque) { + if (dec->thread_pool) return JXL_API_ERROR("parallel runner already set"); + dec->thread_pool.reset( + new jxl::ThreadPool(parallel_runner, parallel_runner_opaque)); + return JXL_DEC_SUCCESS; +} + +size_t JxlDecoderSizeHintBasicInfo(const JxlDecoder* dec) { + if (dec->got_basic_info) return 0; + return dec->basic_info_size_hint; +} + +JxlDecoderStatus JxlDecoderSubscribeEvents(JxlDecoder* dec, int events_wanted) { + if (dec->stage != DecoderStage::kInited) { + return JXL_DEC_ERROR; // Cannot subscribe to events after having started. + } + if (events_wanted & 63) { + return JXL_DEC_ERROR; // Can only subscribe to informative events. 
+ } + dec->events_wanted = events_wanted; + dec->orig_events_wanted = events_wanted; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetKeepOrientation(JxlDecoder* dec, + JXL_BOOL keep_orientation) { + if (dec->stage != DecoderStage::kInited) { + return JXL_API_ERROR("Must set keep_orientation option before starting"); + } + dec->keep_orientation = !!keep_orientation; + return JXL_DEC_SUCCESS; +} + +namespace jxl { +namespace { + +template +bool CanRead(Span data, BitReader* reader, T* JXL_RESTRICT t) { + // Use a copy of the bit reader because CanRead advances bits. + BitReader reader2(data); + reader2.SkipBits(reader->TotalBitsConsumed()); + bool result = Bundle::CanRead(&reader2, t); + JXL_ASSERT(reader2.Close()); + return result; +} + +// Returns JXL_DEC_SUCCESS if the full bundle was successfully read, status +// indicating either error or need more input otherwise. +template +JxlDecoderStatus ReadBundle(Span data, BitReader* reader, + T* JXL_RESTRICT t) { + if (!CanRead(data, reader, t)) { + return JXL_DEC_NEED_MORE_INPUT; + } + if (!Bundle::Read(reader, t)) { + return JXL_DEC_ERROR; + } + return JXL_DEC_SUCCESS; +} + +#define JXL_API_RETURN_IF_ERROR(expr) \ + { \ + JxlDecoderStatus status_ = ConvertStatus(expr); \ + if (status_ != JXL_DEC_SUCCESS) return status_; \ + } + +std::unique_ptr> GetBitReader( + Span span) { + BitReader* reader = new BitReader(span); + return std::unique_ptr>( + reader, [](BitReader* reader) { + // We can't allow Close to abort the program if the reader is out of + // bounds, or all return paths in the code, even those that already + // return failure, would have to manually call AllReadsWithinBounds(). + // Invalid JXL codestream should not cause program to quit. 
+ (void)reader->AllReadsWithinBounds(); + (void)reader->Close(); + delete reader; + }); +} + +JxlDecoderStatus JxlDecoderReadBasicInfo(JxlDecoder* dec, const uint8_t* in, + size_t size) { + size_t pos = 0; + + // Check and skip the codestream signature + JxlSignature signature = ReadSignature(in, size, &pos); + if (signature == JXL_SIG_NOT_ENOUGH_BYTES) { + return JXL_DEC_NEED_MORE_INPUT; + } + if (signature == JXL_SIG_CONTAINER) { + // There is a container signature where we expect a codestream, container + // is handled at a higher level already. + return JXL_API_ERROR("invalid: nested container"); + } + if (signature != JXL_SIG_CODESTREAM) { + return JXL_API_ERROR("invalid signature"); + } + + Span span(in + pos, size - pos); + auto reader = GetBitReader(span); + JXL_API_RETURN_IF_ERROR(ReadBundle(span, reader.get(), &dec->metadata.size)); + + dec->metadata.m.nonserialized_only_parse_basic_info = true; + JXL_API_RETURN_IF_ERROR(ReadBundle(span, reader.get(), &dec->metadata.m)); + dec->metadata.m.nonserialized_only_parse_basic_info = false; + dec->got_basic_info = true; + dec->basic_info_size_hint = 0; + + if (!CheckSizeLimit(dec->metadata.size.xsize(), dec->metadata.size.ysize())) { + return JXL_API_ERROR("image is too large"); + } + + return JXL_DEC_SUCCESS; +} + +// Reads all codestream headers (but not frame headers) +JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec, const uint8_t* in, + size_t size) { + size_t pos = 0; + + // Check and skip the codestream signature + JxlSignature signature = ReadSignature(in, size, &pos); + if (signature == JXL_SIG_CONTAINER) { + return JXL_API_ERROR("invalid: nested container"); + } + if (signature != JXL_SIG_CODESTREAM) { + return JXL_API_ERROR("invalid signature"); + } + + Span span(in + pos, size - pos); + auto reader = GetBitReader(span); + + if (dec->header_except_icc_bits != 0) { + // Headers were decoded already. 
+ reader->SkipBits(dec->header_except_icc_bits); + } else { + SizeHeader dummy_size_header; + JXL_API_RETURN_IF_ERROR(ReadBundle(span, reader.get(), &dummy_size_header)); + + // We already decoded the metadata to dec->metadata.m, no reason to + // overwrite it, use a dummy metadata instead. + ImageMetadata dummy_metadata; + JXL_API_RETURN_IF_ERROR(ReadBundle(span, reader.get(), &dummy_metadata)); + + JXL_API_RETURN_IF_ERROR( + ReadBundle(span, reader.get(), &dec->metadata.transform_data)); + } + + dec->header_except_icc_bits = reader->TotalBitsConsumed(); + + if (dec->metadata.m.color_encoding.WantICC()) { + jxl::Status status = dec->icc_reader.Init(reader.get(), memory_limit_base_); + // Always check AllReadsWithinBounds, not all the C++ decoder implementation + // handles reader out of bounds correctly yet (e.g. context map). Not + // checking AllReadsWithinBounds can cause reader->Close() to trigger an + // assert, but we don't want library to quit program for invalid codestream. + if (!reader->AllReadsWithinBounds()) { + return JXL_DEC_NEED_MORE_INPUT; + } + if (!status) { + if (status.code() == StatusCode::kNotEnoughBytes) { + return JXL_DEC_NEED_MORE_INPUT; + } + // Other non-successful status is an error + return JXL_DEC_ERROR; + } + PaddedBytes icc; + status = dec->icc_reader.Process(reader.get(), &icc); + if (!status) { + if (status.code() == StatusCode::kNotEnoughBytes) { + return JXL_DEC_NEED_MORE_INPUT; + } + // Other non-successful status is an error + return JXL_DEC_ERROR; + } + if (!dec->metadata.m.color_encoding.SetICCRaw(std::move(icc))) { + return JXL_DEC_ERROR; + } + } + + dec->got_all_headers = true; + JXL_API_RETURN_IF_ERROR(reader->JumpToByteBoundary()); + + dec->frame_start = pos + reader->TotalBitsConsumed() / jxl::kBitsPerByte; + + if (!dec->passes_state) { + dec->passes_state.reset(new jxl::PassesDecoderState()); + } + + dec->default_enc = + ColorEncoding::LinearSRGB(dec->metadata.m.color_encoding.IsGray()); + + 
JXL_API_RETURN_IF_ERROR(dec->passes_state->output_encoding_info.Set( + dec->metadata, dec->default_enc)); + + return JXL_DEC_SUCCESS; +} + +static size_t GetStride(const JxlDecoder* dec, const JxlPixelFormat& format, + const jxl::ImageBundle* frame = nullptr) { + size_t xsize = dec->metadata.xsize(); + if (!dec->keep_orientation && dec->metadata.m.orientation > 4) { + xsize = dec->metadata.ysize(); + } + if (frame) { + xsize = dec->keep_orientation ? frame->xsize() : frame->oriented_xsize(); + } + size_t stride = xsize * (BitsPerChannel(format.data_type) * + format.num_channels / jxl::kBitsPerByte); + if (format.align > 1) { + stride = jxl::DivCeil(stride, format.align) * format.align; + } + return stride; +} + +static JxlDecoderStatus ConvertImageInternal(const JxlDecoder* dec, + const jxl::ImageBundle& frame, + const JxlPixelFormat& format, + void* out_image, size_t out_size, + JxlImageOutCallback out_callback, + void* out_opaque) { + // TODO(lode): handle mismatch of RGB/grayscale color profiles and pixel data + // color/grayscale format + const auto& metadata = dec->metadata.m; + + const size_t stride = GetStride(dec, format, &frame); + + bool float_format = format.data_type == JXL_TYPE_FLOAT || + format.data_type == JXL_TYPE_FLOAT16; + + jxl::Orientation undo_orientation = dec->keep_orientation + ? jxl::Orientation::kIdentity + : metadata.GetOrientation(); + JXL_DASSERT(!dec->frame_dec || !dec->frame_dec->HasRGBBuffer()); + jxl::Status status = jxl::ConvertToExternal( + frame, BitsPerChannel(format.data_type), float_format, + format.num_channels, format.endianness, stride, dec->thread_pool.get(), + out_image, out_size, /*out_callback=*/out_callback, + /*out_opaque=*/out_opaque, undo_orientation); + + return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR; +} + +// Parses the FrameHeader and the total frame_size, given the initial bytes +// of the frame up to and including the TOC. 
+// TODO(lode): merge this with FrameDecoder +JxlDecoderStatus ParseFrameHeader(jxl::FrameHeader* frame_header, + const uint8_t* in, size_t size, size_t pos, + bool is_preview, size_t* frame_size, + int* saved_as) { + if (pos >= size) { + return JXL_DEC_NEED_MORE_INPUT; + } + Span span(in + pos, size - pos); + auto reader = GetBitReader(span); + + frame_header->nonserialized_is_preview = is_preview; + jxl::Status status = DecodeFrameHeader(reader.get(), frame_header); + jxl::FrameDimensions frame_dim = frame_header->ToFrameDimensions(); + if (!CheckSizeLimit(frame_dim.xsize_upsampled_padded, + frame_dim.ysize_upsampled_padded)) { + return JXL_API_ERROR("frame is too large"); + } + + if (status.code() == StatusCode::kNotEnoughBytes) { + // TODO(lode): prevent asking for way too much input bytes in case of + // invalid header that the decoder thinks is a very long user extension + // instead. Example: fields can currently print something like this: + // "../lib/jxl/fields.cc:416: Skipping 71467322-bit extension(s)" + // Maybe fields.cc should return error in the above case rather than + // print a message. + return JXL_DEC_NEED_MORE_INPUT; + } else if (!status) { + return JXL_API_ERROR("invalid frame header"); + } + + // Read TOC. + uint64_t groups_total_size; + const bool has_ac_global = true; + const size_t toc_entries = + NumTocEntries(frame_dim.num_groups, frame_dim.num_dc_groups, + frame_header->passes.num_passes, has_ac_global); + + std::vector group_offsets; + std::vector group_sizes; + status = ReadGroupOffsets(toc_entries, reader.get(), &group_offsets, + &group_sizes, &groups_total_size); + + // TODO(lode): we're actually relying on AllReadsWithinBounds() here + // instead of on status.code(), change the internal TOC C++ code to + // correctly set the status.code() instead so we can rely on that one. 
+ if (!reader->AllReadsWithinBounds() || + status.code() == StatusCode::kNotEnoughBytes) { + return JXL_DEC_NEED_MORE_INPUT; + } else if (!status) { + return JXL_API_ERROR("invalid toc entries"); + } + + JXL_DASSERT((reader->TotalBitsConsumed() % kBitsPerByte) == 0); + JXL_API_RETURN_IF_ERROR(reader->JumpToByteBoundary()); + size_t header_size = (reader->TotalBitsConsumed() >> 3); + *frame_size = header_size + groups_total_size; + + if (saved_as != nullptr) { + *saved_as = FrameDecoder::SavedAs(*frame_header); + } + + return JXL_DEC_SUCCESS; +} + +// TODO(eustas): no CodecInOut -> no image size reinforcement -> possible OOM. +JxlDecoderStatus JxlDecoderProcessInternal(JxlDecoder* dec, const uint8_t* in, + size_t size) { + // If no parallel runner is set, use the default + // TODO(lode): move this initialization to an appropriate location once the + // runner is used to decode pixels. + if (!dec->thread_pool) { + dec->thread_pool.reset(new jxl::ThreadPool(nullptr, nullptr)); + } + + // No matter what events are wanted, the basic info is always required. + if (!dec->got_basic_info) { + JxlDecoderStatus status = JxlDecoderReadBasicInfo(dec, in, size); + if (status != JXL_DEC_SUCCESS) return status; + } + + if (dec->events_wanted & JXL_DEC_BASIC_INFO) { + dec->events_wanted &= ~JXL_DEC_BASIC_INFO; + return JXL_DEC_BASIC_INFO; + } + + if (!dec->got_all_headers) { + JxlDecoderStatus status = JxlDecoderReadAllHeaders(dec, in, size); + if (status != JXL_DEC_SUCCESS) return status; + } + + if (dec->events_wanted & JXL_DEC_EXTENSIONS) { + dec->events_wanted &= ~JXL_DEC_EXTENSIONS; + if (dec->metadata.m.extensions != 0) { + return JXL_DEC_EXTENSIONS; + } + } + + if (dec->events_wanted & JXL_DEC_COLOR_ENCODING) { + dec->events_wanted &= ~JXL_DEC_COLOR_ENCODING; + return JXL_DEC_COLOR_ENCODING; + } + + dec->post_headers = true; + + // Decode to pixels, only if required for the events the user wants. 
+ if (!dec->got_preview_image) { + // Parse the preview, or at least its TOC to be able to skip the frame, if + // any frame or image decoding is desired. + bool parse_preview = + (dec->events_wanted & + (JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + if (!dec->metadata.m.have_preview) { + // There is no preview, mark this as done and go to next step + dec->got_preview_image = true; + } else if (!parse_preview) { + // No preview parsing needed, mark this step as done + dec->got_preview_image = true; + } else { + // Want to decode the preview, not just skip the frame + bool want_preview = (dec->events_wanted & JXL_DEC_PREVIEW_IMAGE); + size_t frame_size; + size_t pos = dec->frame_start; + dec->frame_header.reset(new FrameHeader(&dec->metadata)); + JxlDecoderStatus status = ParseFrameHeader(dec->frame_header.get(), in, + size, pos, true, &frame_size, + /*saved_as=*/nullptr); + if (status != JXL_DEC_SUCCESS) return status; + if (OutOfBounds(pos, frame_size, size)) { + return JXL_DEC_NEED_MORE_INPUT; + } + + if (want_preview && !dec->preview_out_buffer_set) { + return JXL_DEC_NEED_PREVIEW_OUT_BUFFER; + } + + jxl::Span compressed(in + dec->frame_start, + size - dec->frame_start); + auto reader = GetBitReader(compressed); + jxl::DecompressParams dparams; + dparams.preview = want_preview ? jxl::Override::kOn : jxl::Override::kOff; + jxl::ImageBundle ib(&dec->metadata.m); + PassesDecoderState preview_dec_state; + JXL_API_RETURN_IF_ERROR(preview_dec_state.output_encoding_info.Set( + dec->metadata, + ColorEncoding::LinearSRGB(dec->metadata.m.color_encoding.IsGray()))); + if (!DecodeFrame(dparams, &preview_dec_state, dec->thread_pool.get(), + reader.get(), &ib, dec->metadata, + /*constraints=*/nullptr, + /*is_preview=*/true)) { + return JXL_API_ERROR("decoding preview failed"); + } + + // Set frame_start to the first non-preview frame. 
+ dec->frame_start += DivCeil(reader->TotalBitsConsumed(), kBitsPerByte); + dec->got_preview_image = true; + + if (want_preview) { + if (dec->preview_out_buffer) { + JxlDecoderStatus status = ConvertImageInternal( + dec, ib, dec->preview_out_format, dec->preview_out_buffer, + dec->preview_out_size, /*out_callback=*/nullptr, + /*out_opaque=*/nullptr); + if (status != JXL_DEC_SUCCESS) return status; + } + return JXL_DEC_PREVIEW_IMAGE; + } + } + } + + // Handle frames + for (;;) { + if (!(dec->events_wanted & (JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME))) { + break; + } + if (dec->frame_stage == FrameStage::kHeader && dec->is_last_total) { + break; + } + + if (dec->frame_stage == FrameStage::kHeader) { + size_t pos = dec->frame_start - dec->codestream_pos; + if (pos >= size) { + return JXL_DEC_NEED_MORE_INPUT; + } + dec->frame_header.reset(new FrameHeader(&dec->metadata)); + int saved_as = 0; + JxlDecoderStatus status = + ParseFrameHeader(dec->frame_header.get(), in, size, pos, + /*is_preview=*/false, &dec->frame_size, &saved_as); + if (status != JXL_DEC_SUCCESS) return status; + + // is last in entire codestream + dec->is_last_total = dec->frame_header->is_last; + // is last of current still + dec->is_last_of_still = + dec->is_last_total || dec->frame_header->animation_frame.duration > 0; + + const size_t internal_frame_index = dec->internal_frames; + const size_t external_frame_index = dec->external_frames; + if (dec->is_last_of_still) dec->external_frames++; + dec->internal_frames++; + + dec->frame_stage = FrameStage::kTOC; + + if (dec->skip_frames > 0) { + dec->skipping_frame = true; + if (dec->is_last_of_still) { + dec->skip_frames--; + } + } else { + dec->skipping_frame = false; + } + + if (external_frame_index >= dec->frame_external_to_internal.size()) { + dec->frame_external_to_internal.push_back(internal_frame_index); + JXL_ASSERT(dec->frame_external_to_internal.size() == + external_frame_index + 1); + } + + if (internal_frame_index >= dec->frame_saved_as.size()) { + 
dec->frame_saved_as.push_back(saved_as); + JXL_ASSERT(dec->frame_saved_as.size() == internal_frame_index + 1); + + // add the value 0xff (which means all references) to new slots: we only + // know the references of the frame at FinalizeFrame, and fill in the + // correct values there. As long as this information is not known, the + // worst case where the frame depends on all storage slots is assumed. + dec->frame_references.push_back(0xff); + JXL_ASSERT(dec->frame_references.size() == internal_frame_index + 1); + } + + if (dec->skipping_frame) { + // Whether this frame could be referenced by any future frame: either + // because it's a frame saved for blending or patches, or because it's + // a DC frame. + bool referenceable = + dec->frame_header->CanBeReferenced() || + dec->frame_header->frame_type == FrameType::kDCFrame; + if (internal_frame_index < dec->frame_required.size() && + !dec->frame_required[internal_frame_index]) { + referenceable = false; + } + if (!referenceable) { + // Skip all decoding for this frame, since the user is skipping this + // frame and no future frames can reference it. + dec->frame_stage = FrameStage::kHeader; + dec->frame_start += dec->frame_size; + continue; + } + } + + if ((dec->events_wanted & JXL_DEC_FRAME) && dec->is_last_of_still) { + // Only return this for the last of a series of stills: patches frames + // etc... before this one do not contain the correct information such + // as animation timing, ... 
+ if (!dec->skipping_frame) { + return JXL_DEC_FRAME; + } + } + } + + if (dec->frame_stage == FrameStage::kTOC) { + size_t pos = dec->frame_start - dec->codestream_pos; + if (pos >= size) { + return JXL_DEC_NEED_MORE_INPUT; + } + Span span(in + pos, size - pos); + auto reader = GetBitReader(span); + + if (!dec->passes_state) { + dec->passes_state.reset(new jxl::PassesDecoderState()); + } + if (!dec->ib) { + dec->ib.reset(new jxl::ImageBundle(&dec->metadata.m)); + } + + dec->frame_dec.reset(new FrameDecoder( + dec->passes_state.get(), dec->metadata, dec->thread_pool.get())); + + // If JPEG reconstruction is wanted and possible, set the jpeg_data of + // the ImageBundle. + if (!dec->jpeg_decoder.SetImageBundleJpegData(dec->ib.get())) + return JXL_DEC_ERROR; + + jxl::Status status = dec->frame_dec->InitFrame( + reader.get(), dec->ib.get(), /*is_preview=*/false, + /*allow_partial_frames=*/false, /*allow_partial_dc_global=*/false); + if (!status) JXL_API_RETURN_IF_ERROR(status); + + size_t sections_begin = + DivCeil(reader->TotalBitsConsumed(), kBitsPerByte); + + dec->sections.reset( + new Sections(dec->frame_dec.get(), dec->frame_size, sections_begin)); + JXL_API_RETURN_IF_ERROR(dec->sections->Init()); + + // If we don't need pixels, we can skip actually decoding the frames + // (kFull / kFullOut). By not updating frame_stage, none of + // these stages will execute, and the loop will continue from the next + // frame. 
+ if (dec->events_wanted & JXL_DEC_FULL_IMAGE) { + dec->frame_dec_in_progress = true; + dec->frame_stage = FrameStage::kFull; + } + } + + bool return_full_image = false; + + if (dec->frame_stage == FrameStage::kFull) { + if (dec->events_wanted & JXL_DEC_FULL_IMAGE) { + if (!dec->image_out_buffer_set && (!dec->jpeg_decoder.IsOutputSet() || + dec->ib->jpeg_data == nullptr) && + dec->is_last_of_still) { + // TODO(lode): remove the dec->is_last_of_still condition if the + // frame decoder needs the image buffer as working space for decoding + // non-visible or blending frames too + if (!dec->skipping_frame) { + return JXL_DEC_NEED_IMAGE_OUT_BUFFER; + } + } + } + + if (dec->image_out_buffer_set && !!dec->image_out_buffer && + dec->image_out_format.data_type == JXL_TYPE_UINT8 && + dec->image_out_format.num_channels >= 3) { + bool is_rgba = dec->image_out_format.num_channels == 4; + dec->frame_dec->MaybeSetRGB8OutputBuffer( + reinterpret_cast(dec->image_out_buffer), + GetStride(dec, dec->image_out_format), is_rgba, + !dec->keep_orientation); + } + + const bool little_endian = + dec->image_out_format.endianness == JXL_LITTLE_ENDIAN || + (dec->image_out_format.endianness == JXL_NATIVE_ENDIAN && + IsLittleEndian()); + bool swap_endianness = little_endian != IsLittleEndian(); + + // TODO(lode): Support more formats than just native endian float32 for + // the low-memory callback path + if (dec->image_out_buffer_set && !!dec->image_out_callback && + dec->image_out_format.data_type == JXL_TYPE_FLOAT && + dec->image_out_format.num_channels >= 3 && !swap_endianness && + dec->frame_dec_in_progress) { + bool is_rgba = dec->image_out_format.num_channels == 4; + dec->frame_dec->MaybeSetFloatCallback( + [dec](const float* pixels, size_t x, size_t y, size_t num_pixels) { + dec->image_out_callback(dec->image_out_opaque, x, y, num_pixels, + pixels); + }, + is_rgba, !dec->keep_orientation); + } + + size_t pos = dec->frame_start - dec->codestream_pos; + if (pos >= size) { + return 
JXL_DEC_NEED_MORE_INPUT; + } + dec->sections->SetInput(in + pos, size - pos); + + if (cpu_limit_base_ != 0) { + FrameDimensions frame_dim = dec->frame_header->ToFrameDimensions(); + // No overflow, checked in ParseHeader. + size_t num_pixels = frame_dim.xsize * frame_dim.ysize; + if (used_cpu_base_ + num_pixels < used_cpu_base_) { + return JXL_API_ERROR("used too much CPU"); + } + used_cpu_base_ += num_pixels; + if (used_cpu_base_ > cpu_limit_base_) { + return JXL_API_ERROR("used too much CPU"); + } + } + + jxl::Status status = + dec->frame_dec->ProcessSections(dec->sections->section_info.data(), + dec->sections->section_info.size(), + dec->sections->section_status.data()); + JXL_API_RETURN_IF_ERROR(dec->sections->CloseInput()); + if (status.IsFatalError()) { + return JXL_API_ERROR("decoding frame failed"); + } + + // TODO(lode): allow next_in to move forward if sections from the + // beginning of the stream have been processed + + if (status.code() == StatusCode::kNotEnoughBytes || + dec->sections->section_info.size() < dec->frame_dec->NumSections()) { + // Not all sections have been processed yet + return JXL_DEC_NEED_MORE_INPUT; + } + + size_t internal_index = dec->internal_frames - 1; + JXL_ASSERT(dec->frame_references.size() > internal_index); + // Always fill this in, even if it was already written, it could be that + // this frame was skipped before and set to 255, while only now we know + // the true value. 
+ dec->frame_references[internal_index] = dec->frame_dec->References(); + if (!dec->frame_dec->FinalizeFrame()) { + return JXL_API_ERROR("decoding frame failed"); + } + dec->frame_dec_in_progress = false; + dec->frame_stage = FrameStage::kFullOutput; + } + + if (dec->frame_stage == FrameStage::kFullOutput) { + if (dec->is_last_of_still) { + if (dec->events_wanted & JXL_DEC_FULL_IMAGE) { + dec->events_wanted &= ~JXL_DEC_FULL_IMAGE; + return_full_image = true; + } + + // Frame finished, restore the events_wanted with the per-frame events + // from orig_events_wanted, in case there is a next frame. + dec->events_wanted |= + (dec->orig_events_wanted & (JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME)); + + // If no output buffer was set, we merely return the JXL_DEC_FULL_IMAGE + // status without outputting pixels. + if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) { + JxlDecoderStatus status = + dec->jpeg_decoder.WriteOutput(*dec->ib->jpeg_data); + if (status != JXL_DEC_SUCCESS) return status; + } else if (return_full_image && dec->image_out_buffer_set) { + if (!dec->frame_dec->HasRGBBuffer()) { + // Copy pixels if desired. + JxlDecoderStatus status = ConvertImageInternal( + dec, *dec->ib, dec->image_out_format, dec->image_out_buffer, + dec->image_out_size, dec->image_out_callback, + dec->image_out_opaque); + if (status != JXL_DEC_SUCCESS) return status; + } + dec->image_out_buffer_set = false; + } + } + } + + // The pixels have been output or are not needed, do not keep them in + // memory here. + dec->ib.reset(); + dec->frame_stage = FrameStage::kHeader; + dec->frame_start += dec->frame_size; + if (return_full_image && !dec->skipping_frame) { + return JXL_DEC_FULL_IMAGE; + } + } + + dec->stage = DecoderStage::kFinished; + // Return success, this means there is nothing more to do. 
+ return JXL_DEC_SUCCESS; +} + +} // namespace +} // namespace jxl + +JxlDecoderStatus JxlDecoderSetInput(JxlDecoder* dec, const uint8_t* data, + size_t size) { + if (dec->next_in) return JXL_DEC_ERROR; + + dec->next_in = data; + dec->avail_in = size; + return JXL_DEC_SUCCESS; +} + +size_t JxlDecoderReleaseInput(JxlDecoder* dec) { + size_t result = dec->avail_in; + dec->next_in = nullptr; + dec->avail_in = 0; + return result; +} + +JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec, uint8_t* data, + size_t size) { + return dec->jpeg_decoder.SetOutputBuffer(data, size); +} + +size_t JxlDecoderReleaseJPEGBuffer(JxlDecoder* dec) { + return dec->jpeg_decoder.ReleaseOutputBuffer(); +} + +JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) { + const uint8_t** next_in = &dec->next_in; + size_t* avail_in = &dec->avail_in; + if (dec->stage == DecoderStage::kInited) { + dec->stage = DecoderStage::kStarted; + } + if (dec->stage == DecoderStage::kError) { + return JXL_API_ERROR( + "Cannot keep using decoder after it encountered an error, use " + "JxlDecoderReset to reset it"); + } + if (dec->stage == DecoderStage::kFinished) { + return JXL_API_ERROR( + "Cannot keep using decoder after it finished, use JxlDecoderReset to " + "reset it"); + } + + if (!dec->got_signature) { + JxlSignature sig = JxlSignatureCheck(*next_in, *avail_in); + if (sig == JXL_SIG_INVALID) return JXL_API_ERROR("invalid signature"); + if (sig == JXL_SIG_NOT_ENOUGH_BYTES) return JXL_DEC_NEED_MORE_INPUT; + + dec->got_signature = true; + + if (sig == JXL_SIG_CONTAINER) { + dec->have_container = 1; + } + } + + // Available codestream bytes, may differ from *avail_in if there is another + // box behind the current position, in the dec->have_container case. 
+ size_t csize = *avail_in; + + if (dec->have_container) { + /* + Process bytes as follows: + *) find the box(es) containing the codestream + *) support codestream split over multiple partial boxes + *) avoid copying bytes to the codestream vector if the decoding will be + one-shot, when the user already provided everything contiguously in + memory + *) copy to codestream vector, and update next_in so user can delete the data + on their side, once we know it's not oneshot. This relieves the user from + continuing to store the data. + *) also copy to codestream if one-shot but the codestream is split across + multiple boxes: this copying can be avoided in the future if the C++ + decoder is updated for streaming, but for now it requires all consecutive + data at once. + */ + + if (dec->skip_box) { + // Amount of remaining bytes in the box that is being skipped. + size_t remaining = dec->box_end - dec->file_pos; + if (*avail_in < remaining) { + // Don't have the full box yet, skip all we have so far + dec->file_pos += *avail_in; + *next_in += *avail_in; + *avail_in -= *avail_in; + return JXL_DEC_NEED_MORE_INPUT; + } else { + // Full box available, skip all its remaining bytes + dec->file_pos += remaining; + *next_in += remaining; + *avail_in -= remaining; + dec->skip_box = false; + } + } + + if (dec->first_codestream_seen && !dec->last_codestream_seen && + dec->codestream_end != 0 && dec->file_pos < dec->codestream_end && + dec->file_pos + *avail_in >= dec->codestream_end && + !dec->codestream.empty()) { + // dec->file_pos in a codestream, not in surrounding box format bytes, but + // the end of the current codestream part is in the current input, and + // boxes that can contain a next part of the codestream could be present. + // Therefore, store the known codestream part, and ensure processing of + // boxes below will trigger. This is only done if + // !dec->codestream.empty(), that is, we're already streaming. 
+ + // Size of the codestream, excluding potential boxes that come after it. + csize = *avail_in; + if (dec->codestream_end && csize > dec->codestream_end - dec->file_pos) { + csize = dec->codestream_end - dec->file_pos; + } + dec->codestream.insert(dec->codestream.end(), *next_in, *next_in + csize); + dec->file_pos += csize; + *next_in += csize; + *avail_in -= csize; + } + + if (dec->jpeg_decoder.IsParsingBox()) { + // We are inside a JPEG reconstruction box. + JxlDecoderStatus recon_result = + dec->jpeg_decoder.Process(next_in, avail_in); + if (recon_result == JXL_DEC_JPEG_RECONSTRUCTION) { + // If successful JPEG reconstruction, return the success if the user + // cares about it, otherwise continue. + if (dec->events_wanted & recon_result) { + dec->events_wanted &= ~recon_result; + return recon_result; + } + } else { + // If anything else, return the result. + return recon_result; + } + } + + if (!dec->last_codestream_seen && + (dec->codestream_begin == 0 || + (dec->codestream_end != 0 && dec->file_pos >= dec->codestream_end))) { + size_t pos = 0; + // after this for loop, either we should be in a part of the data that is + // codestream (not boxes), or have returned that we need more input. + for (;;) { + const uint8_t* in = *next_in; + size_t size = *avail_in; + if (size == pos) { + // If the remaining size is 0, we are exactly after a full box. We + // can't know for sure if this is the last box or not since more bytes + // can follow, but do not return NEED_MORE_INPUT, instead break and + // let the codestream-handling code determine if we need more. + break; + } + if (OutOfBounds(pos, 8, size)) { + dec->basic_info_size_hint = + InitialBasicInfoSizeHint() + pos + 8 - dec->file_pos; + return JXL_DEC_NEED_MORE_INPUT; + } + size_t box_start = pos; + // Box size, including this header itself. 
+ uint64_t box_size = LoadBE32(in + pos); + char type[5] = {0}; + memcpy(type, in + pos + 4, 4); + pos += 8; + if (box_size == 1) { + if (OutOfBounds(pos, 8, size)) return JXL_DEC_NEED_MORE_INPUT; + box_size = LoadBE64(in + pos); + pos += 8; + } + size_t header_size = pos - box_start; + if (box_size > 0 && box_size < header_size) { + return JXL_API_ERROR("invalid box size"); + } + if (SumOverflows(dec->file_pos, pos, box_size)) { + return JXL_API_ERROR("Box size overflow"); + } + size_t contents_size = + (box_size == 0) ? 0 : (box_size - pos + box_start); + + dec->box_begin = box_start; + dec->box_end = dec->file_pos + box_start + box_size; + if (strcmp(type, "jxlc") == 0 || strcmp(type, "jxlp") == 0) { + size_t codestream_size = contents_size; + // Whether this is the last codestream box, either when it is a jxlc + // box, or when it is a jxlp box that has the final bit set. + // The codestream is either contained within a single jxlc box, or + // within one or more jxlp boxes. The final jxlp box is marked as last + // by setting the high bit of its 4-byte box-index value. + bool last_codestream = false; + if (strcmp(type, "jxlp") == 0) { + if (OutOfBounds(pos, 4, size)) return JXL_DEC_NEED_MORE_INPUT; + if (box_size != 0 && contents_size < 4) { + return JXL_API_ERROR("jxlp box too small to contain index"); + } + codestream_size -= 4; + size_t jxlp_index = LoadBE32(in + pos); + pos += 4; + // The high bit of jxlp_index indicates whether this is the last + // jxlp box. 
+ if (jxlp_index & 0x80000000) last_codestream = true; + } else if (strcmp(type, "jxlc") == 0) { + last_codestream = true; + } + if (!last_codestream && box_size == 0) { + return JXL_API_ERROR( + "final box has unbounded size, but is a non-final codestream " + "box"); + } + dec->first_codestream_seen = true; + if (last_codestream) dec->last_codestream_seen = true; + if (dec->codestream_begin != 0 && dec->codestream.empty()) { + // We've already seen a codestream part, so it's a stream spanning + // multiple boxes. + // We have no choice but to copy contents to the codestream + // vector to make it a contiguous stream for the C++ decoder. + // This appends the previous codestream box that we had seen to + // dec->codestream. + if (dec->codestream_begin < dec->file_pos) { + return JXL_API_ERROR("earlier codestream box out of range"); + } + size_t begin = dec->codestream_begin - dec->file_pos; + size_t end = dec->codestream_end - dec->file_pos; + JXL_ASSERT(end <= *avail_in); + dec->codestream.insert(dec->codestream.end(), *next_in + begin, + *next_in + end); + } + dec->codestream_begin = dec->file_pos + pos; + dec->codestream_end = + (box_size == 0) ? 0 : (dec->codestream_begin + codestream_size); + size_t avail_codestream_size = + (box_size == 0) + ? (size - pos) + : std::min(size - pos, box_size - pos + box_start); + // If already appending codestream, append what we have here too + if (!dec->codestream.empty()) { + size_t begin = pos; + size_t end = + std::min(*avail_in, begin + avail_codestream_size); + dec->codestream.insert(dec->codestream.end(), *next_in + begin, + *next_in + end); + pos += (end - begin); + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + pos = 0; + // TODO(lode): check if this should break always instead, and + // process what we have of the codestream so far, to support + // progressive decoding, and get events such as basic info faster. 
+ // The user could have given 1.5 boxes here, and the first one could + // contain useful parts of codestream that can already be processed. + // Similar to several other exact avail_size checks. This may not + // need to be changed here, but instead at the point in this for + // loop where it returns "NEED_MORE_INPUT", it could instead break + // and allow decoding what we have of the codestream so far. + if (*avail_in == 0) break; + } else { + // skip only the header, so next_in points to the start of this new + // codestream part, for the one-shot case where user data is not + // (yet) copied to dec->codestream. + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + pos = 0; + // Update pos to be after the box contents with codestream + if (avail_codestream_size == *avail_in) { + break; // the rest is codestream, this loop is done + } + pos += avail_codestream_size; + } + } else if ((JPEGXL_ENABLE_TRANSCODE_JPEG) && + (dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) && + strcmp(type, "jbrd") == 0) { + // This is a new JPEG reconstruction metadata box. + dec->jpeg_decoder.StartBox(box_size, contents_size); + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + pos = 0; + JxlDecoderStatus recon_result = + dec->jpeg_decoder.Process(next_in, avail_in); + if (recon_result == JXL_DEC_JPEG_RECONSTRUCTION) { + // If successful JPEG reconstruction, return the success if the user + // cares about it, otherwise continue. + if (dec->events_wanted & recon_result) { + dec->events_wanted &= ~recon_result; + return recon_result; + } + } else { + // If anything else, return the result. + return recon_result; + } + } else { + if (box_size == 0) { + // Final box with unknown size, but it's not a codestream box, so + // nothing more to do. 
+ if (!dec->first_codestream_seen) { + return JXL_API_ERROR("didn't find any codestream box"); + } + break; + } + if (OutOfBounds(pos, contents_size, size)) { + dec->skip_box = true; + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + // Indicate how many more bytes needed starting from *next_in. + dec->basic_info_size_hint = InitialBasicInfoSizeHint() + pos + + contents_size - dec->file_pos; + return JXL_DEC_NEED_MORE_INPUT; + } + pos += contents_size; + if (!(dec->codestream.empty() && dec->first_codestream_seen)) { + // Last box no longer needed since we have copied the codestream + // buffer, remove from input so user can release memory. + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + pos = 0; + } + } + } + } + + // Size of the codestream, excluding potential boxes that come after it. + csize = *avail_in; + if (dec->codestream_end && csize > dec->codestream_end - dec->file_pos) { + csize = dec->codestream_end - dec->file_pos; + } + } + + // Whether we are taking the input directly from the user (oneshot case, + // without copying bytes), or appending parts of input to dec->codestream + // (streaming) + bool detected_streaming = !dec->codestream.empty(); + JxlDecoderStatus result; + JXL_DASSERT(csize <= *avail_in); + + if (detected_streaming) { + dec->codestream.insert(dec->codestream.end(), *next_in, *next_in + csize); + dec->file_pos += csize; + *next_in += csize; + *avail_in -= csize; + result = jxl::JxlDecoderProcessInternal(dec, dec->codestream.data(), + dec->codestream.size()); + } else { + // No data copied to codestream buffer yet, the user input may contain the + // full codestream. + result = jxl::JxlDecoderProcessInternal(dec, *next_in, csize); + // Copy the user's input bytes to the codestream once we are able to and + // it is needed. Before we got the basic info, we're still parsing the box + // format instead. 
If the result is not JXL_DEC_NEED_MORE_INPUT, then + // there is no reason yet to copy since the user may have a full buffer + // allowing one-shot. Once JXL_DEC_NEED_MORE_INPUT occurred at least once, + // start copying over the codestream bytes and allow user to free them + // instead. Next call, detected_streaming will be true. + if (dec->got_basic_info && result == JXL_DEC_NEED_MORE_INPUT) { + dec->codestream.insert(dec->codestream.end(), *next_in, *next_in + csize); + dec->file_pos += csize; + *next_in += csize; + *avail_in -= csize; + } + } + + return result; +} + +JxlDecoderStatus JxlDecoderGetBasicInfo(const JxlDecoder* dec, + JxlBasicInfo* info) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + if (info) { + const jxl::ImageMetadata& meta = dec->metadata.m; + + info->have_container = dec->have_container; + info->xsize = dec->metadata.size.xsize(); + info->ysize = dec->metadata.size.ysize(); + info->uses_original_profile = !meta.xyb_encoded; + + info->bits_per_sample = meta.bit_depth.bits_per_sample; + info->exponent_bits_per_sample = meta.bit_depth.exponent_bits_per_sample; + + info->have_preview = meta.have_preview; + info->have_animation = meta.have_animation; + // TODO(janwas): intrinsic_size + info->orientation = static_cast(meta.orientation); + + if (!dec->keep_orientation) { + if (info->orientation >= JXL_ORIENT_TRANSPOSE) { + std::swap(info->xsize, info->ysize); + } + info->orientation = JXL_ORIENT_IDENTITY; + } + + info->intensity_target = meta.IntensityTarget(); + info->min_nits = meta.tone_mapping.min_nits; + info->relative_to_max_display = meta.tone_mapping.relative_to_max_display; + info->linear_below = meta.tone_mapping.linear_below; + + const jxl::ExtraChannelInfo* alpha = meta.Find(jxl::ExtraChannel::kAlpha); + if (alpha != nullptr) { + info->alpha_bits = alpha->bit_depth.bits_per_sample; + info->alpha_exponent_bits = alpha->bit_depth.exponent_bits_per_sample; + info->alpha_premultiplied = alpha->alpha_associated; + } else { 
+ info->alpha_bits = 0; + info->alpha_exponent_bits = 0; + info->alpha_premultiplied = 0; + } + + info->num_color_channels = + meta.color_encoding.GetColorSpace() == jxl::ColorSpace::kGray ? 1 : 3; + + info->num_extra_channels = meta.num_extra_channels; + + if (info->have_preview) { + info->preview.xsize = dec->metadata.m.preview_size.xsize(); + info->preview.ysize = dec->metadata.m.preview_size.ysize(); + } + + if (info->have_animation) { + info->animation.tps_numerator = dec->metadata.m.animation.tps_numerator; + info->animation.tps_denominator = + dec->metadata.m.animation.tps_denominator; + info->animation.num_loops = dec->metadata.m.animation.num_loops; + info->animation.have_timecodes = dec->metadata.m.animation.have_timecodes; + } + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetExtraChannelInfo(const JxlDecoder* dec, + size_t index, + JxlExtraChannelInfo* info) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + const std::vector& channels = + dec->metadata.m.extra_channel_info; + + if (index >= channels.size()) return JXL_DEC_ERROR; // out of bounds + const jxl::ExtraChannelInfo& channel = channels[index]; + + info->type = static_cast(channel.type); + info->bits_per_sample = channel.bit_depth.bits_per_sample; + info->exponent_bits_per_sample = + channel.bit_depth.floating_point_sample + ? 
channel.bit_depth.exponent_bits_per_sample + : 0; + info->dim_shift = channel.dim_shift; + info->name_length = channel.name.size(); + info->alpha_associated = channel.alpha_associated; + info->spot_color[0] = channel.spot_color[0]; + info->spot_color[1] = channel.spot_color[1]; + info->spot_color[2] = channel.spot_color[2]; + info->spot_color[3] = channel.spot_color[3]; + info->cfa_channel = channel.cfa_channel; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetExtraChannelName(const JxlDecoder* dec, + size_t index, char* name, + size_t size) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + const std::vector& channels = + dec->metadata.m.extra_channel_info; + + if (index >= channels.size()) return JXL_DEC_ERROR; // out of bounds + const jxl::ExtraChannelInfo& channel = channels[index]; + + // Also need null-termination character + if (channel.name.size() + 1 > size) return JXL_DEC_ERROR; + + memcpy(name, channel.name.c_str(), channel.name.size() + 1); + + return JXL_DEC_SUCCESS; +} + +namespace { + +// Gets the jxl::ColorEncoding for the desired target, and checks errors. +// Returns the object regardless of whether the actual color space is in ICC, +// but ensures that if the color encoding is not the encoding from the +// codestream header metadata, it cannot require ICC profile. 
+JxlDecoderStatus GetColorEncodingForTarget( + const JxlDecoder* dec, const JxlPixelFormat* format, + JxlColorProfileTarget target, const jxl::ColorEncoding** encoding) { + if (!dec->got_all_headers) return JXL_DEC_NEED_MORE_INPUT; + *encoding = nullptr; + if (target == JXL_COLOR_PROFILE_TARGET_DATA && dec->metadata.m.xyb_encoded) { + *encoding = &dec->passes_state->output_encoding_info.color_encoding; + } else { + *encoding = &dec->metadata.m.color_encoding; + } + return JXL_DEC_SUCCESS; +} +} // namespace + +JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile( + const JxlDecoder* dec, const JxlPixelFormat* format, + JxlColorProfileTarget target, JxlColorEncoding* color_encoding) { + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + JxlDecoderStatus status = + GetColorEncodingForTarget(dec, format, target, &jxl_color_encoding); + if (status) return status; + + if (jxl_color_encoding->WantICC()) + return JXL_DEC_ERROR; // Indicate no encoded profile available. + + if (color_encoding) { + ConvertInternalToExternalColorEncoding(*jxl_color_encoding, color_encoding); + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetICCProfileSize(const JxlDecoder* dec, + const JxlPixelFormat* format, + JxlColorProfileTarget target, + size_t* size) { + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + JxlDecoderStatus status = + GetColorEncodingForTarget(dec, format, target, &jxl_color_encoding); + if (status != JXL_DEC_SUCCESS) return status; + + if (jxl_color_encoding->WantICC()) { + jxl::ColorSpace color_space = + dec->metadata.m.color_encoding.GetColorSpace(); + if (color_space == jxl::ColorSpace::kUnknown || + color_space == jxl::ColorSpace::kXYB) { + // This indicates there's no ICC profile available + // TODO(lode): for the XYB case, do we want to craft an ICC profile that + // represents XYB as an RGB profile? It may be possible, but not with + // only 1D transfer functions. 
+ return JXL_DEC_ERROR; + } + } + + if (size) { + *size = jxl_color_encoding->ICC().size(); + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetColorAsICCProfile(const JxlDecoder* dec, + const JxlPixelFormat* format, + JxlColorProfileTarget target, + uint8_t* icc_profile, + size_t size) { + size_t wanted_size; + // This also checks the NEED_MORE_INPUT and the unknown/xyb cases + JxlDecoderStatus status = + JxlDecoderGetICCProfileSize(dec, format, target, &wanted_size); + if (status != JXL_DEC_SUCCESS) return status; + if (size < wanted_size) return JXL_API_ERROR("ICC profile output too small"); + + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + status = GetColorEncodingForTarget(dec, format, target, &jxl_color_encoding); + if (status != JXL_DEC_SUCCESS) return status; + + memcpy(icc_profile, jxl_color_encoding->ICC().data(), + jxl_color_encoding->ICC().size()); + + return JXL_DEC_SUCCESS; +} + +namespace { + +// Returns the amount of bits needed for getting memory buffer size, and does +// all error checking required for size checking and format validity. +JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec, + const JxlPixelFormat* format, size_t* bits) { + if (!dec->got_basic_info) { + // Don't know image dimensions yet, cannot check for valid size. 
+ return JXL_DEC_NEED_MORE_INPUT; + } + if (format->num_channels > 4) { + return JXL_API_ERROR("More than 4 channels not supported"); + } + if (format->num_channels < 3 && !dec->metadata.m.color_encoding.IsGray()) { + return JXL_API_ERROR("Grayscale output not possible for color image"); + } + if (format->data_type == JXL_TYPE_BOOLEAN) { + return JXL_API_ERROR("Boolean data type not yet supported"); + } + if (format->data_type == JXL_TYPE_UINT32) { + return JXL_API_ERROR("uint32 data type not yet supported"); + } + + *bits = BitsPerChannel(format->data_type); + + if (*bits == 0) { + return JXL_API_ERROR("Invalid data type"); + } + + return JXL_DEC_SUCCESS; +} +} // namespace + +JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) { + if (!dec->image_out_buffer) return JXL_DEC_ERROR; + if (!dec->sections || dec->sections->section_info.empty()) { + return JXL_DEC_ERROR; + } + if (!dec->frame_dec || !dec->frame_dec_in_progress) { + return JXL_DEC_ERROR; + } + if (!dec->frame_dec->HasDecodedDC()) { + // FrameDecoder::Fush currently requires DC to have been decoded already + // to work correctly. + return JXL_DEC_ERROR; + } + if (dec->frame_header->encoding != jxl::FrameEncoding::kVarDCT) { + // Flushing does not yet work correctly if the frame uses modular encoding. + return JXL_DEC_ERROR; + } + if (dec->metadata.m.num_extra_channels > 0) { + // Flushing does not yet work correctly if there are extra channels, which + // use modular + return JXL_DEC_ERROR; + } + + if (!dec->frame_dec->Flush()) { + return JXL_DEC_ERROR; + } + + if (dec->frame_dec->HasRGBBuffer()) { + return JXL_DEC_SUCCESS; + } + + // Temporarily shrink `dec->ib` to the actual size of the full image to call + // ConvertImageInternal. 
+ size_t xsize = dec->ib->xsize(); + size_t ysize = dec->ib->ysize(); + dec->ib->ShrinkTo(dec->metadata.size.xsize(), dec->metadata.size.ysize()); + JxlDecoderStatus status = jxl::ConvertImageInternal( + dec, *dec->ib, dec->image_out_format, dec->image_out_buffer, + dec->image_out_size, + /*out_callback=*/nullptr, /*out_opaque=*/nullptr); + dec->ib->ShrinkTo(xsize, ysize); + if (status != JXL_DEC_SUCCESS) return status; + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) { + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + + size_t xsize = dec->metadata.oriented_preview_xsize(dec->keep_orientation); + size_t ysize = dec->metadata.oriented_preview_ysize(dec->keep_orientation); + + size_t row_size = + jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte); + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * ysize; + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreviewOutBuffer( + JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size) { + if (!dec->got_basic_info || !dec->metadata.m.have_preview || + !(dec->orig_events_wanted & JXL_DEC_PREVIEW_IMAGE)) { + return JXL_API_ERROR("No preview out buffer needed at this time"); + } + + size_t min_size; + // This also checks whether the format is valid and supported and basic info + // is available. 
+ JxlDecoderStatus status = + JxlDecoderPreviewOutBufferSize(dec, format, &min_size); + if (status != JXL_DEC_SUCCESS) return status; + + if (size < min_size) return JXL_DEC_ERROR; + + dec->preview_out_buffer_set = true; + dec->preview_out_buffer = buffer; + dec->preview_out_size = size; + dec->preview_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderDCOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) { + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + + size_t xsize = jxl::DivCeil( + dec->metadata.oriented_xsize(dec->keep_orientation), jxl::kBlockDim); + size_t ysize = jxl::DivCeil( + dec->metadata.oriented_ysize(dec->keep_orientation), jxl::kBlockDim); + + size_t row_size = + jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte); + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * ysize; + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderSetDCOutBuffer( + JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size) { + // No buffer set: this feature is deprecated + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) { + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + + size_t row_size = + jxl::DivCeil(dec->metadata.oriented_xsize(dec->keep_orientation) * + format->num_channels * bits, + jxl::kBitsPerByte); + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * dec->metadata.oriented_ysize(dec->keep_orientation); + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetImageOutBuffer(JxlDecoder* dec, + const JxlPixelFormat* format, + void* 
buffer, size_t size) { + if (!dec->got_basic_info || !(dec->orig_events_wanted & JXL_DEC_FULL_IMAGE)) { + return JXL_API_ERROR("No image out buffer needed at this time"); + } + if (dec->image_out_buffer_set && !!dec->image_out_callback) { + return JXL_API_ERROR( + "Cannot change from image out callback to image out buffer"); + } + size_t min_size; + // This also checks whether the format is valid and supported and basic info + // is available. + JxlDecoderStatus status = + JxlDecoderImageOutBufferSize(dec, format, &min_size); + if (status != JXL_DEC_SUCCESS) return status; + + if (size < min_size) return JXL_DEC_ERROR; + + dec->image_out_buffer_set = true; + dec->image_out_buffer = buffer; + dec->image_out_size = size; + dec->image_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetImageOutCallback(JxlDecoder* dec, + const JxlPixelFormat* format, + JxlImageOutCallback callback, + void* opaque) { + if (dec->image_out_buffer_set && !!dec->image_out_buffer) { + return JXL_API_ERROR( + "Cannot change from image out buffer to image out callback"); + } + + // Perform error checking for invalid format. 
+ size_t bits_dummy; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits_dummy); + if (status != JXL_DEC_SUCCESS) return status; + + dec->image_out_buffer_set = true; + dec->image_out_callback = callback; + dec->image_out_opaque = opaque; + dec->image_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec, + JxlFrameHeader* header) { + if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) { + return JXL_API_ERROR("no frame header available"); + } + const auto& metadata = dec->metadata.m; + if (metadata.have_animation) { + header->duration = dec->frame_header->animation_frame.duration; + if (metadata.animation.have_timecodes) { + header->timecode = dec->frame_header->animation_frame.timecode; + } + } + header->name_length = dec->frame_header->name.size(); + header->is_last = dec->frame_header->is_last; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec, char* name, + size_t size) { + if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) { + return JXL_API_ERROR("no frame header available"); + } + if (size < dec->frame_header->name.size() + 1) { + return JXL_API_ERROR("too small frame name output buffer"); + } + memcpy(name, dec->frame_header->name.c_str(), + dec->frame_header->name.size() + 1); + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetPreferredColorProfile( + JxlDecoder* dec, const JxlColorEncoding* color_encoding) { + if (!dec->got_all_headers) { + return JXL_API_ERROR("color info not yet available"); + } + if (dec->post_headers) { + return JXL_API_ERROR("too late to set the color encoding"); + } + if (dec->metadata.m.color_encoding.IsGray() != + (color_encoding->color_space == JXL_COLOR_SPACE_GRAY)) { + return JXL_API_ERROR("grayscale mismatch"); + } + if (color_encoding->color_space == JXL_COLOR_SPACE_UNKNOWN || + color_encoding->color_space == JXL_COLOR_SPACE_XYB) { + return JXL_API_ERROR("only 
RGB or grayscale output supported"); + } + + JXL_API_RETURN_IF_ERROR(ConvertExternalToInternalColorEncoding( + *color_encoding, &dec->default_enc)); + JXL_API_RETURN_IF_ERROR(dec->passes_state->output_encoding_info.Set( + dec->metadata, dec->default_enc)); + return JXL_DEC_SUCCESS; +} + +// This function is "package-private". It is only used by fuzzer to avoid +// running cases that are too memory / CPU hungry. Limitations are applied +// at mid-level API. In the future high-level API would also include the +// means of limiting / throttling memory / CPU usage. +void SetDecoderMemoryLimitBase_(size_t memory_limit_base) { + memory_limit_base_ = memory_limit_base; + // Allow 5 x max_image_size processing units; every frame is accounted + // as W x H CPU processing units, so there could be numerous small frames + // or few larger ones. + cpu_limit_base_ = 5 * memory_limit_base; +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_test.cc new file mode 100644 index 0000000000..f1acc4a1e0 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_test.cc @@ -0,0 +1,2926 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "jxl/decode.h" + +#include +#include + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "jxl/decode_cxx.h" +#include "jxl/resizable_parallel_runner_cxx.h" +#include "jxl/thread_parallel_runner_cxx.h" +#include "lib/extras/codec.h" +#include "lib/extras/codec_jpg.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/file_io.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/dec_file.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_gamma_correct.h" +#include "lib/jxl/enc_icc_codec.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/icc_codec.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" +#include "tools/box/box.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +void AppendU32BE(uint32_t u32, jxl::PaddedBytes* bytes) { + bytes->push_back(u32 >> 24); + bytes->push_back(u32 >> 16); + bytes->push_back(u32 >> 8); + bytes->push_back(u32 >> 0); +} + +bool Near(double expected, double value, double max_dist) { + double dist = expected > value ? 
expected - value : value - expected; + return dist <= max_dist; +} + +// Loads a Big-Endian float +float LoadBEFloat(const uint8_t* p) { + uint32_t u = LoadBE32(p); + float result; + memcpy(&result, &u, 4); + return result; +} + +// Loads a Little-Endian float +float LoadLEFloat(const uint8_t* p) { + uint32_t u = LoadLE32(p); + float result; + memcpy(&result, &u, 4); + return result; +} + +// Based on highway scalar implementation, for testing +float LoadFloat16(uint16_t bits16) { + const uint32_t sign = bits16 >> 15; + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { + const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024)); + return sign ? -subnormal : subnormal; + } + + // Normalized: convert the representation directly (faster than ldexp/tables). + const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + + float result; + memcpy(&result, &bits32, 4); + return result; +} + +float LoadLEFloat16(const uint8_t* p) { + uint16_t bits16 = LoadLE16(p); + return LoadFloat16(bits16); +} + +float LoadBEFloat16(const uint8_t* p) { + uint16_t bits16 = LoadBE16(p); + return LoadFloat16(bits16); +} + +size_t GetPrecision(JxlDataType data_type) { + switch (data_type) { + case JXL_TYPE_BOOLEAN: + return 1; + case JXL_TYPE_UINT8: + return 8; + case JXL_TYPE_UINT16: + return 16; + case JXL_TYPE_UINT32: + return 32; + case JXL_TYPE_FLOAT: + // Floating point mantissa precision + return 24; + case JXL_TYPE_FLOAT16: + return 11; + } + JXL_ASSERT(false); // unknown type +} + +size_t GetDataBits(JxlDataType data_type) { + switch (data_type) { + case JXL_TYPE_BOOLEAN: + return 1; + case JXL_TYPE_UINT8: + return 8; + case JXL_TYPE_UINT16: + return 16; + case JXL_TYPE_UINT32: + return 32; + case JXL_TYPE_FLOAT: + return 32; + case JXL_TYPE_FLOAT16: + 
return 16; + } + JXL_ASSERT(false); // unknown type +} + +// What type of codestream format in the boxes to use for testing +enum CodeStreamBoxFormat { + // Do not use box format at all, only pure codestream + kCSBF_None, + // Have a single codestream box, with its actual size given in the box + kCSBF_Single, + // Have a single codestream box, with box size 0 (final box running to end) + kCSBF_Single_Zero_Terminated, + // Single codestream box, with another unknown box behind it + kCSBF_Single_other, + // Have multiple partial codestream boxes + kCSBF_Multi, + // Have multiple partial codestream boxes, with final box size 0 (running + // to end) + kCSBF_Multi_Zero_Terminated, + // Have multiple partial codestream boxes, terminated by non-codestream box + kCSBF_Multi_Other_Terminated, + // Have multiple partial codestream boxes, terminated by non-codestream box + // that has its size set to 0 (running to end) + kCSBF_Multi_Other_Zero_Terminated, + // Have multiple partial codestream boxes, and the first one has a content + // of zero length + kCSBF_Multi_First_Empty, + // Not a value but used for counting amount of enum entries + kCSBF_NUM_ENTRIES, +}; + +// Returns an ICC profile output by the JPEG XL decoder for RGB_D65_SRG_Rel_Lin, +// but with, on purpose, rXYZ, bXYZ and gXYZ (the RGB primaries) switched to a +// different order to ensure the profile does not match any known profile, so +// the encoder cannot encode it in a compact struct instead. 
+jxl::PaddedBytes GetIccTestProfile() { + const uint8_t* profile = reinterpret_cast( + "\0\0\3\200lcms\0040\0\0mntrRGB XYZ " + "\a\344\0\a\0\27\0\21\0$" + "\0\37acspAPPL\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\366" + "\326\0\1\0\0\0\0\323-lcms\372c\207\36\227\200{" + "\2\232s\255\327\340\0\n\26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0\rdesc\0\0\1 " + "\0\0\0Bcprt\0\0\1d\0\0\1\0wtpt\0\0\2d\0\0\0\24chad\0\0\2x\0\0\0," + "bXYZ\0\0\2\244\0\0\0\24gXYZ\0\0\2\270\0\0\0\24rXYZ\0\0\2\314\0\0\0\24rTR" + "C\0\0\2\340\0\0\0 gTRC\0\0\2\340\0\0\0 bTRC\0\0\2\340\0\0\0 " + "chrm\0\0\3\0\0\0\0$dmnd\0\0\3$\0\0\0(" + "dmdd\0\0\3L\0\0\0002mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0&" + "\0\0\0\34\0R\0G\0B\0_\0D\0006\0005\0_\0S\0R\0G\0_\0R\0e\0l\0_" + "\0L\0i\0n\0\0mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\344\0\0\0\34\0C\0o\0" + "p\0y\0r\0i\0g\0h\0t\0 \0002\0000\0001\08\0 \0G\0o\0o\0g\0l\0e\0 " + "\0L\0L\0C\0,\0 \0C\0C\0-\0B\0Y\0-\0S\0A\0 \0003\0.\0000\0 " + "\0U\0n\0p\0o\0r\0t\0e\0d\0 " + "\0l\0i\0c\0e\0n\0s\0e\0(\0h\0t\0t\0p\0s\0:\0/\0/" + "\0c\0r\0e\0a\0t\0i\0v\0e\0c\0o\0m\0m\0o\0n\0s\0.\0o\0r\0g\0/" + "\0l\0i\0c\0e\0n\0s\0e\0s\0/\0b\0y\0-\0s\0a\0/\0003\0.\0000\0/" + "\0l\0e\0g\0a\0l\0c\0o\0d\0e\0)XYZ " + "\0\0\0\0\0\0\366\326\0\1\0\0\0\0\323-" + "sf32\0\0\0\0\0\1\fB\0\0\5\336\377\377\363%" + "\0\0\a\223\0\0\375\220\377\377\373\241\377\377\375\242\0\0\3\334\0\0\300" + "nXYZ \0\0\0\0\0\0o\240\0\08\365\0\0\3\220XYZ " + "\0\0\0\0\0\0$\237\0\0\17\204\0\0\266\304XYZ " + "\0\0\0\0\0\0b\227\0\0\267\207\0\0\30\331para\0\0\0\0\0\3\0\0\0\1\0\0\0\1" + "\0\0\0\0\0\0\0\1\0\0\0\0\0\0chrm\0\0\0\0\0\3\0\0\0\0\243\327\0\0T|" + "\0\0L\315\0\0\231\232\0\0&" + "g\0\0\17\\mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\f\0\0\0\34\0G\0o\0o\0g" + "\0l\0emluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\26\0\0\0\34\0I\0m\0a\0g\0e" + "\0 \0c\0o\0d\0e\0c\0\0"); + size_t profile_size = 896; + jxl::PaddedBytes icc_profile; + icc_profile.assign(profile, profile + profile_size); + return icc_profile; 
+} + +} // namespace + +namespace jxl { +namespace { + +// Input pixels always given as 16-bit RGBA, 8 bytes per pixel. +// include_alpha determines if the encoded image should contain the alpha +// channel. +// add_icc_profile: if false, encodes the image as sRGB using the JXL fields, +// for grayscale or RGB images. If true, encodes the image using the ICC profile +// returned by GetIccTestProfile, without the JXL fields, this requires the +// image is RGB, not grayscale. +// Providing jpeg_codestream will populate the jpeg_codestream with compressed +// JPEG bytes, and make it possible to reconstruct those exact JPEG bytes using +// the return value _if_ add_container indicates a box format. +PaddedBytes CreateTestJXLCodestream( + Span pixels, size_t xsize, size_t ysize, size_t num_channels, + const CompressParams& cparams, CodeStreamBoxFormat add_container, + JxlOrientation orientation, bool add_preview, bool add_icc_profile = false, + PaddedBytes* jpeg_codestream = nullptr) { + // Compress the pixels with JPEG XL. + bool grayscale = (num_channels <= 2); + bool include_alpha = !(num_channels & 1) && jpeg_codestream == nullptr; + size_t bitdepth = jpeg_codestream == nullptr ? 16 : 8; + CodecInOut io; + io.SetSize(xsize, ysize); + ColorEncoding color_encoding = + jxl::ColorEncoding::SRGB(/*is_gray=*/grayscale); + if (add_icc_profile) { + // the hardcoded ICC profile we attach requires RGB. + EXPECT_EQ(false, grayscale); + EXPECT_TRUE(color_encoding.SetICC(GetIccTestProfile())); + } + ThreadPool pool(nullptr, nullptr); + io.metadata.m.SetUintSamples(bitdepth); + if (include_alpha) { + io.metadata.m.SetAlphaBits(bitdepth); + } + // Make the grayscale-ness of the io metadata color_encoding and the packed + // image match. 
+ io.metadata.m.color_encoding = color_encoding; + EXPECT_TRUE(ConvertFromExternal( + pixels, xsize, ysize, color_encoding, /*has_alpha=*/include_alpha, + /*alpha_is_premultiplied=*/false, bitdepth, JXL_BIG_ENDIAN, + /*flipped_y=*/false, &pool, &io.Main())); + jxl::PaddedBytes jpeg_data; + if (jpeg_codestream != nullptr) { +#if JPEGXL_ENABLE_JPEG + jxl::PaddedBytes jpeg_bytes; + EXPECT_TRUE(EncodeImageJPG(&io, jxl::JpegEncoder::kLibJpeg, /*quality=*/70, + jxl::YCbCrChromaSubsampling(), &pool, + &jpeg_bytes, jxl::DecodeTarget::kPixels)); + jpeg_codestream->append(jpeg_bytes.data(), + jpeg_bytes.data() + jpeg_bytes.size()); + EXPECT_TRUE(jxl::jpeg::DecodeImageJPG( + jxl::Span(jpeg_bytes.data(), jpeg_bytes.size()), &io)); + EXPECT_TRUE(EncodeJPEGData(*io.Main().jpeg_data, &jpeg_data)); + io.metadata.m.xyb_encoded = false; +#else // JPEGXL_ENABLE_JPEG + JXL_ABORT( + "unable to create reconstructible JPEG without JPEG support enabled"); +#endif // JPEGXL_ENABLE_JPEG + } + if (add_preview) { + io.preview_frame = io.Main().Copy(); + io.preview_frame.ShrinkTo(xsize / 7, ysize / 7); + io.metadata.m.have_preview = true; + EXPECT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(), + io.preview_frame.ysize())); + } + io.metadata.m.orientation = orientation; + AuxOut aux_out; + PaddedBytes compressed; + PassesEncoderState enc_state; + EXPECT_TRUE( + EncodeFile(cparams, &io, &enc_state, &compressed, &aux_out, &pool)); + if (add_container != kCSBF_None) { + // Header with signature box and ftyp box. + const uint8_t header[] = {0, 0, 0, 0xc, 0x4a, 0x58, 0x4c, 0x20, + 0xd, 0xa, 0x87, 0xa, 0, 0, 0, 0x14, + 0x66, 0x74, 0x79, 0x70, 0x6a, 0x78, 0x6c, 0x20, + 0, 0, 0, 0, 0x6a, 0x78, 0x6c, 0x20}; + // Unknown box, could be a box added by user, decoder must be able to skip + // over it. Type is set to 'unkn', size to 24, contents to 16 0's. 
+ const uint8_t unknown[] = {0, 0, 0, 0x18, 0x75, 0x6e, 0x6b, 0x6e, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + // same as the unknown box, but with size set to 0, this can only be a final + // box + const uint8_t unknown_end[] = {0, 0, 0, 0, 0x75, 0x6e, 0x6b, 0x6e, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + + bool is_multi = add_container == kCSBF_Multi || + add_container == kCSBF_Multi_Zero_Terminated || + add_container == kCSBF_Multi_Other_Terminated || + add_container == kCSBF_Multi_Other_Zero_Terminated || + add_container == kCSBF_Multi_First_Empty; + + if (is_multi) { + size_t third = compressed.size() / 3; + std::vector compressed0(compressed.data(), + compressed.data() + third); + std::vector compressed1(compressed.data() + third, + compressed.data() + 2 * third); + std::vector compressed2(compressed.data() + 2 * third, + compressed.data() + compressed.size()); + + PaddedBytes c; + c.append(header, header + sizeof(header)); + if (jpeg_codestream != nullptr) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false, + &c); + c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + } + uint32_t jxlp_index = 0; + if (add_container == kCSBF_Multi_First_Empty) { + // Dummy (empty) codestream part + AppendU32BE(12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++, &c); + } + // First codestream part + AppendU32BE(compressed0.size() + 12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++, &c); + c.append(compressed0.data(), compressed0.data() + compressed0.size()); + // A few non-codestream boxes in between + c.append(unknown, unknown + sizeof(unknown)); + c.append(unknown, unknown + sizeof(unknown)); + // Dummy (empty) codestream part + AppendU32BE(12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++, &c); + // Second codestream 
part + AppendU32BE(compressed1.size() + 12, &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++, &c); + c.append(compressed1.data(), compressed1.data() + compressed1.size()); + // Third codestream part + AppendU32BE(add_container == kCSBF_Multi ? (compressed2.size() + 12) : 0, + &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('p'); + AppendU32BE(jxlp_index++ | 0x80000000, &c); + c.append(compressed2.data(), compressed2.data() + compressed2.size()); + if (add_container == kCSBF_Multi_Other_Terminated) { + c.append(unknown, unknown + sizeof(unknown)); + } + if (add_container == kCSBF_Multi_Other_Zero_Terminated) { + c.append(unknown_end, unknown_end + sizeof(unknown_end)); + } + compressed.swap(c); + } else { + PaddedBytes c; + c.append(header, header + sizeof(header)); + if (jpeg_codestream != nullptr) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false, + &c); + c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + } + AppendU32BE(add_container == kCSBF_Single_Zero_Terminated + ? 0 + : (compressed.size() + 8), + &c); + c.push_back('j'); + c.push_back('x'); + c.push_back('l'); + c.push_back('c'); + c.append(compressed.data(), compressed.data() + compressed.size()); + if (add_container == kCSBF_Single_other) { + c.append(unknown, unknown + sizeof(unknown)); + } + compressed.swap(c); + } + } + + return compressed; +} + +// Decodes one-shot with the API for non-streaming decoding tests. 
+std::vector DecodeWithAPI(JxlDecoder* dec, + Span compressed, + const JxlPixelFormat& format, + bool use_callback, bool set_buffer_early, + bool use_resizable_runner) { + JxlThreadParallelRunnerPtr runner_fixed; + JxlResizableParallelRunnerPtr runner_resizable; + JxlParallelRunner runner_fn; + void* runner; + + if (use_resizable_runner) { + runner_resizable = JxlResizableParallelRunnerMake(nullptr); + runner = runner_resizable.get(); + runner_fn = JxlResizableParallelRunner; + } else { + runner_fixed = JxlThreadParallelRunnerMake( + nullptr, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + runner = runner_fixed.get(); + runner_fn = JxlThreadParallelRunner; + } + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, runner_fn, runner)); + + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | (set_buffer_early ? JXL_DEC_FRAME : 0) | + JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FULL_IMAGE)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput(dec, compressed.data(), compressed.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + if (use_resizable_runner) { + JxlResizableParallelRunnerSetThreads( + runner, + JxlResizableParallelRunnerSuggestThreads(info.xsize, info.ysize)); + } + + std::vector pixels(buffer_size); + size_t bytes_per_pixel = + format.num_channels * GetDataBits(format.data_type) / jxl::kBitsPerByte; + size_t stride = bytes_per_pixel * info.xsize; + if (format.align > 1) { + stride = jxl::DivCeil(stride, format.align) * format.align; + } + auto callback = [&](size_t x, size_t y, size_t num_pixels, + const void* pixels_row) { + memcpy(pixels.data() + stride * y + bytes_per_pixel * x, pixels_row, + num_pixels * bytes_per_pixel); + }; + + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + + 
std::vector preview; + if (status == JXL_DEC_NEED_PREVIEW_OUT_BUFFER) { + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size)); + preview.resize(buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreviewOutBuffer(dec, &format, preview.data(), + preview.size())); + EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec)); + + status = JxlDecoderProcessInput(dec); + } + + if (set_buffer_early) { + EXPECT_EQ(JXL_DEC_FRAME, status); + } else { + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, status); + } + + if (use_callback) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutCallback( + dec, &format, + [](void* opaque, size_t x, size_t y, size_t xsize, + const void* pixels_row) { + auto cb = static_cast(opaque); + (*cb)(x, y, xsize, pixels_row); + }, + /*opaque=*/&callback)); + } else { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + } + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + + // After the full image is gotten, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + return pixels; +} + +// Decodes one-shot with the API for non-streaming decoding tests. +std::vector DecodeWithAPI(Span compressed, + const JxlPixelFormat& format, + bool use_callback, bool set_buffer_early, + bool use_resizable_runner) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + std::vector pixels = + DecodeWithAPI(dec, compressed, format, use_callback, set_buffer_early, + use_resizable_runner); + JxlDecoderDestroy(dec); + return pixels; +} + +} // namespace +} // namespace jxl + +namespace { + +// Procedure to convert pixels to double precision, not efficient, but +// well-controlled for testing. It uses double, to be able to represent all +// precisions needed for the maximum data types the API supports: uint32_t +// integers, and, single precision float. 
The values are in range 0-1 for SDR. +std::vector ConvertToRGBA32(const uint8_t* pixels, size_t xsize, + size_t ysize, + const JxlPixelFormat& format) { + std::vector result(xsize * ysize * 4); + size_t num_channels = format.num_channels; + bool gray = num_channels == 1 || num_channels == 2; + bool alpha = num_channels == 2 || num_channels == 4; + + size_t stride = + xsize * jxl::DivCeil(GetDataBits(format.data_type) * num_channels, + jxl::kBitsPerByte); + if (format.align > 1) stride = jxl::RoundUpTo(stride, format.align); + + if (format.data_type == JXL_TYPE_BOOLEAN) { + for (size_t y = 0; y < ysize; ++y) { + jxl::BitReader br(jxl::Span(pixels + stride * y, stride)); + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + double r = br.ReadBits(1); + double g = gray ? r : br.ReadBits(1); + double b = gray ? r : br.ReadBits(1); + double a = alpha ? br.ReadBits(1) : 1; + result[j + 0] = r; + result[j + 1] = g; + result[j + 2] = b; + result[j + 3] = a; + } + JXL_CHECK(br.Close()); + } + } else if (format.data_type == JXL_TYPE_UINT8) { + double mul = 1.0 / 255.0; // Multiplier to bring to 0-1.0 range + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels; + double r = pixels[i]; + double g = gray ? r : pixels[i + 1]; + double b = gray ? r : pixels[i + 2]; + double a = alpha ? pixels[i + num_channels - 1] : 255; + result[j + 0] = r * mul; + result[j + 1] = g * mul; + result[j + 2] = b * mul; + result[j + 3] = a * mul; + } + } + } else if (format.data_type == JXL_TYPE_UINT16) { + double mul = 1.0 / 65535.0; // Multiplier to bring to 0-1.0 range + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels * 2; + double r, g, b, a; + if (format.endianness == JXL_BIG_ENDIAN) { + r = (pixels[i + 0] << 8) + pixels[i + 1]; + g = gray ? 
r : (pixels[i + 2] << 8) + pixels[i + 3]; + b = gray ? r : (pixels[i + 4] << 8) + pixels[i + 5]; + a = alpha ? (pixels[i + num_channels * 2 - 2] << 8) + + pixels[i + num_channels * 2 - 1] + : 65535; + } else { + r = (pixels[i + 1] << 8) + pixels[i + 0]; + g = gray ? r : (pixels[i + 3] << 8) + pixels[i + 2]; + b = gray ? r : (pixels[i + 5] << 8) + pixels[i + 4]; + a = alpha ? (pixels[i + num_channels * 2 - 1] << 8) + + pixels[i + num_channels * 2 - 2] + : 65535; + } + result[j + 0] = r * mul; + result[j + 1] = g * mul; + result[j + 2] = b * mul; + result[j + 3] = a * mul; + } + } + } else if (format.data_type == JXL_TYPE_UINT32) { + double mul = 1.0 / 4294967295.0; // Multiplier to bring to 0-1.0 range + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels * 4; + double r, g, b, a; + if (format.endianness == JXL_BIG_ENDIAN) { + r = LoadBE32(pixels + i); + g = gray ? r : LoadBE32(pixels + i + 4); + b = gray ? r : LoadBE32(pixels + i + 8); + a = alpha ? LoadBE32(pixels + i + num_channels * 2 - 4) : 4294967295; + + } else { + r = LoadLE32(pixels + i); + g = gray ? r : LoadLE32(pixels + i + 4); + b = gray ? r : LoadLE32(pixels + i + 8); + a = alpha ? LoadLE32(pixels + i + num_channels * 2 - 4) : 4294967295; + } + result[j + 0] = r * mul; + result[j + 1] = g * mul; + result[j + 2] = b * mul; + result[j + 3] = a * mul; + } + } + } else if (format.data_type == JXL_TYPE_FLOAT) { + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels * 4; + double r, g, b, a; + if (format.endianness == JXL_BIG_ENDIAN) { + r = LoadBEFloat(pixels + i); + g = gray ? r : LoadBEFloat(pixels + i + 4); + b = gray ? r : LoadBEFloat(pixels + i + 8); + a = alpha ? LoadBEFloat(pixels + i + num_channels * 4 - 4) : 1.0; + } else { + r = LoadLEFloat(pixels + i); + g = gray ? 
r : LoadLEFloat(pixels + i + 4); + b = gray ? r : LoadLEFloat(pixels + i + 8); + a = alpha ? LoadLEFloat(pixels + i + num_channels * 4 - 4) : 1.0; + } + result[j + 0] = r; + result[j + 1] = g; + result[j + 2] = b; + result[j + 3] = a; + } + } + } else if (format.data_type == JXL_TYPE_FLOAT16) { + for (size_t y = 0; y < ysize; ++y) { + for (size_t x = 0; x < xsize; ++x) { + size_t j = (y * xsize + x) * 4; + size_t i = y * stride + x * num_channels * 2; + double r, g, b, a; + if (format.endianness == JXL_BIG_ENDIAN) { + r = LoadBEFloat16(pixels + i); + g = gray ? r : LoadBEFloat16(pixels + i + 2); + b = gray ? r : LoadBEFloat16(pixels + i + 4); + a = alpha ? LoadBEFloat16(pixels + i + num_channels * 2 - 2) : 1.0; + } else { + r = LoadLEFloat16(pixels + i); + g = gray ? r : LoadLEFloat16(pixels + i + 2); + b = gray ? r : LoadLEFloat16(pixels + i + 4); + a = alpha ? LoadLEFloat16(pixels + i + num_channels * 2 - 2) : 1.0; + } + result[j + 0] = r; + result[j + 1] = g; + result[j + 2] = b; + result[j + 3] = a; + } + } + } else { + JXL_ASSERT(false); // Unsupported type + } + return result; +} + +// Returns amount of pixels which differ between the two pictures. Image b is +// the image after roundtrip after roundtrip, image a before roundtrip. There +// are more strict requirements for the alpha channel and grayscale values of +// the output image. +size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize, + size_t ysize, const JxlPixelFormat& format_a, + const JxlPixelFormat& format_b) { + // Convert both images to equal full precision for comparison. 
+ std::vector a_full = ConvertToRGBA32(a, xsize, ysize, format_a); + std::vector b_full = ConvertToRGBA32(b, xsize, ysize, format_b); + bool gray_a = format_a.num_channels < 3; + bool gray_b = format_b.num_channels < 3; + bool alpha_a = !(format_a.num_channels & 1); + bool alpha_b = !(format_b.num_channels & 1); + size_t bits_a = GetPrecision(format_a.data_type); + size_t bits_b = GetPrecision(format_b.data_type); + size_t bits = std::min(bits_a, bits_b); + // How much distance is allowed in case of pixels with lower bit depths, given + // that the double precision float images use range 0-1.0. + // E.g. in case of 1-bit this is 0.5 since 0.499 must map to 0 and 0.501 must + // map to 1. + double precision = 0.5 / ((1ull << bits) - 1ull); + if (format_a.data_type == JXL_TYPE_FLOAT16 || + format_b.data_type == JXL_TYPE_FLOAT16) { + // Lower the precision for float16, because it currently looks like the + // scalar and wasm implementations of hwy have 1 less bit of precision + // than the x86 implementations. + // TODO(lode): Set the required precision back to 11 bits when possible. + precision = 0.5 / ((1ull << (bits - 1)) - 1ull); + } + size_t numdiff = 0; + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + size_t i = (y * xsize + x) * 4; + bool ok = true; + if (gray_a || gray_b) { + if (!Near(a_full[i + 0], b_full[i + 0], precision)) ok = false; + // If the input was grayscale and the output not, then the output must + // have all channels equal. + if (gray_a && b_full[i + 0] != b_full[i + 1] && + b_full[i + 2] != b_full[i + 2]) { + ok = false; + } + } else { + if (!Near(a_full[i + 0], b_full[i + 0], precision) || + !Near(a_full[i + 1], b_full[i + 1], precision) || + !Near(a_full[i + 2], b_full[i + 2], precision)) { + ok = false; + } + } + if (alpha_a && alpha_b) { + if (!Near(a_full[i + 3], b_full[i + 3], precision)) ok = false; + } else { + // If the input had no alpha channel, the output should be opaque + // after roundtrip. 
+ if (alpha_b && !Near(1.0, b_full[i + 3], precision)) ok = false; + } + if (!ok) numdiff++; + } + } + return numdiff; +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +TEST(DecodeTest, JxlSignatureCheckTest) { + std::vector>> tests = { + // No JPEGXL header starts with 'a'. + {JXL_SIG_INVALID, {'a'}}, + {JXL_SIG_INVALID, {'a', 'b', 'c', 'd', 'e', 'f'}}, + + // Empty file is not enough bytes. + {JXL_SIG_NOT_ENOUGH_BYTES, {}}, + + // JPEGXL headers. + {JXL_SIG_NOT_ENOUGH_BYTES, {0xff}}, // Part of a signature. + {JXL_SIG_INVALID, {0xff, 0xD8}}, // JPEG-1 + {JXL_SIG_CODESTREAM, {0xff, 0x0a}}, + + // JPEGXL container file. + {JXL_SIG_CONTAINER, + {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0xA}}, + // Ending with invalid byte. + {JXL_SIG_INVALID, {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0}}, + // Part of signature. + {JXL_SIG_NOT_ENOUGH_BYTES, + {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87}}, + {JXL_SIG_NOT_ENOUGH_BYTES, {0}}, + }; + for (const auto& test : tests) { + EXPECT_EQ(test.first, + JxlSignatureCheck(test.second.data(), test.second.size())) + << "Where test data is " << ::testing::PrintToString(test.second); + } +} + +TEST(DecodeTest, DefaultAllocTest) { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_NE(nullptr, dec); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, CustomAllocTest) { + struct CalledCounters { + int allocs = 0; + int frees = 0; + } counters; + + JxlMemoryManager mm; + mm.opaque = &counters; + mm.alloc = [](void* opaque, size_t size) { + reinterpret_cast(opaque)->allocs++; + return malloc(size); + }; + mm.free = [](void* opaque, void* address) { + reinterpret_cast(opaque)->frees++; + free(address); + }; + + JxlDecoder* dec = JxlDecoderCreate(&mm); + EXPECT_NE(nullptr, dec); + EXPECT_LE(1, counters.allocs); + EXPECT_EQ(0, counters.frees); + JxlDecoderDestroy(dec); + EXPECT_LE(1, counters.frees); +} + +// TODO(lode): add multi-threaded test when 
multithreaded pixel decoding from +// API is implemented. +TEST(DecodeTest, DefaultParallelRunnerTest) { + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_NE(nullptr, dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, nullptr, nullptr)); + JxlDecoderDestroy(dec); +} + +// Creates the header of a JPEG XL file with various custom parameters for +// testing. +// xsize, ysize: image dimensions to store in the SizeHeader, max 512. +// bits_per_sample: bit depth of the unsigned integer samples in the metadata. +// orientation: image orientation to set in the metadata. +// alpha_bits: if non-0, alpha extra channel bits to set in the metadata. Also +// gives the alpha channel the name "alpha_test". +// have_container: add box container format around the codestream. +// metadata_default: if true, ImageMetadata is set to default and +// bits_per_sample, orientation, alpha_bits and xyb_encoded are ignored. +// insert_extra_box: insert an extra box before the codestream box, making the +// header farther from the front than is ideal. Only used if have_container. 
+std::vector GetTestHeader(size_t xsize, size_t ysize, + size_t bits_per_sample, size_t orientation, + size_t alpha_bits, bool xyb_encoded, + bool have_container, bool metadata_default, + bool insert_extra_box, + const jxl::PaddedBytes& icc_profile) { + jxl::BitWriter writer; + jxl::BitWriter::Allotment allotment(&writer, 65536); // Large enough + + if (have_container) { + const std::vector signature_box = {0, 0, 0, 0xc, 'J', 'X', + 'L', ' ', 0xd, 0xa, 0x87, 0xa}; + const std::vector filetype_box = { + 0, 0, 0, 0x14, 'f', 't', 'y', 'p', 'j', 'x', + 'l', ' ', 0, 0, 0, 0, 'j', 'x', 'l', ' '}; + const std::vector extra_box_header = {0, 0, 0, 0xff, + 't', 'e', 's', 't'}; + // Beginning of codestream box, with an arbitrary size certainly large + // enough to contain the header + const std::vector codestream_box_header = {0, 0, 0, 0xff, + 'j', 'x', 'l', 'c'}; + + for (size_t i = 0; i < signature_box.size(); i++) { + writer.Write(8, signature_box[i]); + } + for (size_t i = 0; i < filetype_box.size(); i++) { + writer.Write(8, filetype_box[i]); + } + if (insert_extra_box) { + for (size_t i = 0; i < extra_box_header.size(); i++) { + writer.Write(8, extra_box_header[i]); + } + for (size_t i = 0; i < 255 - 8; i++) { + writer.Write(8, 0); + } + } + for (size_t i = 0; i < codestream_box_header.size(); i++) { + writer.Write(8, codestream_box_header[i]); + } + } + + // JXL signature + writer.Write(8, 0xff); + writer.Write(8, 0x0a); + + // SizeHeader + jxl::CodecMetadata metadata; + EXPECT_TRUE(metadata.size.Set(xsize, ysize)); + EXPECT_TRUE(WriteSizeHeader(metadata.size, &writer, 0, nullptr)); + + if (!metadata_default) { + metadata.m.SetUintSamples(bits_per_sample); + metadata.m.orientation = orientation; + metadata.m.SetAlphaBits(alpha_bits); + metadata.m.xyb_encoded = xyb_encoded; + if (alpha_bits != 0) { + metadata.m.extra_channel_info[0].name = "alpha_test"; + } + } + + if (!icc_profile.empty()) { + jxl::PaddedBytes copy = icc_profile; + 
EXPECT_TRUE(metadata.m.color_encoding.SetICC(std::move(copy))); + } + + EXPECT_TRUE(jxl::Bundle::Write(metadata.m, &writer, 0, nullptr)); + metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded; + EXPECT_TRUE(jxl::Bundle::Write(metadata.transform_data, &writer, 0, nullptr)); + + if (!icc_profile.empty()) { + EXPECT_TRUE(metadata.m.color_encoding.WantICC()); + EXPECT_TRUE(jxl::WriteICC(icc_profile, &writer, 0, nullptr)); + } + + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + return std::vector( + writer.GetSpan().data(), + writer.GetSpan().data() + writer.GetSpan().size()); +} + +TEST(DecodeTest, BasicInfoTest) { + size_t xsize[2] = {50, 33}; + size_t ysize[2] = {50, 77}; + size_t bits_per_sample[2] = {8, 23}; + size_t orientation[2] = {3, 5}; + size_t alpha_bits[2] = {0, 8}; + size_t have_container[2] = {0, 1}; + bool xyb_encoded = false; + + std::vector> test_samples; + // Test with direct codestream + test_samples.push_back(GetTestHeader( + xsize[0], ysize[0], bits_per_sample[0], orientation[0], alpha_bits[0], + xyb_encoded, have_container[0], /*metadata_default=*/false, + /*insert_extra_box=*/false, {})); + // Test with container and different parameters + test_samples.push_back(GetTestHeader( + xsize[1], ysize[1], bits_per_sample[1], orientation[1], alpha_bits[1], + xyb_encoded, have_container[1], /*metadata_default=*/false, + /*insert_extra_box=*/false, {})); + + for (size_t i = 0; i < test_samples.size(); ++i) { + const std::vector& data = test_samples[i]; + // Test decoding too small header first, until we reach the final byte. + for (size_t size = 0; size <= data.size(); ++size) { + // Test with a new decoder for each tested byte size. 
+ JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + const uint8_t* next_in = data.data(); + size_t avail_in = size; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + + JxlBasicInfo info; + bool have_basic_info = !JxlDecoderGetBasicInfo(dec, &info); + + if (size == data.size()) { + EXPECT_EQ(JXL_DEC_BASIC_INFO, status); + + // All header bytes given so the decoder must have the basic info. + EXPECT_EQ(true, have_basic_info); + EXPECT_EQ(have_container[i], info.have_container); + EXPECT_EQ(alpha_bits[i], info.alpha_bits); + // Orientations 5..8 swap the dimensions + if (orientation[i] >= 5) { + EXPECT_EQ(xsize[i], info.ysize); + EXPECT_EQ(ysize[i], info.xsize); + } else { + EXPECT_EQ(xsize[i], info.xsize); + EXPECT_EQ(ysize[i], info.ysize); + } + // The API should set the orientation to identity by default since it + // already applies the transformation internally by default. + EXPECT_EQ(1, info.orientation); + + EXPECT_EQ(3, info.num_color_channels); + + if (alpha_bits[i] != 0) { + // Expect an extra channel + EXPECT_EQ(1, info.num_extra_channels); + JxlExtraChannelInfo extra; + EXPECT_EQ(0, JxlDecoderGetExtraChannelInfo(dec, 0, &extra)); + EXPECT_EQ(alpha_bits[i], extra.bits_per_sample); + EXPECT_EQ(JXL_CHANNEL_ALPHA, extra.type); + EXPECT_EQ(0, extra.alpha_associated); + // Verify the name "alpha_test" given to the alpha channel + EXPECT_EQ(10, extra.name_length); + char name[11]; + EXPECT_EQ(0, + JxlDecoderGetExtraChannelName(dec, 0, name, sizeof(name))); + EXPECT_EQ(std::string("alpha_test"), std::string(name)); + } else { + EXPECT_EQ(0, info.num_extra_channels); + } + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + } else { + // If we did not give the full header, the basic info should not be + // available. 
Allow a few bytes of slack due to some bits for default + // opsinmatrix/extension bits. + if (size + 2 < data.size()) { + EXPECT_EQ(false, have_basic_info); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, status); + } + } + + // Test that decoder doesn't allow setting a setting required at beginning + // unless it's reset + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + JxlDecoderReset(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + + JxlDecoderDestroy(dec); + } + } +} + +TEST(DecodeTest, BufferSizeTest) { + size_t xsize = 33; + size_t ysize = 77; + size_t bits_per_sample = 8; + size_t orientation = 1; + size_t alpha_bits = 8; + bool have_container = false; + bool xyb_encoded = false; + + std::vector header = + GetTestHeader(xsize, ysize, bits_per_sample, orientation, alpha_bits, + xyb_encoded, have_container, /*metadata_default=*/false, + /*insert_extra_box=*/false, {}); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + const uint8_t* next_in = header.data(); + size_t avail_in = header.size(); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + EXPECT_EQ(JXL_DEC_BASIC_INFO, status); + + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + + JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + size_t image_out_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &image_out_size)); + EXPECT_EQ(xsize * ysize * 4, image_out_size); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, BasicInfoSizeHintTest) { + // Test on a file where the size hint is too small initially due to inserting + // a box before the codestream (something that is normally not recommended) + size_t xsize = 50; + size_t ysize = 
50; + size_t bits_per_sample = 16; + size_t orientation = 1; + size_t alpha_bits = 0; + bool xyb_encoded = false; + std::vector data = GetTestHeader( + xsize, ysize, bits_per_sample, orientation, alpha_bits, xyb_encoded, + /*have_container=*/true, /*metadata_default=*/false, + /*insert_extra_box=*/true, {}); + + JxlDecoderStatus status; + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO)); + + size_t hint0 = JxlDecoderSizeHintBasicInfo(dec); + // Test that the test works as intended: we construct a file on purpose to + // be larger than the first hint by having that extra box. + EXPECT_LT(hint0, data.size()); + const uint8_t* next_in = data.data(); + // Do as if we have only as many bytes as indicated by the hint available + size_t avail_in = std::min(hint0, data.size()); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + status = JxlDecoderProcessInput(dec); + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, status); + // Basic info cannot be available yet due to the extra inserted box. + EXPECT_EQ(false, !JxlDecoderGetBasicInfo(dec, nullptr)); + + size_t num_read = avail_in - JxlDecoderReleaseInput(dec); + EXPECT_LT(num_read, data.size()); + + size_t hint1 = JxlDecoderSizeHintBasicInfo(dec); + // The hint must be larger than the previous hint (taking already processed + // bytes into account, the hint is a hint for the next avail_in) since the + // decoder now knows there is a box in between. + EXPECT_GT(hint1 + num_read, hint0); + avail_in = std::min(hint1, data.size() - num_read); + next_in += num_read; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + status = JxlDecoderProcessInput(dec); + EXPECT_EQ(JXL_DEC_BASIC_INFO, status); + JxlBasicInfo info; + // We should have the basic info now, since we only added one box in-between, + // and the decoder should have known its size, its implementation can return + // a correct hint. 
+ EXPECT_EQ(true, !JxlDecoderGetBasicInfo(dec, &info)); + + // Also test if the basic info is correct. + EXPECT_EQ(1, info.have_container); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + EXPECT_EQ(orientation, info.orientation); + EXPECT_EQ(bits_per_sample, info.bits_per_sample); + + JxlDecoderDestroy(dec); +} + +std::vector GetIccTestHeader(const jxl::PaddedBytes& icc_profile, + bool xyb_encoded) { + size_t xsize = 50; + size_t ysize = 50; + size_t bits_per_sample = 16; + size_t orientation = 1; + size_t alpha_bits = 0; + return GetTestHeader(xsize, ysize, bits_per_sample, orientation, alpha_bits, + xyb_encoded, + /*have_container=*/false, /*metadata_default=*/false, + /*insert_extra_box=*/false, icc_profile); +} + +// Tests the case where pixels and metadata ICC profile are the same +TEST(DecodeTest, IccProfileTestOriginal) { + jxl::PaddedBytes icc_profile = GetIccTestProfile(); + bool xyb_encoded = false; + std::vector data = GetIccTestHeader(icc_profile, xyb_encoded); + JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size())); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + + // Expect the opposite of xyb_encoded for uses_original_profile + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(JXL_TRUE, info.uses_original_profile); + + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + + // the encoded color profile expected to be not available, since the image + // has an ICC profile instead + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr)); + + size_t dec_profile_size; + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, 
&format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size)); + + // Check that can get return status with NULL size + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr)); + + // The profiles must be equal. This requires they have equal size, and if + // they do, we can get the profile and compare the contents. + EXPECT_EQ(icc_profile.size(), dec_profile_size); + if (icc_profile.size() == dec_profile_size) { + jxl::PaddedBytes icc_profile2(icc_profile.size()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsICCProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, + icc_profile2.data(), icc_profile2.size())); + EXPECT_EQ(icc_profile, icc_profile2); + } + + // the data is not xyb_encoded, so same result expected for the pixel data + // color profile + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + &dec_profile_size)); + EXPECT_EQ(icc_profile.size(), dec_profile_size); + + JxlDecoderDestroy(dec); +} + +// Tests the case where pixels and metadata ICC profile are different +TEST(DecodeTest, IccProfileTestXybEncoded) { + jxl::PaddedBytes icc_profile = GetIccTestProfile(); + bool xyb_encoded = true; + std::vector data = GetIccTestHeader(icc_profile, xyb_encoded); + JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + JxlPixelFormat format_int = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size())); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + + // Expect the opposite of xyb_encoded for uses_original_profile + JxlBasicInfo info; + 
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(JXL_FALSE, info.uses_original_profile); + + EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec)); + + // the encoded color profile expected to be not available, since the image + // has an ICC profile instead + EXPECT_EQ(JXL_DEC_ERROR, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr)); + + // Check that can get return status with NULL size + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr)); + + size_t dec_profile_size; + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size)); + + // The profiles must be equal. This requires they have equal size, and if + // they do, we can get the profile and compare the contents. + EXPECT_EQ(icc_profile.size(), dec_profile_size); + if (icc_profile.size() == dec_profile_size) { + jxl::PaddedBytes icc_profile2(icc_profile.size()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsICCProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, + icc_profile2.data(), icc_profile2.size())); + EXPECT_EQ(icc_profile, icc_profile2); + } + + // Data is xyb_encoded, so the data profile is a different profile, encoded + // as structured profile. + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr)); + JxlColorEncoding pixel_encoding; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); + EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries); + // The API returns LINEAR by default when the colorspace cannot be represented + // by enum values. + EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function); + + // Test the same but with integer format. 
+ EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format_int, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); + EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries); + EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function); + + // Test after setting the preferred color profile to non-linear sRGB: + // for XYB images with ICC profile, this setting is expected to take effect. + jxl::ColorEncoding temp_jxl_srgb = jxl::ColorEncoding::SRGB(false); + JxlColorEncoding pixel_encoding_srgb; + ConvertInternalToExternalColorEncoding(temp_jxl_srgb, &pixel_encoding_srgb); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_srgb)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetColorAsEncodedProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); + EXPECT_EQ(JXL_TRANSFER_FUNCTION_SRGB, pixel_encoding.transfer_function); + + // The decoder can also output this as a generated ICC profile anyway, and + // we're certain that it will differ from the above defined profile since + // the sRGB data should not have swapped R/G/B primaries. + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + &dec_profile_size)); + // We don't need to dictate exactly what size the generated ICC profile + // must be (since there are many ways to represent the same color space), + // but it should not be zero. + EXPECT_NE(0, dec_profile_size); + if (0 != dec_profile_size) { + jxl::PaddedBytes icc_profile2(dec_profile_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile( + dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, + icc_profile2.data(), icc_profile2.size())); + // expected not equal + EXPECT_NE(icc_profile, icc_profile2); + } + + JxlDecoderDestroy(dec); +} + +// Test decoding ICC from partial files byte for byte. 
+// This test must pass also if JXL_CRASH_ON_ERROR is enabled, that is, the +// decoding of the ANS histogram and stream of the encoded ICC profile must also +// handle the case of not enough input bytes with StatusCode::kNotEnoughBytes +// rather than fatal error status codes. +TEST(DecodeTest, ICCPartialTest) { + jxl::PaddedBytes icc_profile = GetIccTestProfile(); + std::vector data = GetIccTestHeader(icc_profile, false); + JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + const uint8_t* next_in = data.data(); + size_t avail_in = 0; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING)); + + bool seen_basic_info = false; + bool seen_color_encoding = false; + size_t total_size = 0; + + for (;;) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + if (status == JXL_DEC_NEED_MORE_INPUT) { + if (total_size >= data.size()) { + // End of partial codestream with codestream headers and ICC profile + // reached, it should not require more input since full image is not + // requested + FAIL(); + break; + } + size_t increment = 1; + if (total_size + increment > data.size()) { + increment = data.size() - total_size; + } + total_size += increment; + avail_in += increment; + } else if (status == JXL_DEC_BASIC_INFO) { + EXPECT_FALSE(seen_basic_info); + seen_basic_info = true; + } else if (status == JXL_DEC_COLOR_ENCODING) { + EXPECT_TRUE(seen_basic_info); + EXPECT_FALSE(seen_color_encoding); + seen_color_encoding = true; + + // Sanity check that the ICC profile was decoded correctly + size_t dec_profile_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderGetICCProfileSize(dec, &format, + JXL_COLOR_PROFILE_TARGET_ORIGINAL, + 
&dec_profile_size)); + EXPECT_EQ(icc_profile.size(), dec_profile_size); + + } else if (status == JXL_DEC_SUCCESS) { + EXPECT_TRUE(seen_color_encoding); + break; + } else { + // We do not expect any other events or errors + FAIL(); + break; + } + } + + EXPECT_TRUE(seen_basic_info); + EXPECT_TRUE(seen_color_encoding); + + JxlDecoderDestroy(dec); +} + +struct PixelTestConfig { + // Input image definition. + bool grayscale; + bool include_alpha; + size_t xsize; + size_t ysize; + bool add_preview; + // Output format. + JxlEndianness endianness; + JxlDataType data_type; + uint32_t output_channels; + // Container options. + CodeStreamBoxFormat add_container; + // Decoding mode. + bool use_callback; + bool set_buffer_early; + bool use_resizable_runner; + // Exif orientation, 1-8 + JxlOrientation orientation; + bool keep_orientation; +}; + +class DecodeTestParam : public ::testing::TestWithParam {}; + +TEST_P(DecodeTestParam, PixelTest) { + PixelTestConfig config = GetParam(); + JxlDecoder* dec = JxlDecoderCreate(NULL); + + if (config.keep_orientation) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetKeepOrientation(dec, JXL_TRUE)); + } + + size_t num_pixels = config.xsize * config.ysize; + uint32_t orig_channels = + (config.grayscale ? 1 : 3) + (config.include_alpha ? 1 : 0); + std::vector pixels = + jxl::test::GetSomeTestImage(config.xsize, config.ysize, orig_channels, 0); + JxlPixelFormat format_orig = {orig_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, + 0}; + jxl::CompressParams cparams; + // Lossless to verify pixels exactly after roundtrip. 
+ cparams.SetLossless(); + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), config.xsize, + config.ysize, orig_channels, cparams, config.add_container, + config.orientation, config.add_preview); + + JxlPixelFormat format = {config.output_channels, config.data_type, + config.endianness, 0}; + + bool swap_xy = !config.keep_orientation && (config.orientation > 4); + size_t xsize = swap_xy ? config.ysize : config.xsize; + size_t ysize = swap_xy ? config.xsize : config.ysize; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, config.use_callback, config.set_buffer_early, + config.use_resizable_runner); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * config.output_channels * + GetDataBits(config.data_type) / jxl::kBitsPerByte, + pixels2.size()); + + // If an orientation transformation is expected, to compare the pixels, also + // apply this transformation to the original pixels. ConvertToExternal is + // used to achieve this, with a temporary conversion to CodecInOut and back. 
+ if (config.orientation > 1 && !config.keep_orientation) { + jxl::Span bytes(pixels.data(), pixels.size()); + jxl::ColorEncoding color_encoding = + jxl::ColorEncoding::SRGB(config.grayscale); + + jxl::CodecInOut io; + if (config.include_alpha) io.metadata.m.SetAlphaBits(16); + io.SetSize(config.xsize, config.ysize); + + EXPECT_TRUE(ConvertFromExternal( + bytes, config.xsize, config.ysize, color_encoding, config.include_alpha, + /*alpha_is_premultiplied=*/false, 16, JXL_BIG_ENDIAN, + /*flipped_y=*/false, nullptr, &io.Main())); + + for (size_t i = 0; i < pixels.size(); i++) pixels[i] = 0; + EXPECT_TRUE(ConvertToExternal( + io.Main(), 16, + /*float_out=*/false, orig_channels, JXL_BIG_ENDIAN, + xsize * 2 * orig_channels, nullptr, pixels.data(), pixels.size(), + nullptr, nullptr, static_cast(config.orientation))); + } + + EXPECT_EQ(0, ComparePixels(pixels.data(), pixels2.data(), xsize, ysize, + format_orig, format)); + + JxlDecoderDestroy(dec); +} + +std::vector GeneratePixelTests() { + std::vector all_tests; + struct ChannelInfo { + bool grayscale; + bool include_alpha; + size_t output_channels; + }; + ChannelInfo ch_info[] = { + {false, true, 4}, // RGBA -> RGBA + {true, false, 1}, // G -> G + {true, true, 1}, // GA -> G + {true, true, 2}, // GA -> GA + {false, false, 3}, // RGB -> RGB + {false, true, 3}, // RGBA -> RGB + {false, false, 4}, // RGB -> RGBA + }; + + struct OutputFormat { + JxlEndianness endianness; + JxlDataType data_type; + }; + OutputFormat out_formats[] = { + {JXL_NATIVE_ENDIAN, JXL_TYPE_UINT8}, + {JXL_LITTLE_ENDIAN, JXL_TYPE_UINT16}, + {JXL_BIG_ENDIAN, JXL_TYPE_UINT16}, + {JXL_NATIVE_ENDIAN, JXL_TYPE_FLOAT16}, + {JXL_LITTLE_ENDIAN, JXL_TYPE_FLOAT}, + {JXL_BIG_ENDIAN, JXL_TYPE_FLOAT}, + }; + + auto make_test = [&](ChannelInfo ch, size_t xsize, size_t ysize, bool preview, + CodeStreamBoxFormat box, JxlOrientation orientation, + bool keep_orientation, OutputFormat format, + bool use_callback, bool set_buffer_early, + bool resizable_runner) { + 
PixelTestConfig c; + c.grayscale = ch.grayscale; + c.include_alpha = ch.include_alpha; + c.add_preview = preview; + c.xsize = xsize; + c.ysize = ysize; + c.add_container = (CodeStreamBoxFormat)box; + c.output_channels = ch.output_channels; + c.data_type = format.data_type; + c.endianness = format.endianness; + c.use_callback = use_callback; + c.set_buffer_early = set_buffer_early; + c.use_resizable_runner = resizable_runner; + c.orientation = orientation; + c.keep_orientation = keep_orientation; + all_tests.push_back(c); + }; + + // Test output formats and methods. + for (ChannelInfo ch : ch_info) { + for (int use_callback = 0; use_callback <= 1; use_callback++) { + for (OutputFormat fmt : out_formats) { + make_test(ch, 301, 33, /*add_preview=*/false, + CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, fmt, use_callback, + /*set_buffer_early=*/false, /*resizable_runner=*/false); + } + } + } + // Test codestream formats. + for (size_t box = 1; box < kCSBF_NUM_ENTRIES; ++box) { + make_test(ch_info[0], 77, 33, /*add_preview=*/false, + (CodeStreamBoxFormat)box, JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, + /*set_buffer_early=*/false, /*resizable_runner=*/false); + } + // Test previews. + for (int add_preview = 0; add_preview <= 1; add_preview++) { + make_test(ch_info[0], 77, 33, add_preview, CodeStreamBoxFormat::kCSBF_None, + JXL_ORIENT_IDENTITY, /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/false, + /*resizable_runner=*/false); + } + // Test setting buffers early. 
+ make_test(ch_info[0], 300, 33, /*add_preview=*/false, + CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/true, + /*resizable_runner=*/false); + + // Test using the resizable runner + for (size_t i = 0; i < 4; i++) { + make_test(ch_info[0], 300 << i, 33 << i, /*add_preview=*/false, + CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY, + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/false, + /*resizable_runner=*/true); + } + + // Test orientations. + for (int orientation = 1; orientation <= 8; ++orientation) { + make_test(ch_info[0], 280, 12, /*add_preview=*/false, + CodeStreamBoxFormat::kCSBF_None, + static_cast(orientation), + /*keep_orientation=*/false, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/true, + /*resizable_runner=*/false); + make_test(ch_info[0], 280, 12, /*add_preview=*/false, + CodeStreamBoxFormat::kCSBF_None, + static_cast(orientation), + /*keep_orientation=*/true, out_formats[0], + /*use_callback=*/false, /*set_buffer_early=*/true, + /*resizable_runner=*/false); + } + + return all_tests; +} + +std::ostream& operator<<(std::ostream& os, const PixelTestConfig& c) { + os << c.xsize << "x" << c.ysize; + const char* colors[] = {"", "G", "GA", "RGB", "RGBA"}; + os << colors[(c.grayscale ? 1 : 3) + (c.include_alpha ? 
1 : 0)]; + os << "to"; + os << colors[c.output_channels]; + switch (c.data_type) { + case JXL_TYPE_UINT8: + os << "u8"; + break; + case JXL_TYPE_UINT16: + os << "u16"; + break; + case JXL_TYPE_FLOAT: + os << "f32"; + break; + case JXL_TYPE_FLOAT16: + os << "f16"; + break; + case JXL_TYPE_UINT32: + os << "u32"; + break; + case JXL_TYPE_BOOLEAN: + os << "b"; + break; + }; + if (GetDataBits(c.data_type) > jxl::kBitsPerByte) { + if (c.endianness == JXL_NATIVE_ENDIAN) { + // add nothing + } else if (c.endianness == JXL_BIG_ENDIAN) { + os << "BE"; + } else if (c.endianness == JXL_LITTLE_ENDIAN) { + os << "LE"; + } + } + if (c.add_container != CodeStreamBoxFormat::kCSBF_None) { + os << "Box"; + os << (size_t)c.add_container; + } + if (c.add_preview) os << "Preview"; + if (c.use_callback) os << "Callback"; + if (c.set_buffer_early) os << "EarlyBuffer"; + if (c.use_resizable_runner) os << "ResizableRunner"; + if (c.orientation != 1) os << "O" << c.orientation; + if (c.keep_orientation) os << "Keep"; + return os; +} + +std::string PixelTestDescription( + const testing::TestParamInfo& info) { + std::stringstream name; + name << info.param; + return name.str(); +} + +JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeTest, DecodeTestParam, + testing::ValuesIn(GeneratePixelTests()), + PixelTestDescription); + +TEST(DecodeTest, PixelTestWithICCProfileLossless) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::CompressParams cparams; + // Lossless to verify pixels exactly after roundtrip. + cparams.SetLossless(); + // For variation: some have container and no preview, others have preview + // and no container. 
+ jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + cparams, kCSBF_None, JXL_ORIENT_IDENTITY, false, true); + + for (uint32_t channels = 3; channels <= 4; ++channels) { + { + JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/false, + /*use_resizable_runner=*/false); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels, pixels2.size()); + EXPECT_EQ(0, ComparePixels(pixels.data(), pixels2.data(), xsize, ysize, + format_orig, format)); + } + { + JxlPixelFormat format = {channels, JXL_TYPE_UINT16, JXL_LITTLE_ENDIAN, 0}; + + // Use the callback and set the buffer early for one of the pixel formats. + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/true, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels * 2, pixels2.size()); + EXPECT_EQ(0, ComparePixels(pixels.data(), pixels2.data(), xsize, ysize, + format_orig, format)); + } + + { + JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/false, + /*use_resizable_runner=*/false); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels * 4, pixels2.size()); + EXPECT_EQ(0, ComparePixels(pixels.data(), pixels2.data(), xsize, ysize, + format_orig, format)); + } + } + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, PixelTestWithICCProfileLossy) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + JxlPixelFormat format_orig = {3, 
JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::CompressParams cparams; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + cparams, kCSBF_None, JXL_ORIENT_IDENTITY, /*add_preview=*/false, + /*add_icc_profile=*/true); + uint32_t channels = 3; + + JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels * 4, pixels2.size()); + + // The input pixels use the profile matching GetIccTestProfile, since we set + // add_icc_profile for CreateTestJXLCodestream to true. + jxl::ColorEncoding color_encoding0; + EXPECT_TRUE(color_encoding0.SetICC(GetIccTestProfile())); + jxl::Span span0(pixels.data(), pixels.size()); + jxl::CodecInOut io0; + io0.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal( + span0, xsize, ysize, color_encoding0, + /*has_alpha=*/false, false, 16, format_orig.endianness, + /*flipped_y=*/false, /*pool=*/nullptr, &io0.Main())); + + // The output pixels are expected to be in the same colorspace as the input + // profile, as the profile can be represented by enum values. 
+ jxl::ColorEncoding color_encoding1 = color_encoding0; + jxl::Span span1(pixels2.data(), pixels2.size()); + jxl::CodecInOut io1; + io1.SetSize(xsize, ysize); + EXPECT_TRUE( + ConvertFromExternal(span1, xsize, ysize, color_encoding1, + /*has_alpha=*/false, false, 32, format.endianness, + /*flipped_y=*/false, /*pool=*/nullptr, &io1.Main())); + + jxl::ButteraugliParams ba; + EXPECT_LE(ButteraugliDistance(io0, io1, ba, /*distmap=*/nullptr, nullptr), + 2.4f); + + JxlDecoderDestroy(dec); +} + +// Tests the case of lossy sRGB image without alpha channel, decoded to RGB8 +// and to RGBA8 +TEST(DecodeTest, PixelTestOpaqueSrgbLossy) { + for (unsigned channels = 3; channels <= 4; channels++) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + + size_t xsize = 123, ysize = 77; + size_t num_pixels = xsize * ysize; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::CompressParams cparams; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + cparams, kCSBF_None, JXL_ORIENT_IDENTITY, /*add_preview=*/false, + /*add_icc_profile=*/false); + + JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/true, /*set_buffer_early=*/false, + /*use_resizable_runner=*/false); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels, pixels2.size()); + + // The input pixels are interpreted as sRGB here, since we set + // add_icc_profile for CreateTestJXLCodestream to false. 
+ jxl::ColorEncoding color_encoding0 = jxl::ColorEncoding::SRGB(false); + jxl::Span span0(pixels.data(), pixels.size()); + jxl::CodecInOut io0; + io0.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal( + span0, xsize, ysize, color_encoding0, + /*has_alpha=*/false, false, 16, format_orig.endianness, + /*flipped_y=*/false, /*pool=*/nullptr, &io0.Main())); + + jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false); + jxl::Span span1(pixels2.data(), pixels2.size()); + jxl::CodecInOut io1; + if (channels == 4) { + io1.metadata.m.SetAlphaBits(8); + io1.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal( + span1, xsize, ysize, color_encoding1, + /*has_alpha=*/true, false, 8, format.endianness, + /*flipped_y=*/false, /*pool=*/nullptr, &io1.Main())); + io1.metadata.m.SetAlphaBits(0); + io1.Main().ClearExtraChannels(); + } else { + EXPECT_TRUE(ConvertFromExternal( + span1, xsize, ysize, color_encoding1, + /*has_alpha=*/false, false, 8, format.endianness, + /*flipped_y=*/false, /*pool=*/nullptr, &io1.Main())); + } + + jxl::ButteraugliParams ba; + EXPECT_LE(ButteraugliDistance(io0, io1, ba, /*distmap=*/nullptr, nullptr), + 2.4f); + + JxlDecoderDestroy(dec); + } +} + +// Opaque image with noise enabled, decoded to RGB8 and RGBA8. 
+TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) { + for (unsigned channels = 3; channels <= 4; channels++) { + JxlDecoder* dec = JxlDecoderCreate(NULL); + + size_t xsize = 512, ysize = 300; + size_t num_pixels = xsize * ysize; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::CompressParams cparams; + cparams.noise = jxl::Override::kOn; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + cparams, kCSBF_None, JXL_ORIENT_IDENTITY, /*add_preview=*/false, + /*add_icc_profile=*/false); + + JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + std::vector pixels2 = jxl::DecodeWithAPI( + dec, jxl::Span(compressed.data(), compressed.size()), + format, /*use_callback=*/false, /*set_buffer_early=*/true, + /*use_resizable_runner=*/false); + JxlDecoderReset(dec); + EXPECT_EQ(num_pixels * channels, pixels2.size()); + + // The input pixels are interpreted as sRGB here, since we set + // add_icc_profile for CreateTestJXLCodestream to false. 
+ jxl::ColorEncoding color_encoding0 = jxl::ColorEncoding::SRGB(false); + jxl::Span span0(pixels.data(), pixels.size()); + jxl::CodecInOut io0; + io0.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal( + span0, xsize, ysize, color_encoding0, + /*has_alpha=*/false, false, 16, format_orig.endianness, + /*flipped_y=*/false, /*pool=*/nullptr, &io0.Main())); + + jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false); + jxl::Span span1(pixels2.data(), pixels2.size()); + jxl::CodecInOut io1; + if (channels == 4) { + io1.metadata.m.SetAlphaBits(8); + io1.SetSize(xsize, ysize); + EXPECT_TRUE(ConvertFromExternal( + span1, xsize, ysize, color_encoding1, + /*has_alpha=*/true, false, 8, format.endianness, + /*flipped_y=*/false, /*pool=*/nullptr, &io1.Main())); + io1.metadata.m.SetAlphaBits(0); + io1.Main().ClearExtraChannels(); + } else { + EXPECT_TRUE(ConvertFromExternal( + span1, xsize, ysize, color_encoding1, + /*has_alpha=*/false, false, 8, format.endianness, + /*flipped_y=*/false, /*pool=*/nullptr, &io1.Main())); + } + + jxl::ButteraugliParams ba; + EXPECT_LE(ButteraugliDistance(io0, io1, ba, /*distmap=*/nullptr, nullptr), + 2.6f); + + JxlDecoderDestroy(dec); + } +} + +void TestPartialStream(bool reconstructible_jpeg) { + size_t xsize = 123, ysize = 77; + uint32_t channels = 4; + if (reconstructible_jpeg) { + channels = 3; + } + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, channels, 0); + JxlPixelFormat format_orig = {channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + jxl::CompressParams cparams; + if (reconstructible_jpeg) { + cparams.color_transform = jxl::ColorTransform::kNone; + } else { + cparams + .SetLossless(); // Lossless to verify pixels exactly after roundtrip. 
+ } + + std::vector pixels2; + pixels2.resize(pixels.size()); + + jxl::PaddedBytes jpeg_output(64); + size_t used_jpeg_output = 0; + + std::vector codestreams(kCSBF_NUM_ENTRIES); + std::vector jpeg_codestreams(kCSBF_NUM_ENTRIES); + for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + CodeStreamBoxFormat add_container = (CodeStreamBoxFormat)i; + + codestreams[i] = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + channels, cparams, add_container, JXL_ORIENT_IDENTITY, + /*add_preview=*/true, + /*add_icc_profile=*/false, + reconstructible_jpeg ? &jpeg_codestreams[i] : nullptr); + } + + // Test multiple step sizes, to test different combinations of the streaming + // box parsing. + std::vector increments = {1, 3, 17, 23, 120, 700, 1050}; + + for (size_t index = 0; index < increments.size(); index++) { + for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) { + if (reconstructible_jpeg && + (CodeStreamBoxFormat)i == CodeStreamBoxFormat::kCSBF_None) { + continue; + } + const jxl::PaddedBytes& data = codestreams[i]; + const uint8_t* next_in = data.data(); + size_t avail_in = 0; + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | + JXL_DEC_JPEG_RECONSTRUCTION)); + + bool seen_basic_info = false; + bool seen_full_image = false; + bool seen_jpeg_recon = false; + + size_t total_size = 0; + + for (;;) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + JxlDecoderStatus status = JxlDecoderProcessInput(dec); + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + if (status == JXL_DEC_NEED_MORE_INPUT) { + if (total_size >= data.size()) { + // End of test data reached, it should have successfully decoded the + // image now. 
+ FAIL(); + break; + } + + size_t increment = increments[index]; + // End of the file reached, should be the final test. + if (total_size + increment > data.size()) { + increment = data.size() - total_size; + } + total_size += increment; + avail_in += increment; + } else if (status == JXL_DEC_BASIC_INFO) { + // This event should happen exactly once + EXPECT_FALSE(seen_basic_info); + if (seen_basic_info) break; + seen_basic_info = true; + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + } else if (status == JXL_DEC_JPEG_RECONSTRUCTION) { + EXPECT_FALSE(seen_basic_info); + EXPECT_FALSE(seen_full_image); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer(dec, jpeg_output.data(), + jpeg_output.size())); + seen_jpeg_recon = true; + } else if (status == JXL_DEC_JPEG_NEED_MORE_OUTPUT) { + EXPECT_TRUE(seen_jpeg_recon); + used_jpeg_output = + jpeg_output.size() - JxlDecoderReleaseJPEGBuffer(dec); + jpeg_output.resize(jpeg_output.size() * 2); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer( + dec, jpeg_output.data() + used_jpeg_output, + jpeg_output.size() - used_jpeg_output)); + } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) { + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer( + dec, &format_orig, pixels2.data(), pixels2.size())); + } else if (status == JXL_DEC_FULL_IMAGE) { + // This event should happen exactly once + EXPECT_FALSE(seen_full_image); + if (seen_full_image) break; + // This event should happen after basic info + EXPECT_TRUE(seen_basic_info); + seen_full_image = true; + if (reconstructible_jpeg) { + used_jpeg_output = + jpeg_output.size() - JxlDecoderReleaseJPEGBuffer(dec); + EXPECT_EQ(used_jpeg_output, jpeg_codestreams[i].size()); + EXPECT_EQ(0, memcmp(jpeg_output.data(), jpeg_codestreams[i].data(), + used_jpeg_output)); + } else { + EXPECT_EQ(pixels, pixels2); + } + } else if (status == JXL_DEC_SUCCESS) { + 
EXPECT_TRUE(seen_full_image); + break; + } else { + // We do not expect any other events or errors + FAIL(); + break; + } + } + + // Ensure the decoder emitted the basic info and full image events + EXPECT_TRUE(seen_basic_info); + EXPECT_TRUE(seen_full_image); + + JxlDecoderDestroy(dec); + } + } +} + +// Tests the return status when trying to decode pixels on incomplete file: it +// should return JXL_DEC_NEED_MORE_INPUT, not error. +TEST(DecodeTest, PixelPartialTest) { TestPartialStream(false); } + +#if JPEGXL_ENABLE_JPEG +// Tests the return status when trying to decode JPEG bytes on incomplete file. +TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGPartialTest)) { + TestPartialStream(true); +} +#endif // JPEGXL_ENABLE_JPEG + +// The DC event still exists, but is no longer implemented, it is deprecated. +TEST(DecodeTest, DCNotGettableTest) { + // 1x1 pixel JXL image + std::string compressed( + "\377\n\0\20\260\23\0H\200(" + "\0\334\0U\17\0\0\250P\31e\334\340\345\\\317\227\37:," + "\246m\\gh\253m\vK\22E\306\261I\252C&pH\22\353 " + "\363\6\22\bp\0\200\237\34\231W2d\255$\1", + 68); + + JxlDecoder* dec = JxlDecoderCreate(NULL); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_DC_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetInput( + dec, reinterpret_cast(compressed.data()), + compressed.size())); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + + // Since the image is only 1x1 pixel, there is only 1 group, the decoder is + // unable to get DC size from this, and will not return the DC at all. Since + // no full image is requested either, it is expected to return success. 
+ EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, PreviewTest) { + size_t xsize = 77, ysize = 120; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + + jxl::CompressParams cparams; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 3, + cparams, kCSBF_Multi, JXL_ORIENT_IDENTITY, /*add_preview=*/true); + + JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}; + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_PREVIEW_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size)); + + // GetSomeTestImage is hardcoded to use a top-left cropped preview with + // floor of 1/7th of the size + size_t xsize_preview = (xsize / 7); + size_t ysize_preview = (ysize / 7); + EXPECT_EQ(xsize_preview, info.preview.xsize); + EXPECT_EQ(ysize_preview, info.preview.ysize); + EXPECT_EQ(xsize_preview * ysize_preview * 3, buffer_size); + + EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + std::vector preview(xsize_preview * ysize_preview * 3); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetPreviewOutBuffer( + dec, &format, preview.data(), preview.size())); + + EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec)); + + jxl::Image3F preview0(xsize_preview, ysize_preview); + jxl::Image3F preview1(xsize_preview, ysize_preview); + + // For preview0, the original: top-left crop the preview image the way + // GetSomeTestImage does. 
+ for (size_t y = 0; y < ysize_preview; y++) { + for (size_t x = 0; x < xsize_preview; x++) { + preview0.PlaneRow(0, y)[x] = + (1.f / 255) * (pixels[(y * xsize + x) * 6 + 0]); + preview0.PlaneRow(1, y)[x] = + (1.f / 255) * (pixels[(y * xsize + x) * 6 + 2]); + preview0.PlaneRow(2, y)[x] = + (1.f / 255) * (pixels[(y * xsize + x) * 6 + 4]); + preview1.PlaneRow(0, y)[x] = + (1.f / 255) * (preview[(y * xsize_preview + x) * 3 + 0]); + preview1.PlaneRow(1, y)[x] = + (1.f / 255) * (preview[(y * xsize_preview + x) * 3 + 1]); + preview1.PlaneRow(2, y)[x] = + (1.f / 255) * (preview[(y * xsize_preview + x) * 3 + 2]); + } + } + + jxl::CodecInOut io0; + io0.SetFromImage(std::move(preview0), jxl::ColorEncoding::SRGB(false)); + jxl::CodecInOut io1; + io1.SetFromImage(std::move(preview1), jxl::ColorEncoding::SRGB(false)); + + jxl::ButteraugliParams ba; + // TODO(lode): this ButteraugliDistance silently returns 0 (dangerous for + // tests) if xsize or ysize is < 8, no matter how different the images, a tiny + // size that could happen for a preview. ButteraugliDiffmap does support + // smaller than 8x8, but jxl's ButteraugliDistance does not. Perhaps move + // butteraugli's <8x8 handling from ButteraugliDiffmap to + // ButteraugliComparator::Diffmap in butteraugli.cc. + EXPECT_LE(ButteraugliDistance(io0, io1, ba, + /*distmap=*/nullptr, nullptr), + 1.4f); + + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, AlignTest) { + size_t xsize = 123, ysize = 77; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. 
+ jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, 4, + cparams, kCSBF_None, JXL_ORIENT_IDENTITY, false); + + size_t align = 17; + JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, align}; + // On purpose not using jxl::RoundUpTo to test it independently. + size_t expected_line_bytes = (1 * 3 * xsize + align - 1) / align * align; + + for (int use_callback = 0; use_callback <= 1; ++use_callback) { + std::vector pixels2 = jxl::DecodeWithAPI( + jxl::Span(compressed.data(), compressed.size()), format, + use_callback, /*set_buffer_early=*/false, + /*use_resizable_runner=*/false); + EXPECT_EQ(expected_line_bytes * ysize, pixels2.size()); + EXPECT_EQ(0, ComparePixels(pixels.data(), pixels2.data(), xsize, ysize, + format_orig, format)); + } +} + +TEST(DecodeTest, AnimationTest) { + size_t xsize = 123, ysize = 77; + static const size_t num_frames = 2; + std::vector frames[2]; + frames[0] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + frames[1] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 1); + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + for (size_t i = 0; i < num_frames; ++i) { + frame_durations[i] = 5 + i; + } + + for (size_t i = 0; i < num_frames; ++i) { + jxl::ImageBundle bundle(&io.metadata.m); + + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frames[i].data(), frames[i].size()), xsize, + ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*has_alpha=*/false, + /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, + JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + 
io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, &aux_out, + nullptr)); + + // Decode and test the animation frames + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + + for (size_t i = 0; i < num_frames; ++i) { + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + EXPECT_EQ(0, frame_header.name_length); + // For now, test with empty name, there's currently no easy way to encode + // a jxl file with a frame name because ImageBundle doesn't have a + // jxl::FrameHeader to set the name in. We can test the null termination + // character though. 
+ char name; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameName(dec, &name, 1)); + EXPECT_EQ(0, name); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0, ComparePixels(frames[i].data(), pixels.data(), xsize, ysize, + format, format)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, AnimationTestStreaming) { + size_t xsize = 123, ysize = 77; + static const size_t num_frames = 2; + std::vector frames[2]; + frames[0] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0); + frames[1] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 1); + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + for (size_t i = 0; i < num_frames; ++i) { + frame_durations[i] = 5 + i; + } + + for (size_t i = 0; i < num_frames; ++i) { + jxl::ImageBundle bundle(&io.metadata.m); + + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frames[i].data(), frames[i].size()), xsize, + ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*has_alpha=*/false, + /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, + JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + 
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, &aux_out, + nullptr)); + + // Decode and test the animation frames + + const size_t step_size = 16; + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = 0; + size_t frame_headers_seen = 0; + size_t frames_seen = 0; + bool seen_basic_info = false; + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + std::vector frames2[2]; + for (size_t i = 0; i < num_frames; ++i) { + frames2[i].resize(frames[i].size()); + } + + size_t total_in = 0; + size_t loop_count = 0; + + for (;;) { + if (loop_count++ > compressed.size()) { + fprintf(stderr, "Too many loops\n"); + FAIL(); + break; + } + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + auto status = JxlDecoderProcessInput(dec); + size_t remaining = JxlDecoderReleaseInput(dec); + EXPECT_LE(remaining, avail_in); + next_in += avail_in - remaining; + avail_in = remaining; + + if (status == JXL_DEC_SUCCESS) { + break; + } else if (status == JXL_DEC_ERROR) { + FAIL(); + } else if (status == JXL_DEC_NEED_MORE_INPUT) { + if (total_in >= compressed.size()) { + fprintf(stderr, "Already gave all input data\n"); + FAIL(); + break; + } + size_t amount = step_size; + if (total_in + amount > compressed.size()) { + amount = compressed.size() - total_in; + } + avail_in += amount; + total_in += amount; + } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) { + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, 
frames2[frames_seen].data(), + frames2[frames_seen].size())); + } else if (status == JXL_DEC_BASIC_INFO) { + EXPECT_EQ(false, seen_basic_info); + seen_basic_info = true; + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(xsize, info.xsize); + EXPECT_EQ(ysize, info.ysize); + } else if (status == JXL_DEC_FRAME) { + EXPECT_EQ(true, seen_basic_info); + frame_headers_seen++; + } else if (status == JXL_DEC_FULL_IMAGE) { + frames_seen++; + EXPECT_EQ(frame_headers_seen, frames_seen); + } else { + fprintf(stderr, "Unexpected status: %d\n", (int)status); + FAIL(); + } + } + + EXPECT_EQ(true, seen_basic_info); + EXPECT_EQ(num_frames, frames_seen); + EXPECT_EQ(num_frames, frame_headers_seen); + for (size_t i = 0; i < num_frames; ++i) { + EXPECT_EQ(frames[i], frames2[i]); + } + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, SkipFrameTest) { + size_t xsize = 90, ysize = 120; + constexpr size_t num_frames = 16; + std::vector frames[num_frames]; + for (size_t i = 0; i < num_frames; i++) { + frames[i] = jxl::test::GetSomeTestImage(xsize, ysize, 3, i); + } + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + for (size_t i = 0; i < num_frames; ++i) { + frame_durations[i] = 5 + i; + } + + for (size_t i = 0; i < num_frames; ++i) { + jxl::ImageBundle bundle(&io.metadata.m); + if (i & 1) { + // Mark some frames as referenceable, others not. 
+ bundle.use_for_next_frame = true; + } + + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frames[i].data(), frames[i].size()), xsize, + ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*has_alpha=*/false, + /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, + JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, &aux_out, + nullptr)); + + // Decode and test the animation frames + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + + for (size_t i = 0; i < num_frames; ++i) { + if (i == 3) { + JxlDecoderSkipFrames(dec, 5); + i += 5; + } + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + 
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0, ComparePixels(frames[i].data(), pixels.data(), xsize, ysize, + format, format)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + // Test rewinding the decoder and skipping different frames + + JxlDecoderRewind(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + for (size_t i = 0; i < num_frames; ++i) { + int test_skipping = (i == 9) ? 3 : 0; + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this + // should only skip the next frame, not the currently processed one. 
+ if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0, ComparePixels(frames[i].data(), pixels.data(), xsize, ysize, + format, format)); + + if (test_skipping) i += test_skipping; + } + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, SkipFrameWithBlendingTest) { + size_t xsize = 90, ysize = 120; + constexpr size_t num_frames = 16; + std::vector frames[num_frames]; + JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + jxl::CodecInOut io; + io.SetSize(xsize, ysize); + io.metadata.m.SetUintSamples(16); + io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false); + io.metadata.m.have_animation = true; + io.frames.clear(); + io.frames.reserve(num_frames); + io.SetSize(xsize, ysize); + + std::vector frame_durations(num_frames); + + for (size_t i = 0; i < num_frames; ++i) { + if (i < 5) { + std::vector frame_internal = + jxl::test::GetSomeTestImage(xsize, ysize, 3, i * 2 + 1); + // An internal frame with 0 duration, and use_for_next_frame, this is a + // frame that is not rendered and not output by the API, but on which the + // rendered frames depend + jxl::ImageBundle bundle_internal(&io.metadata.m); + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frame_internal.data(), + frame_internal.size()), + xsize, ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), + /*has_alpha=*/false, + /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, + JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, + &bundle_internal)); + 
bundle_internal.duration = 0; + bundle_internal.use_for_next_frame = true; + io.frames.push_back(std::move(bundle_internal)); + } + + std::vector frame = + jxl::test::GetSomeTestImage(xsize, ysize, 3, i * 2); + // Actual rendered frame + frame_durations[i] = 5 + i; + jxl::ImageBundle bundle(&io.metadata.m); + EXPECT_TRUE(ConvertFromExternal( + jxl::Span(frame.data(), frame.size()), xsize, ysize, + jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*has_alpha=*/false, + /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, + JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, &bundle)); + bundle.duration = frame_durations[i]; + // Create some variation in which frames depend on which. + if (i != 3 && i != 9 && i != 10) { + bundle.use_for_next_frame = true; + } + if (i != 12) { + bundle.blend = true; + // Choose a blend mode that depends on the pixels of the saved frame and + // doesn't use alpha + bundle.blendmode = jxl::BlendMode::kMul; + } + io.frames.push_back(std::move(bundle)); + } + + jxl::CompressParams cparams; + cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip. + jxl::AuxOut aux_out; + jxl::PaddedBytes compressed; + jxl::PassesEncoderState enc_state; + EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, &aux_out, + nullptr)); + + // Independently decode all frames without any skipping, to create the + // expected blended frames, for the actual tests below to compare with. 
+ { + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner( + dec, JxlThreadParallelRunner, runner)); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + for (size_t i = 0; i < num_frames; ++i) { + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + frames[i].resize(xsize * ysize * 6); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(), + frames[i].size())); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); + } + + JxlDecoder* dec = JxlDecoderCreate(NULL); + const uint8_t* next_in = compressed.data(); + size_t avail_in = compressed.size(); + + void* runner = JxlThreadParallelRunnerCreate( + NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads()); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner)); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + + for (size_t i = 0; i < num_frames; ++i) { + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, 
JxlDecoderProcessInput(dec)); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0, ComparePixels(frames[i].data(), pixels.data(), xsize, ysize, + format, format)); + + // Test rewinding mid-way, not decoding all frames. + if (i == 8) { + break; + } + } + + JxlDecoderRewind(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + for (size_t i = 0; i < num_frames; ++i) { + if (i == 3) { + JxlDecoderSkipFrames(dec, 5); + i += 5; + } + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0, ComparePixels(frames[i].data(), pixels.data(), xsize, ysize, + format, format)); + } + + // After all frames were decoded, JxlDecoderProcessInput should return + // success to indicate all is done. 
+ EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec)); + + // Test rewinding the decoder and skipping different frames + + JxlDecoderRewind(dec); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in)); + + for (size_t i = 0; i < num_frames; ++i) { + int test_skipping = (i == 9) ? 3 : 0; + std::vector pixels(buffer_size); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this + // should only skip the next frame, not the currently processed one. + if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping); + + JxlFrameHeader frame_header; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header)); + EXPECT_EQ(frame_durations[i], frame_header.duration); + + EXPECT_EQ(i + 1 == num_frames, frame_header.is_last); + + EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels.data(), pixels.size())); + + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + EXPECT_EQ(0, ComparePixels(frames[i].data(), pixels.data(), xsize, ysize, + format, format)); + + if (test_skipping) i += test_skipping; + } + + JxlThreadParallelRunnerDestroy(runner); + JxlDecoderDestroy(dec); +} + +TEST(DecodeTest, FlushTest) { + // Size large enough for multiple groups, required to have progressive + // stages + size_t xsize = 333, ysize = 300; + uint32_t num_channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0); + jxl::CompressParams cparams; + jxl::PaddedBytes data = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + num_channels, cparams, kCSBF_None, JXL_ORIENT_IDENTITY, true); + JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + + std::vector pixels2; + 
pixels2.resize(pixels.size()); + + JxlDecoder* dec = JxlDecoderCreate(nullptr); + + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + // Ensure that the first part contains at least the full DC of the image, + // otherwise flush does not work. The DC takes up more than 50% of the + // image generated here. + size_t first_part = data.size() * 3 / 4; + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part)); + + EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec)); + JxlBasicInfo info; + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info)); + EXPECT_EQ(info.xsize, xsize); + EXPECT_EQ(info.ysize, ysize); + + EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec)); + + // Output buffer not yet set + EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec)); + + size_t buffer_size; + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)); + EXPECT_EQ(pixels2.size(), buffer_size); + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer( + dec, &format, pixels2.data(), pixels2.size())); + + // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if + // data was already input before, since the processing of the frame only + // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME. + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec)); + + // Note: actual pixel data not tested here, it should look similar to the + // input image, but with less fine detail. Instead the expected events are + // tested here. 
+ + EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec)); + + size_t consumed = first_part - JxlDecoderReleaseInput(dec); + + EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed, + data.size() - consumed)); + EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec)); + + JxlDecoderDestroy(dec); +} + +void VerifyJPEGReconstruction(const jxl::PaddedBytes& container, + const jxl::PaddedBytes& jpeg_bytes) { + JxlDecoderPtr dec = JxlDecoderMake(nullptr); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSubscribeEvents( + dec.get(), JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_FULL_IMAGE)); + JxlDecoderSetInput(dec.get(), container.data(), container.size()); + EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec.get())); + std::vector reconstructed_buffer(128); + EXPECT_EQ(JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data(), + reconstructed_buffer.size())); + size_t used = 0; + JxlDecoderStatus process_result = JXL_DEC_JPEG_NEED_MORE_OUTPUT; + while (process_result == JXL_DEC_JPEG_NEED_MORE_OUTPUT) { + used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get()); + reconstructed_buffer.resize(reconstructed_buffer.size() * 2); + EXPECT_EQ( + JXL_DEC_SUCCESS, + JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data() + used, + reconstructed_buffer.size() - used)); + process_result = JxlDecoderProcessInput(dec.get()); + } + ASSERT_EQ(JXL_DEC_FULL_IMAGE, process_result); + used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get()); + ASSERT_EQ(used, jpeg_bytes.size()); + EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), jpeg_bytes.data(), used)); +} + +#if JPEGXL_ENABLE_JPEG +TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructTestCodestream)) { + size_t xsize = 123; + size_t ysize = 77; + size_t channels = 3; + std::vector pixels = + jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0); + jxl::CompressParams cparams; + cparams.color_transform = 
jxl::ColorTransform::kNone; + jxl::PaddedBytes jpeg_codestream; + jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream( + jxl::Span(pixels.data(), pixels.size()), xsize, ysize, + channels, cparams, kCSBF_Single, JXL_ORIENT_IDENTITY, + /*add_preview=*/true, + /*add_icc_profile=*/false, &jpeg_codestream); + VerifyJPEGReconstruction(compressed, jpeg_codestream); +} +#endif // JPEGXL_ENABLE_JPEG + +TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) { + const std::string jpeg_path = + "imagecompression.info/flower_foveon.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path); + jxl::CodecInOut orig_io; + ASSERT_TRUE( + jxl::jpeg::DecodeImageJPG(jxl::Span(orig), &orig_io)); + orig_io.metadata.m.xyb_encoded = false; + jxl::BitWriter writer; + ASSERT_TRUE(WriteHeaders(&orig_io.metadata, &writer, nullptr)); + writer.ZeroPadToByte(); + jxl::PassesEncoderState enc_state; + jxl::CompressParams cparams; + cparams.color_transform = jxl::ColorTransform::kNone; + ASSERT_TRUE(jxl::EncodeFrame(cparams, jxl::FrameInfo{}, &orig_io.metadata, + orig_io.Main(), &enc_state, + /*pool=*/nullptr, &writer, + /*aux_out=*/nullptr)); + + jxl::PaddedBytes jpeg_data; + ASSERT_TRUE(EncodeJPEGData(*orig_io.Main().jpeg_data.get(), &jpeg_data)); + jxl::PaddedBytes container; + container.append(jxl::kContainerHeader, + jxl::kContainerHeader + sizeof(jxl::kContainerHeader)); + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false, + &container); + container.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), 0, true, &container); + jxl::PaddedBytes codestream = std::move(writer).TakeBytes(); + container.append(codestream.data(), codestream.data() + codestream.size()); + VerifyJPEGReconstruction(container, orig); +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc new file mode 100644 index 
0000000000..4bab82abb3 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.cc @@ -0,0 +1,77 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/decode_to_jpeg.h" + +namespace jxl { + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + +JxlDecoderStatus JxlToJpegDecoder::Process(const uint8_t** next_in, + size_t* avail_in) { + if (!inside_box_) { + JXL_ABORT( + "processing of JPEG reconstruction data outside JPEG reconstruction " + "box"); + } + Span to_decode; + if (box_until_eof_) { + // Until EOF means consume all data. + to_decode = Span(*next_in, *avail_in); + *next_in += *avail_in; + *avail_in = 0; + } else { + // Defined size means consume min(available, needed). + size_t avail_recon_in = + std::min(*avail_in, box_size_ - buffer_.size()); + to_decode = Span(*next_in, avail_recon_in); + *next_in += avail_recon_in; + *avail_in -= avail_recon_in; + } + bool old_data_exists = !buffer_.empty(); + if (old_data_exists) { + // Append incoming data to buffer if we already had data in the buffer. + buffer_.insert(buffer_.end(), to_decode.data(), + to_decode.data() + to_decode.size()); + to_decode = Span(buffer_.data(), buffer_.size()); + } + if (!box_until_eof_ && to_decode.size() > box_size_) { + JXL_ABORT("JPEG reconstruction data to decode larger than expected"); + } + if (box_until_eof_ || to_decode.size() == box_size_) { + // If undefined size, or the right size, try to decode. + jpeg_data_ = make_unique(); + const auto status = jpeg::DecodeJPEGData(to_decode, jpeg_data_.get()); + if (status.IsFatalError()) return JXL_DEC_ERROR; + if (status) { + // Successful decoding, emit event after updating state to track that we + // are no longer parsing JPEG reconstruction data. 
+ inside_box_ = false; + return JXL_DEC_JPEG_RECONSTRUCTION; + } + if (box_until_eof_) { + // Unsuccessful decoding and undefined size, assume incomplete data. Copy + // the data if we haven't already. + if (!old_data_exists) { + buffer_.insert(buffer_.end(), to_decode.data(), + to_decode.data() + to_decode.size()); + } + } else { + // Unsuccessful decoding of correct amount of data, assume error. + return JXL_DEC_ERROR; + } + } else { + // Not enough data, copy the data if we haven't already. + if (!old_data_exists) { + buffer_.insert(buffer_.end(), to_decode.data(), + to_decode.data() + to_decode.size()); + } + } + return JXL_DEC_NEED_MORE_INPUT; +} + +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.h new file mode 100644 index 0000000000..86f0a66da4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/decode_to_jpeg.h @@ -0,0 +1,173 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_DECODE_TO_JPEG_H_ +#define LIB_JXL_DECODE_TO_JPEG_H_ + +// JPEG XL to JPEG bytes decoder logic. The JxlToJpegDecoder class keeps track +// of the decoder state needed to parse the JPEG reconstruction box and provide +// the reconstructed JPEG to the output buffer. + +#include +#include + +#include +#include + +#include "jxl/decode.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" // JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/jpeg/dec_jpeg_data.h" +#if JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +namespace jxl { + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + +class JxlToJpegDecoder { + public: + // Returns whether an output buffer is set. 
+ bool IsOutputSet() const { return next_out_ != nullptr; } + + // Returns whether the decoder is currently parsing a JPEG reconstruction box. + bool IsParsingBox() const { return inside_box_; } + + const jpeg::JPEGData* JpegData() const { return jpeg_data_.get(); } + + // Returns the parsed jpeg::JPEGData object and removes it from the + // JxlToJpegDecoder. + jpeg::JPEGData* ReleaseJpegData() { return jpeg_data_.release(); } + + // Sets the output buffer used when producing JPEG output. + JxlDecoderStatus SetOutputBuffer(uint8_t* data, size_t size) { + if (next_out_) return JXL_DEC_ERROR; + next_out_ = data; + avail_size_ = size; + return JXL_DEC_SUCCESS; + } + + // Releases the buffer set with SetOutputBuffer(). + size_t ReleaseOutputBuffer() { + size_t result = avail_size_; + next_out_ = nullptr; + avail_size_ = 0; + return result; + } + + void StartBox(uint64_t box_size, size_t contents_size) { + // A new box implies that we clear the buffer. + buffer_.clear(); + inside_box_ = true; + if (box_size == 0) { + box_until_eof_ = true; + } else { + box_size_ = contents_size; + } + } + + // Consumes data from next_in/avail_in to reconstruct JPEG data. + // Uses box_size_, inside_box_ and box_until_eof_ to calculate how much to + // consume. Potentially stores unparsed data in buffer_. + // Potentially populates jpeg_data_. Potentially updates inside_box_. + JxlDecoderStatus Process(const uint8_t** next_in, size_t* avail_in); + + // Sets the JpegData of the ImageBundle passed if there is anything to set. + // Releases the JpegData from this decoder if set. + Status SetImageBundleJpegData(ImageBundle* ib) { + if (IsOutputSet() && jpeg_data_ != nullptr) { + if (!jpeg::SetJPEGDataFromICC(ib->metadata()->color_encoding.ICC(), + jpeg_data_.get())) { + return false; + } + ib->jpeg_data.reset(jpeg_data_.release()); + } + return true; + } + + JxlDecoderStatus WriteOutput(const jpeg::JPEGData& jpeg_data) { + // Copy JPEG bytestream if desired.
+ uint8_t* tmp_next_out = next_out_; + size_t tmp_avail_size = avail_size_; + auto write = [&tmp_next_out, &tmp_avail_size](const uint8_t* buf, + size_t len) { + size_t to_write = std::min(tmp_avail_size, len); + memcpy(tmp_next_out, buf, to_write); + tmp_next_out += to_write; + tmp_avail_size -= to_write; + return to_write; + }; + Status write_result = jpeg::WriteJpeg(jpeg_data, write); + if (!write_result) { + if (tmp_avail_size == 0) { + return JXL_DEC_JPEG_NEED_MORE_OUTPUT; + } + return JXL_DEC_ERROR; + } + next_out_ = tmp_next_out; + avail_size_ = tmp_avail_size; + return JXL_DEC_SUCCESS; + } + + private: + // Content of the most recently parsed JPEG reconstruction box if any. + std::vector buffer_; + + // Decoded content of the most recently parsed JPEG reconstruction box is + // stored here. + std::unique_ptr jpeg_data_; + + // True if the decoder is currently reading bytes inside a JPEG reconstruction + // box. + bool inside_box_ = false; + + // True if the JPEG reconstruction box had undefined size (all remaining + // bytes). + bool box_until_eof_ = false; + // Size of most recently parsed JPEG reconstruction box contents. + size_t box_size_ = 0; + + // Next bytes to write JPEG reconstruction to. + uint8_t* next_out_ = nullptr; + // Available bytes to write JPEG reconstruction to. + size_t avail_size_ = 0; +}; + +#else + +// Fake class that disables support for decoding JPEG XL to JPEG. 
+class JxlToJpegDecoder { + public: + bool IsOutputSet() const { return false; } + bool IsParsingBox() const { return false; } + + const jpeg::JPEGData* JpegData() const { return nullptr; } + jpeg::JPEGData* ReleaseJpegData() { return nullptr; } + + JxlDecoderStatus SetOutputBuffer(uint8_t* /* data */, size_t /* size */) { + return JXL_DEC_ERROR; + } + size_t ReleaseOutputBuffer() { return 0; } + + void StartBox(uint64_t /* box_size */, size_t /* contents_size */) {} + + JxlDecoderStatus Process(const uint8_t** next_in, size_t* avail_in) { + return JXL_DEC_ERROR; + } + + Status SetImageBundleJpegData(ImageBundle* /* ib */) { return true; } + + JxlDecoderStatus WriteOutput(const jpeg::JPEGData& /* jpeg_data */) { + return JXL_DEC_SUCCESS; + } +}; + +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +} // namespace jxl + +#endif // LIB_JXL_DECODE_TO_JPEG_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/descriptive_statistics_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/descriptive_statistics_test.cc new file mode 100644 index 0000000000..7891c728e2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/descriptive_statistics_test.cc @@ -0,0 +1,152 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/descriptive_statistics.h" + +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/noise_distributions.h" + +namespace jxl { +namespace { + +// Assigns x to one of two streams so we can later test Assimilate. 
+template +void NotifyEither(float x, Random* rng, Stats* JXL_RESTRICT stats1, + Stats* JXL_RESTRICT stats2) { + if ((*rng)() & 128) { + stats1->Notify(x); + } else { + stats2->Notify(x); + } +} + +TEST(StatsTest, TestGaussian) { + Stats stats; + Stats stats1, stats2; + const float mean = 5.0f; + const float stddev = 4.0f; + NoiseGaussian noise(stddev); + std::mt19937 rng(129); + for (size_t i = 0; i < 1000 * 1000; ++i) { + const float x = noise(mean, &rng); + stats.Notify(x); + NotifyEither(x, &rng, &stats1, &stats2); + } + EXPECT_NEAR(mean, stats.Mean(), 0.01); + EXPECT_NEAR(stddev, stats.StandardDeviation(), 0.02); + EXPECT_NEAR(0.0, stats.Skewness(), 0.02); + EXPECT_NEAR(0.0, stats.Kurtosis() - 3, 0.02); + printf("%s\n", stats.ToString().c_str()); + + // Same results after merging both accumulators. + stats1.Assimilate(stats2); + EXPECT_NEAR(mean, stats1.Mean(), 0.01); + EXPECT_NEAR(stddev, stats1.StandardDeviation(), 0.02); + EXPECT_NEAR(0.0, stats1.Skewness(), 0.02); + EXPECT_NEAR(0.0, stats1.Kurtosis() - 3, 0.02); +} + +TEST(StatsTest, TestUniform) { + Stats stats; + Stats stats1, stats2; + NoiseUniform noise(0, 256); + std::mt19937 rng(129), rng_split(65537); + for (size_t i = 0; i < 1000 * 1000; ++i) { + const float x = noise(0.0f, &rng); + stats.Notify(x); + NotifyEither(x, &rng_split, &stats1, &stats2); + } + EXPECT_NEAR(128.0, stats.Mean(), 0.05); + EXPECT_NEAR(0.0, stats.Min(), 0.01); + EXPECT_NEAR(256.0, stats.Max(), 0.01); + EXPECT_NEAR(70, stats.StandardDeviation(), 10); + // No outliers. + EXPECT_NEAR(-1.2, stats.Kurtosis() - 3, 0.1); + printf("%s\n", stats.ToString().c_str()); + + // Same results after merging both accumulators. 
+ stats1.Assimilate(stats2); + EXPECT_NEAR(128.0, stats1.Mean(), 0.05); + EXPECT_NEAR(0.0, stats1.Min(), 0.01); + EXPECT_NEAR(256.0, stats1.Max(), 0.01); + EXPECT_NEAR(70, stats1.StandardDeviation(), 10); +} + +TEST(StatsTest, CompareCentralMomentsAgainstTwoPass) { + // Vary seed so the thresholds are not specific to one distribution. + for (int rep = 0; rep < 200; ++rep) { + // Uniform avoids outliers. + NoiseUniform noise(0, 256); + std::mt19937 rng(129 + 13 * rep), rng_split(65537); + + // Small count so bias (population vs sample) is visible. + const size_t kSamples = 20; + + // First pass: compute mean + std::vector samples; + samples.reserve(kSamples); + double sum = 0.0; + for (size_t i = 0; i < kSamples; ++i) { + const float x = noise(0.0f, &rng); + samples.push_back(x); + sum += x; + } + const double mean = sum / kSamples; + + // Second pass: compute stats and moments + Stats stats; + Stats stats1, stats2; + double sum2 = 0.0; + double sum3 = 0.0; + double sum4 = 0.0; + for (const double x : samples) { + const double d = x - mean; + sum2 += d * d; + sum3 += d * d * d; + sum4 += d * d * d * d; + + stats.Notify(x); + NotifyEither(x, &rng_split, &stats1, &stats2); + } + const double mu1 = mean; + const double mu2 = sum2 / kSamples; + const double mu3 = sum3 / kSamples; + const double mu4 = sum4 / kSamples; + + // Raw central moments (note: Mu1 is zero by definition) + EXPECT_NEAR(mu1, stats.Mu1(), 1E-13); + EXPECT_NEAR(mu2, stats.Mu2(), 1E-11); + EXPECT_NEAR(mu3, stats.Mu3(), 1E-9); + EXPECT_NEAR(mu4, stats.Mu4(), 1E-6); + + // Same results after merging both accumulators. 
+ stats1.Assimilate(stats2); + EXPECT_NEAR(mu1, stats1.Mu1(), 1E-13); + EXPECT_NEAR(mu2, stats1.Mu2(), 1E-11); + EXPECT_NEAR(mu3, stats1.Mu3(), 1E-9); + EXPECT_NEAR(mu4, stats1.Mu4(), 1E-6); + + const double sample_variance = mu2; + // Scaling factor for sampling bias + const double r = (kSamples - 1.0) / kSamples; + const double skewness = mu3 * pow(r / mu2, 1.5); + const double kurtosis = mu4 * pow(r / mu2, 2.0); + + EXPECT_NEAR(sample_variance, stats.SampleVariance(), + sample_variance * 1E-12); + EXPECT_NEAR(skewness, stats.Skewness(), std::abs(skewness * 1E-11)); + EXPECT_NEAR(kurtosis, stats.Kurtosis(), kurtosis * 1E-12); + } +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/docs/color_management.md b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/docs/color_management.md new file mode 100644 index 0000000000..56f4a2856c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/docs/color_management.md @@ -0,0 +1,68 @@ +# Color Management + +[TOC] + + + +## Why + +The vast majority of web images are still sRGB. However, wide-gamut material is +increasingly being produced (photography, cinema, 4K). Screens covering most of +the Adobe RGB gamut are readily available and some also cover most of DCI P3 +(iPhone, Pixel2) or even BT.2020. + +Currently, after a camera records a very saturated red pixel, most raw +processors would clip it to the rather small sRGB gamut before saving as JPEG. +In keeping with our high-quality goal, we prevent such loss by allowing wider +input color spaces. + +## Which color space + +Even wide gamuts could be expressed relative to the sRGB primaries, but the +resulting coordinates may be outside the valid 0..1 range. Surprisingly, such +'unbounded' coordinates can be passed through color transforms provided the +transfer functions are expressed as parametric functions (not lookup tables). 
+However, most image file formats (including PNG and PNM) lack min/max metadata +and thus do not support unbounded coordinates. + +Instead, we need a larger working gamut to ensure most pixel coordinates are +within bounds and thus not clipped. However, larger gamuts result in lower +precision/resolution when using <= 16 bit encodings (as opposed to 32-bit float +in PFM). BT.2100 or P3 DCI appear to be good compromises. + +## CMS library + +Transforms with unbounded pixels are desirable because they reduce round-trip +error in tests. This requires parametric curves, which are only supported for +the common sRGB case in ICC v4 profiles. ArgyllCMS does not support v4. The +other popular open-source CMS is LittleCMS. It is also used by color-managed +editors (Krita/darktable), which increases the chances of interoperability. +However, LCMS has race conditions and overflow issues that prevent fuzzing. We +will later switch to the newer skcms. Note that this library does not intend to +support multiProcessElements, so HDR transfer functions cannot be represented +accurately. Thus in the long term, we will probably migrate away from ICC +profiles entirely. + +## Which viewer + +On Linux, Krita and darktable support loading our PNG output images and their +ICC profile. + +## How to compress/decompress + +### Embedded ICC profile + +- Create an 8-bit or 16-bit PNG with an iCCP chunk, e.g. using darktable. +- Pass it to `cjxl`, then `djxl` with no special arguments. The decoded output + will have the same bit depth (can override with `--output_bit_depth`) and + color space. + +### Images without metadata (e.g. HDR) + +- Create a PGM/PPM/PFM file in a known color space. +- Invoke `cjxl` with `-x color_space=RGB_D65_202_Rel_Lin` (linear 2020). For + details/possible values, see color_encoding.cc `Description`. +- Invoke `djxl` as above with no special arguments. 
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.cc new file mode 100644 index 0000000000..507e022cdc --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.cc @@ -0,0 +1,1099 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_ac_strategy.h" + +#include +#include + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/fast_math-inl.h" + +// Some of the floating point constants in this file and in other +// files in the libjxl project have been obtained using the +// tools/optimizer/simplex_fork.py tool. It is a variation of +// Nelder-Mead optimization, and we generally try to minimize +// BPP * pnorm aggregate as reported by the benchmark_xl tool, +// but occasionally the values are optimized by using additional +// constraints such as maintaining a certain density, or ratio of +// popularity of integral transforms. Jyrki visually reviews all +// such changes and often makes manual changes to maintain good +// visual quality to changes where butteraugli was not sufficiently +// sensitive to some kind of degradation. Unfortunately image quality +// is still more of an art than science. 
+ +// This must come before the begin/end_target, but HWY_ONCE is only true +// after that, so use an "include guard". +#ifndef LIB_JXL_ENC_AC_STRATEGY_ +#define LIB_JXL_ENC_AC_STRATEGY_ +// Parameters of the heuristic are marked with a OPTIMIZE comment. +namespace jxl { + +// Debugging utilities. + +// Returns a linear sRGB color (as bytes) for each AC strategy. +const uint8_t* TypeColor(const uint8_t& raw_strategy) { + JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); + static_assert(AcStrategy::kNumValidStrategies == 27, "Change colors"); + static constexpr uint8_t kColors[][3] = { + {0xFF, 0xFF, 0x00}, // DCT8 + {0xFF, 0x80, 0x80}, // HORNUSS + {0xFF, 0x80, 0x80}, // DCT2x2 + {0xFF, 0x80, 0x80}, // DCT4x4 + {0x80, 0xFF, 0x00}, // DCT16x16 + {0x00, 0xC0, 0x00}, // DCT32x32 + {0xC0, 0xFF, 0x00}, // DCT16x8 + {0xC0, 0xFF, 0x00}, // DCT8x16 + {0x00, 0xFF, 0x00}, // DCT32x8 + {0x00, 0xFF, 0x00}, // DCT8x32 + {0x00, 0xFF, 0x00}, // DCT32x16 + {0x00, 0xFF, 0x00}, // DCT16x32 + {0xFF, 0x80, 0x00}, // DCT4x8 + {0xFF, 0x80, 0x00}, // DCT8x4 + {0xFF, 0xFF, 0x80}, // AFV0 + {0xFF, 0xFF, 0x80}, // AFV1 + {0xFF, 0xFF, 0x80}, // AFV2 + {0xFF, 0xFF, 0x80}, // AFV3 + {0x00, 0xC0, 0xFF}, // DCT64x64 + {0x00, 0xFF, 0xFF}, // DCT64x32 + {0x00, 0xFF, 0xFF}, // DCT32x64 + {0x00, 0x40, 0xFF}, // DCT128x128 + {0x00, 0x80, 0xFF}, // DCT128x64 + {0x00, 0x80, 0xFF}, // DCT64x128 + {0x00, 0x00, 0xC0}, // DCT256x256 + {0x00, 0x00, 0xFF}, // DCT256x128 + {0x00, 0x00, 0xFF}, // DCT128x256 + }; + return kColors[raw_strategy]; +} + +const uint8_t* TypeMask(const uint8_t& raw_strategy) { + JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy)); + static_assert(AcStrategy::kNumValidStrategies == 27, "Add masks"); + // implicitly, first row and column is made dark + static constexpr uint8_t kMask[][64] = { + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 
0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // DCT8 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 1, 1, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 1, 0, 0, 1, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // HORNUSS + { + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 1, 0, 1, 0, 1, 0, 1, 0, // + }, // 2x2 + { + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + }, // 4x4 + {}, // DCT16x16 (unused) + {}, // DCT32x32 (unused) + {}, // DCT16x8 (unused) + {}, // DCT8x16 (unused) + {}, // DCT32x8 (unused) + {}, // DCT8x32 (unused) + {}, // DCT32x16 (unused) + {}, // DCT16x32 (unused) + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 1, 1, 1, 1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // DCT4x8 + { + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + 0, 0, 0, 0, 1, 0, 0, 0, // + }, // DCT8x4 + { + 1, 1, 1, 1, 1, 0, 0, 0, // + 1, 1, 1, 1, 0, 0, 0, 0, // + 1, 1, 1, 0, 0, 0, 0, 0, // + 1, 1, 0, 0, 0, 0, 0, 0, // + 1, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + }, // AFV0 + { + 0, 0, 0, 0, 1, 1, 1, 1, // + 0, 0, 0, 0, 0, 1, 1, 1, // + 0, 0, 0, 0, 0, 0, 1, 1, // + 0, 0, 0, 0, 0, 0, 0, 1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 
0, 0, 0, 0, // + }, // AFV1 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 1, 0, 0, 0, 0, 0, 0, 0, // + 1, 1, 0, 0, 0, 0, 0, 0, // + 1, 1, 1, 0, 0, 0, 0, 0, // + 1, 1, 1, 1, 0, 0, 0, 0, // + }, // AFV2 + { + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 1, // + 0, 0, 0, 0, 0, 0, 1, 1, // + 0, 0, 0, 0, 0, 1, 1, 1, // + }, // AFV3 + }; + return kMask[raw_strategy]; +} + +void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, + size_t ysize, const char* tag, AuxOut* aux_out) { + Image3F color_acs(xsize, ysize); + for (size_t y = 0; y < ysize; y++) { + float* JXL_RESTRICT rows[3] = { + color_acs.PlaneRow(0, y), + color_acs.PlaneRow(1, y), + color_acs.PlaneRow(2, y), + }; + const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim); + for (size_t x = 0; x < xsize; x++) { + AcStrategy acs = acs_row[x / kBlockDim]; + const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); + for (size_t c = 0; c < 3; c++) { + rows[c][x] = color[c] / 255.f; + } + } + } + size_t stride = color_acs.PixelsPerRow(); + for (size_t c = 0; c < 3; c++) { + for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) { + float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim); + const AcStrategyRow acs_row = ac_strategy.ConstRow(by); + for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy()); + const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy()); + if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) { + for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize; + iy++) { + for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize; + ix++) { + if (mask[iy * kBlockDim + ix]) { + row[iy * stride + bx 
* kBlockDim + ix] = color[c] / 800.f; + } + } + } + } + // draw block edges + for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() && + bx * kBlockDim + ix < xsize; + ix++) { + row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f; + } + for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() && + by * kBlockDim + iy < ysize; + iy++) { + row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f; + } + } + } + } + aux_out->DumpImage(tag, color_acs); +} + +} // namespace jxl +#endif // LIB_JXL_ENC_AC_STRATEGY_ + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +bool MultiBlockTransformCrossesHorizontalBoundary( + const AcStrategyImage& ac_strategy, size_t start_x, size_t y, + size_t end_x) { + if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) { + return false; + } + if (y % 8 == 0) { + // Nothing crosses 64x64 boundaries, and the memory on the other side + // of the 64x64 block may still uninitialized. + return false; + } + end_x = std::min(end_x, ac_strategy.xsize()); + // The first multiblock might be before the start_x, let's adjust it + // to point to the first IsFirstBlock() == true block we find by backward + // tracing. + AcStrategyRow row = ac_strategy.ConstRow(y); + const size_t start_x_limit = start_x & ~7; + while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) { + --start_x; + } + for (size_t x = start_x; x < end_x;) { + if (row[x].IsFirstBlock()) { + x += row[x].covered_blocks_x(); + } else { + return true; + } + } + return false; +} + +bool MultiBlockTransformCrossesVerticalBoundary( + const AcStrategyImage& ac_strategy, size_t x, size_t start_y, + size_t end_y) { + if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) { + return false; + } + if (x % 8 == 0) { + // Nothing crosses 64x64 boundaries, and the memory on the other side + // of the 64x64 block may still uninitialized. 
+ return false; + } + end_y = std::min(end_y, ac_strategy.ysize()); + // The first multiblock might be before the start_y, let's adjust it + // to point to the first IsFirstBlock() == true block we find by backward + // tracing. + const size_t start_y_limit = start_y & ~7; + while (start_y != start_y_limit && + !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) { + --start_y; + } + + for (size_t y = start_y; y < end_y;) { + AcStrategyRow row = ac_strategy.ConstRow(y); + if (row[x].IsFirstBlock()) { + y += row[x].covered_blocks_y(); + } else { + return true; + } + } + return false; +} + +float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y, + const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, float* block, + float* scratch_space, uint32_t* quantized) { + const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize; + + // Apply transform. + for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT block_c = block + size * c; + TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y), + config.src_stride, block_c, scratch_space); + } + + HWY_FULL(float) df; + + const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y(); + float quant_norm8 = 0; + float masking = 0; + if (num_blocks == 1) { + // When it is only one 8x8, we don't need aggregation of values. + quant_norm8 = config.Quant(x / 8, y / 8); + masking = 2.0f * config.Masking(x / 8, y / 8); + } else if (num_blocks == 2) { + // Taking max instead of 8th norm seems to work + // better for smallest blocks up to 16x8. Jyrki couldn't get + // improvements in trying the same for 16x16 blocks. 
+ if (acs.covered_blocks_y() == 2) { + quant_norm8 = + std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1)); + masking = 2.0f * std::max(config.Masking(x / 8, y / 8), + config.Masking(x / 8, y / 8 + 1)); + } else { + quant_norm8 = + std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8)); + masking = 2.0f * std::max(config.Masking(x / 8, y / 8), + config.Masking(x / 8 + 1, y / 8)); + } + } else { + float masking_norm2 = 0; + float masking_max = 0; + // Load QF value, calculate empirical heuristic on masking field + // for weighting the information loss. Information loss manifests + // itself as ringing, and masking could hide it. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + float qval = config.Quant(x / 8 + ix, y / 8 + iy); + qval *= qval; + qval *= qval; + quant_norm8 += qval * qval; + float maskval = config.Masking(x / 8 + ix, y / 8 + iy); + masking_max = std::max(masking_max, maskval); + masking_norm2 += maskval * maskval; + } + } + quant_norm8 /= num_blocks; + quant_norm8 = FastPowf(quant_norm8, 1.0f / 8.0f); + masking_norm2 = sqrt(masking_norm2 / num_blocks); + // This is a highly empirical formula. + masking = (masking_norm2 + masking_max); + } + const auto q = Set(df, quant_norm8); + + // Compute entropy. 
+ float entropy = config.base_entropy; + auto info_loss = Zero(df); + auto info_loss2 = Zero(df); + + for (size_t c = 0; c < 3; c++) { + const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c); + const auto cmap_factor = Set(df, cmap_factors[c]); + + auto entropy_v = Zero(df); + auto nzeros_v = Zero(df); + auto cost1 = Set(df, config.cost1); + auto cost2 = Set(df, config.cost2); + auto cost_delta = Set(df, config.cost_delta); + for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) { + const auto in = Load(df, block + c * size + i); + const auto in_y = Load(df, block + size + i) * cmap_factor; + const auto im = Load(df, inv_matrix + i); + const auto val = (in - in_y) * im * q; + const auto rval = Round(val); + const auto diff = AbsDiff(val, rval); + info_loss += diff; + info_loss2 += diff * diff; + const auto q = Abs(rval); + const auto q_is_zero = q == Zero(df); + entropy_v += IfThenElseZero(q >= Set(df, 1.5f), cost2); + // We used to have q * C here, but that cost model seems to + // be punishing large values more than necessary. Sqrt tries + // to avoid large values less aggressively. Having high accuracy + // around zero is most important at low qualities, and there + // we have directly specified costs for 0, 1, and 2. + entropy_v += Sqrt(q) * cost_delta; + nzeros_v += IfThenZeroElse(q_is_zero, Set(df, 1.0f)); + } + entropy_v += nzeros_v * cost1; + + entropy += GetLane(SumOfLanes(entropy_v)); + size_t num_nzeros = GetLane(SumOfLanes(nzeros_v)); + // Add #bit of num_nonzeros, as an estimate of the cost for encoding the + // number of non-zeros of the block. + size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1; + // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a + // bias. 
+ entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits); + } + float ret = + entropy + + masking * + ((config.info_loss_multiplier * GetLane(SumOfLanes(info_loss))) + + (config.info_loss_multiplier2 * + sqrt(num_blocks * GetLane(SumOfLanes(info_loss2))))); + return ret; +} + +uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier, + const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, + float* block, float* scratch_space, + uint32_t* quantized, float* entropy_out) { + struct TransformTry8x8 { + AcStrategy::Type type; + int encoding_speed_tier_max_limit; + float entropy_add; + float entropy_mul; + }; + static const TransformTry8x8 kTransforms8x8[] = { + { + AcStrategy::Type::DCT, + 9, + 3.0f, + 0.745f, + }, + { + AcStrategy::Type::DCT4X4, + 5, + 4.0f, + 1.0179946967008329f, + }, + { + AcStrategy::Type::DCT2X2, + 4, + 4.0f, + 0.76721119707580943f, + }, + { + AcStrategy::Type::DCT4X8, + 5, + 0.0f, + 0.700754622182473063f, + }, + { + AcStrategy::Type::DCT8X4, + 5, + 0.0f, + 0.700754622182473063f, + }, + { + AcStrategy::Type::IDENTITY, + 5, + 8.0f, + 0.81217614513585534f, + }, + { + AcStrategy::Type::AFV0, + 4, + 3.0f, + 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV1, + 4, + 3.0f, + 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV2, + 4, + 3.0f, + 0.70086131125719425f, + }, + { + AcStrategy::Type::AFV3, + 4, + 3.0f, + 0.70086131125719425f, + }, + }; + double best = 1e30; + uint8_t best_tx = kTransforms8x8[0].type; + for (auto tx : kTransforms8x8) { + if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) { + continue; + } + AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); + float entropy = EstimateEntropy(acs, x, y, config, cmap_factors, block, + scratch_space, quantized); + entropy = tx.entropy_add + tx.entropy_mul * entropy; + if (entropy < best) { + best_tx = tx.type; + best = entropy; + } + } + *entropy_out = best; + return best_tx; +} + +// bx, by 
addresses the 64x64 block at 8x8 subresolution +// cx, cy addresses the left, upper 8x8 block position of the candidate +// transform. +void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx, + size_t cy, const ACSConfig& config, + const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, + const float entropy_mul, const uint8_t candidate_priority, + uint8_t* priority, float* JXL_RESTRICT entropy_estimate, + float* block, float* scratch_space, uint32_t* quantized) { + AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); + float entropy_current = 0; + for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) { + if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) { + // Transform would reuse already allocated blocks and + // lead to invalid overlaps, for example DCT64X32 vs. + // DCT32X64. + return; + } + entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)]; + } + } + float entropy_candidate = + entropy_mul * EstimateEntropy(acs, (bx + cx) * 8, (by + cy) * 8, config, + cmap_factors, block, scratch_space, + quantized); + if (entropy_candidate >= entropy_current) return; + // Accept the candidate. 
+ for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + entropy_estimate[(cy + iy) * 8 + cx + ix] = 0; + priority[(cy + iy) * 8 + cx + ix] = candidate_priority; + } + } + ac_strategy->Set(bx + cx, by + cy, acs_raw); + entropy_estimate[cy * 8 + cx] = entropy_candidate; +} + +static void SetEntropyForTransform(size_t cx, size_t cy, + const AcStrategy::Type acs_raw, + float entropy, + float* JXL_RESTRICT entropy_estimate) { + const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw); + for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) { + for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) { + entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0; + } + } + entropy_estimate[cy * 8 + cx] = entropy; +} + +AcStrategy::Type AcsSquare(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT16X16; + } else if (blocks == 4) { + return AcStrategy::Type::DCT32X32; + } else { + return AcStrategy::Type::DCT64X64; + } +} + +AcStrategy::Type AcsVerticalSplit(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT16X8; + } else if (blocks == 4) { + return AcStrategy::Type::DCT32X16; + } else { + return AcStrategy::Type::DCT64X32; + } +} + +AcStrategy::Type AcsHorizontalSplit(size_t blocks) { + if (blocks == 2) { + return AcStrategy::Type::DCT8X16; + } else if (blocks == 4) { + return AcStrategy::Type::DCT16X32; + } else { + return AcStrategy::Type::DCT32X64; + } +} + +// The following function tries to merge smaller transforms into +// squares and the rectangles originating from a single middle division +// (horizontal or vertical) fairly. +// +// This is now generalized to concern about squares +// of blocks X blocks size, where a block is 8x8 pixels. 
+void FindBestFirstLevelDivisionForSquare( + size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx, + size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors, + AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK, + const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate, + float* block, float* scratch_space, uint32_t* quantized) { + // We denote J for the larger dimension here, and K for the smaller. + // For example, for 32x32 block splitting, J would be 32, K 16. + const size_t blocks_half = blocks / 2; + const AcStrategy::Type acs_rawJXK = AcsVerticalSplit(blocks); + const AcStrategy::Type acs_rawKXJ = AcsHorizontalSplit(blocks); + const AcStrategy::Type acs_rawJXJ = AcsSquare(blocks); + const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK); + const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ); + const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ); + AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0); + AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half); + // Let's check if we can consider a JXJ block here at all. + // This is not necessary in the basic use of hierarchically merging + // blocks in the simplest possible way, but is needed when we try other + // 'floating' options of merging, possibly after a simple hierarchical + // merge has been explored. + if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx, + by + cy, bx + cx + blocks) || + MultiBlockTransformCrossesHorizontalBoundary( + *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) || + MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy, + by + cy + blocks) || + MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks, + by + cy, by + cy + blocks)) { + return; // not suitable for JxJ analysis, some transforms leak out. 
+ } + // For floating transforms there may be + // already blocks selected that make either or both JXK and + // KXJ not feasible for this location. + const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary( + *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks); + const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary( + *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks); + // Current entropies aggregated on NxN resolution. + float entropy[2][2] = {}; + for (size_t dy = 0; dy < blocks; ++dy) { + for (size_t dx = 0; dx < blocks; ++dx) { + entropy[dy / blocks_half][dx / blocks_half] += + entropy_estimate[(cy + dy) * 8 + (cx + dx)]; + } + } + float entropy_JXK_left = std::numeric_limits::max(); + float entropy_JXK_right = std::numeric_limits::max(); + float entropy_KXJ_top = std::numeric_limits::max(); + float entropy_KXJ_bottom = std::numeric_limits::max(); + float entropy_JXJ = std::numeric_limits::max(); + if (allow_JXK) { + if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) { + entropy_JXK_left = + entropy_mul_JXK * + EstimateEntropy(acsJXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, quantized); + } + if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) { + entropy_JXK_right = + entropy_mul_JXK * EstimateEntropy(acsJXK, (bx + cx + blocks_half) * 8, + (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, + quantized); + } + } + if (allow_KXJ) { + if (row0[bx + cx].RawStrategy() != acs_rawKXJ) { + entropy_KXJ_top = + entropy_mul_JXK * + EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config, + cmap_factors, block, scratch_space, quantized); + } + if (row1[bx + cx].RawStrategy() != acs_rawKXJ) { + entropy_KXJ_bottom = + entropy_mul_JXK * EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, + (by + cy + blocks_half) * 8, config, + cmap_factors, block, scratch_space, + quantized); + } + } + if (allow_square_transform) { + // We control the exploration of 
the square transform separately so that + // we can turn it off at high decoding speeds for 32x32, but still allow + // exploring 16x32 and 32x16. + entropy_JXJ = entropy_mul_JXJ * EstimateEntropy(acsJXJ, (bx + cx + 0) * 8, + (by + cy + 0) * 8, config, + cmap_factors, block, + scratch_space, quantized); + } + + // Test if this block should have JXK or KXJ transforms, + // because it can have only one or the other. + float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) + + std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]); + float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) + + std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]); + if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) { + ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ); + SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate); + } else if (costJxN < costNxJ) { + if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) { + ac_strategy->Set(bx + cx, by + cy, acs_rawJXK); + SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left, + entropy_estimate); + } + if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) { + ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK); + SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK, + entropy_JXK_right, entropy_estimate); + } + } else { + if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) { + ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ); + SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top, + entropy_estimate); + } + if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) { + ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ); + SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ, + entropy_KXJ_bottom, entropy_estimate); + } + } +} + +void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state, + const ACSConfig& config, const Rect& rect) { + // Main philosophy here: + // 1. First find best 8x8 transform for each area. + // 2. 
Merging them into larger transforms where possibly, but + // starting from the smallest transforms (16x8 and 8x16). + // Additional complication: 16x8 and 8x16 are considered + // simultanouesly and fairly against each other. + // We are looking at 64x64 squares since the YtoX and YtoB + // maps happen to be at that resolution, and having + // integral transforms cross these boundaries leads to + // additional complications. + const CompressParams& cparams = enc_state->cparams; + const float butteraugli_target = cparams.butteraugli_distance; + AcStrategyImage* ac_strategy = &enc_state->shared.ac_strategy; + // TODO(veluca): reuse allocations + auto mem = hwy::AllocateAligned(5 * AcStrategy::kMaxCoeffArea); + auto qmem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + uint32_t* JXL_RESTRICT quantized = qmem.get(); + float* JXL_RESTRICT block = mem.get(); + float* JXL_RESTRICT scratch_space = mem.get() + 3 * AcStrategy::kMaxCoeffArea; + size_t bx = rect.x0(); + size_t by = rect.y0(); + JXL_ASSERT(rect.xsize() <= 8); + JXL_ASSERT(rect.ysize() <= 8); + size_t tx = bx / kColorTileDimInBlocks; + size_t ty = by / kColorTileDimInBlocks; + const float cmap_factors[3] = { + enc_state->shared.cmap.YtoXRatio( + enc_state->shared.cmap.ytox_map.ConstRow(ty)[tx]), + 0.0f, + enc_state->shared.cmap.YtoBRatio( + enc_state->shared.cmap.ytob_map.ConstRow(ty)[tx]), + }; + if (cparams.speed_tier > SpeedTier::kHare) return; + // First compute the best 8x8 transform for each square. Later, we do not + // experiment with different combinations, but only use the best of the 8x8s + // when DCT8X8 is specified in the tree search. + // 8x8 transforms have 10 variants, but every larger transform is just a DCT. + float entropy_estimate[64] = {}; + // Favor all 8x8 transforms (against 16x8 and larger transforms)) at + // low butteraugli_target distances. 
+ static const float k8x8mul1 = -0.55; + static const float k8x8mul2 = 1.0735757687292623f; + static const float k8x8base = 1.4; + const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base); + for (size_t iy = 0; iy < rect.ysize(); iy++) { + for (size_t ix = 0; ix < rect.xsize(); ix++) { + float entropy = 0.0; + const uint8_t best_of_8x8s = FindBest8x8Transform( + 8 * (bx + ix), 8 * (by + iy), static_cast(cparams.speed_tier), + config, cmap_factors, ac_strategy, block, scratch_space, quantized, + &entropy); + ac_strategy->Set(bx + ix, by + iy, + static_cast(best_of_8x8s)); + entropy_estimate[iy * 8 + ix] = entropy * mul8x8; + } + } + // Merge when a larger transform is better than the previously + // searched best combination of 8x8 transforms. + struct MergeTry { + AcStrategy::Type type; + uint8_t priority; + uint8_t decoding_speed_tier_max_limit; + uint8_t encoding_speed_tier_max_limit; + float entropy_mul; + }; + static const float k8X16mul1 = -0.55; + static const float k8X16mul2 = 0.9019587899705066; + static const float k8X16base = 1.6; + const float entropy_mul16X8 = + k8X16mul2 + k8X16mul1 / (butteraugli_target + k8X16base); + // const float entropy_mul16X8 = mul8X16 * 0.91195782912371126f; + + static const float k16X16mul1 = -0.35; + static const float k16X16mul2 = 0.82098067020252011; + static const float k16X16base = 2.0; + const float entropy_mul16X16 = + k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base); + // const float entropy_mul16X16 = mul16X16 * 0.83183417727960129f; + + static const float k32X16mul1 = -0.1; + static const float k32X16mul2 = 0.86098067020252011; + static const float k32X16base = 2.5; + const float entropy_mul16X32 = + k32X16mul2 + k32X16mul1 / (butteraugli_target + k32X16base); + + const float entropy_mul32X32 = 0.9188333021616017f; + const float entropy_mul64X64 = 1.50f; + // TODO(jyrki): Consider this feedback in further changes: + // Also effectively when the multipliers for smaller blocks are + // 
below 1, this raises the bar for the bigger blocks even higher + // in that sense these constants are not independent (e.g. changing + // the constant for DCT16x32 by -5% (making it more likely) also + // means that DCT32x32 becomes harder to do when starting from + // two DCT16x32s). It might be better to make them more independent, + // e.g. by not applying the multiplier when storing the new entropy + // estimates in TryMergeToACSCandidate(). + const MergeTry kTransformsForMerge[9] = { + {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8}, + {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8}, + // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its + // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16}, + {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32}, + {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32}, + // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its + // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5, + // 0.9822994906548809f}, + // TODO(jyrki): re-enable 64x32 and 64x64 if/when possible. + {AcStrategy::Type::DCT64X32, 6, 1, 3, 1.27f}, + {AcStrategy::Type::DCT32X64, 6, 1, 3, 1.27f}, + // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f}, + }; + /* + These sizes not yet included in merge heuristic: + set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f); + set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f); + set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f); + set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f); + set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f); + set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f); + set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f); + set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f); + */ + + // Priority is a tricky kludge to avoid collisions so that transforms + // don't overlap. 
+ uint8_t priority[64] = {}; + for (auto tx : kTransformsForMerge) { + if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) { + continue; + } + AcStrategy acs = AcStrategy::FromRawStrategy(tx.type); + for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize(); + cy += acs.covered_blocks_y()) { + for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize(); + cx += acs.covered_blocks_x()) { + if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) { + if (cparams.decoding_speed_tier < 4 && + tx.type == AcStrategy::Type::DCT32X64) { + // We handle both DCT8X16 and DCT16X8 at the same time. + if ((cy | cx) % 8 == 0) { + FindBestFirstLevelDivisionForSquare( + 8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, + tx.entropy_mul, entropy_mul64X64, entropy_estimate, block, + scratch_space, quantized); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT32X16) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || + (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { + // already covered by FindBest32X32 + continue; + } + + if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) { + if (tx.type == AcStrategy::Type::DCT16X32) { + // We handle both DCT8X16 and DCT16X8 at the same time. + bool enable_32x32 = cparams.decoding_speed_tier < 4; + if ((cy | cx) % 4 == 0) { + FindBestFirstLevelDivisionForSquare( + 4, enable_32x32, bx, by, cx, cy, config, cmap_factors, + ac_strategy, tx.entropy_mul, entropy_mul32X32, + entropy_estimate, block, scratch_space, quantized); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT32X16) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. 
The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) || + (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) { + // already covered by FindBest32X32 + continue; + } + if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) { + if (tx.type == AcStrategy::Type::DCT8X16) { + // We handle both DCT8X16 and DCT16X8 at the same time. + if ((cy | cx) % 2 == 0) { + FindBestFirstLevelDivisionForSquare( + 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, + tx.entropy_mul, entropy_mul16X16, entropy_estimate, block, + scratch_space, quantized); + } + continue; + } else if (tx.type == AcStrategy::Type::DCT16X8) { + // We handled both DCT8X16 and DCT16X8 at the same time, + // and that is above. The last column and last row, + // when the last column or last row is odd numbered, + // are still handled by TryMergeAcs. + continue; + } + } + if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) || + (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) { + // already covered by FindBestFirstLevelDivisionForSquare + continue; + } + // All other merge sizes are handled here. + // Some of the DCT16X8s and DCT8X16s will still leak through here + // when there is an odd number of 8x8 blocks, then the last row + // and column will get their DCT16X8s and DCT8X16s through the + // normal integral transform merging process. + TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy, + tx.entropy_mul, tx.priority, &priority[0], entropy_estimate, + block, scratch_space, quantized); + } + } + } + // Here we still try to do some non-aligned matching, find a few more + // 16X8, 8X16 and 16X16s between the non-2-aligned blocks. 
+ if (cparams.speed_tier >= SpeedTier::kHare) { + return; + } + for (int ii = 0; ii < 3; ++ii) { + for (size_t cy = 1 - (ii == 1); cy + 1 < rect.ysize(); cy += 2) { + for (size_t cx = 1 - (ii == 2); cx + 1 < rect.xsize(); cx += 2) { + FindBestFirstLevelDivisionForSquare( + 2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy, + entropy_mul16X8, entropy_mul16X16, entropy_estimate, block, + scratch_space, quantized); + } + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ProcessRectACS); + +void AcStrategyHeuristics::Init(const Image3F& src, + PassesEncoderState* enc_state) { + this->enc_state = enc_state; + config.dequant = &enc_state->shared.matrices; + const CompressParams& cparams = enc_state->cparams; + const float butteraugli_target = cparams.butteraugli_distance; + + // Image row pointers and strides. + config.quant_field_row = enc_state->initial_quant_field.Row(0); + config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow(); + auto& mask = enc_state->initial_quant_masking; + if (mask.xsize() > 0 && mask.ysize() > 0) { + config.masking_field_row = mask.Row(0); + config.masking_field_stride = mask.PixelsPerRow(); + } + + config.src_rows[0] = src.ConstPlaneRow(0, 0); + config.src_rows[1] = src.ConstPlaneRow(1, 0); + config.src_rows[2] = src.ConstPlaneRow(2, 0); + config.src_stride = src.PixelsPerRow(); + + // Entropy estimate is composed of two factors: + // - estimate of the number of bits that will be used by the block + // - information loss due to quantization + // The following constant controls the relative weights of these components. + config.info_loss_multiplier = 138.0f; + config.info_loss_multiplier2 = 50.46839691767866; + // TODO(jyrki): explore base_entropy setting more. + // A small value (0?) 
works better at high distance, while a larger value + // may be more effective at low distance/high bpp. + config.base_entropy = 0.0; + config.zeros_mul = 7.565053364251793f; + // Lots of +1 and -1 coefficients at high quality, it is + // beneficial to favor them. At low qualities zeros matter more + // and +1 / -1 coefficients are already quite harmful. + float slope = std::min(1.0f, butteraugli_target * (1.0f / 3)); + config.cost1 = 1 + slope * 8.8703248061477744f; + config.cost2 = 4.4628149885273363f; + config.cost_delta = 5.3359184934516337f; + JXL_ASSERT(enc_state->shared.ac_strategy.xsize() == + enc_state->shared.frame_dim.xsize_blocks); + JXL_ASSERT(enc_state->shared.ac_strategy.ysize() == + enc_state->shared.frame_dim.ysize_blocks); +} + +void AcStrategyHeuristics::ProcessRect(const Rect& rect) { + PROFILER_FUNC; + const CompressParams& cparams = enc_state->cparams; + // In Falcon mode, use DCT8 everywhere and uniform quantization. + if (cparams.speed_tier >= SpeedTier::kCheetah) { + enc_state->shared.ac_strategy.FillDCT8(rect); + return; + } + HWY_DYNAMIC_DISPATCH(ProcessRectACS) + (enc_state, config, rect); +} + +void AcStrategyHeuristics::Finalize(AuxOut* aux_out) { + const auto& ac_strategy = enc_state->shared.ac_strategy; + // Accounting and debug output. 
+ if (aux_out != nullptr) { + aux_out->num_dct2_blocks = + 32 * (ac_strategy.CountBlocks(AcStrategy::Type::DCT32X64) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT64X32)); + aux_out->num_dct4_blocks = + 64 * ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64); + aux_out->num_dct4x8_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT4X8) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X4); + aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategy::Type::AFV0) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV1) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV2) + + ac_strategy.CountBlocks(AcStrategy::Type::AFV3); + aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT); + aux_out->num_dct8x16_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X16) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X8); + aux_out->num_dct8x32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT8X32) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X8); + aux_out->num_dct16_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X16); + aux_out->num_dct16x32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT16X32) + + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X16); + aux_out->num_dct32_blocks = + ac_strategy.CountBlocks(AcStrategy::Type::DCT32X32); + } + + if (WantDebugOutput(aux_out)) { + DumpAcStrategy(ac_strategy, enc_state->shared.frame_dim.xsize, + enc_state->shared.frame_dim.ysize, "ac_strategy", aux_out); + } +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.h new file mode 100644 index 0000000000..6cf82d524c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ac_strategy.h @@ -0,0 +1,79 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_AC_STRATEGY_H_ +#define LIB_JXL_ENC_AC_STRATEGY_H_ + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quant_weights.h" + +// `FindBestAcStrategy` uses heuristics to choose which AC strategy should be +// used in each block, as well as the initial quantization field. + +namespace jxl { + +// AC strategy selection: utility struct. + +struct ACSConfig { + const DequantMatrices* JXL_RESTRICT dequant; + float info_loss_multiplier; + float info_loss_multiplier2; + float* JXL_RESTRICT quant_field_row; + size_t quant_field_stride; + float* JXL_RESTRICT masking_field_row; + size_t masking_field_stride; + const float* JXL_RESTRICT src_rows[3]; + size_t src_stride; + // Cost for 1 (-1), 2 (-2) explicitly, cost for others computed with cost1 + + // cost2 + sqrt(q) * cost_delta. 
+ float cost1; + float cost2; + float cost_delta; + float base_entropy; + float zeros_mul; + const float& Pixel(size_t c, size_t x, size_t y) const { + return src_rows[c][y * src_stride + x]; + } + float Masking(size_t bx, size_t by) const { + JXL_DASSERT(masking_field_row[by * masking_field_stride + bx] > 0); + return masking_field_row[by * masking_field_stride + bx]; + } + float Quant(size_t bx, size_t by) const { + JXL_DASSERT(quant_field_row[by * quant_field_stride + bx] > 0); + return quant_field_row[by * quant_field_stride + bx]; + } + void SetQuant(size_t bx, size_t by, float value) const { + JXL_DASSERT(value > 0); + quant_field_row[by * quant_field_stride + bx] = value; + } +}; + +struct AcStrategyHeuristics { + void Init(const Image3F& src, PassesEncoderState* enc_state); + void ProcessRect(const Rect& rect); + void Finalize(AuxOut* aux_out); + ACSConfig config; + PassesEncoderState* enc_state; +}; + +// Debug. +void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize, + size_t ysize, const char* tag, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_AC_STRATEGY_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.cc new file mode 100644 index 0000000000..10f99b9c99 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.cc @@ -0,0 +1,1054 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_adaptive_quantization.h" + +#include +#include +#include + +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_adaptive_quantization.cc" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_group.h" +#include "lib/jxl/dec_reconstruct.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_group.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/gauss_blur.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Rebind; + +// The following functions modulate an exponent (out_val) and return the updated +// value. Their descriptor is limited to 8 lanes for 8x8 blocks. + +// Hack for mask estimation. Eventually replace this code with butteraugli's +// masking. 
+float ComputeMaskForAcStrategyUse(const float out_val) { + const float kMul = 1.0f; + const float kOffset = 0.4f; + return kMul / (out_val + kOffset); +} + +template +V ComputeMask(const D d, const V out_val) { + const auto kBase = Set(d, -0.74174993f); + const auto kMul4 = Set(d, 3.2353257320940401f); + const auto kMul2 = Set(d, 12.906028311180409f); + const auto kOffset2 = Set(d, 305.04035728311436f); + const auto kMul3 = Set(d, 5.0220313103171232f); + const auto kOffset3 = Set(d, 2.1925739705298404f); + const auto kOffset4 = Set(d, 0.25f) * kOffset3; + const auto kMul0 = Set(d, 0.74760422233706747f); + const auto k1 = Set(d, 1.0f); + + // Avoid division by zero. + const auto v1 = Max(out_val * kMul0, Set(d, 1e-3f)); + const auto v2 = k1 / (v1 + kOffset2); + const auto v3 = k1 / MulAdd(v1, v1, kOffset3); + const auto v4 = k1 / MulAdd(v1, v1, kOffset4); + // TODO(jyrki): + // A log or two here could make sense. In butteraugli we have effectively + // log(log(x + C)) for this kind of use, as a single log is used in + // saturating visual masking and here the modulation values are exponential, + // another log would counter that. + return kBase + MulAdd(kMul4, v4, MulAdd(kMul2, v2, kMul3 * v3)); +} + +// For converting full vectors to a subset. Assumes `vfull` lanes are identical. +template +Vec CapTo(const D d, VFull vfull) { + using T = typename D::T; + const HWY_FULL(T) dfull; + HWY_ALIGN T lanes[MaxLanes(dfull)]; + Store(vfull, dfull, lanes); + return Load(d, lanes); +} + +// mul and mul2 represent a scaling difference between jxl and butteraugli. +static const float kSGmul = 226.0480446705883f; +static const float kSGmul2 = 1.0f / 73.377132366608819f; +static const float kLog2 = 0.693147181f; +// Includes correction factor for std::log -> log2. 
+static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2; +static const float kSGVOffset = 7.14672470003f; + +template +V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) { + // The opsin space in jxl is the cubic root of photons, i.e., v * v * v + // is related to the number of photons. + // + // SimpleGamma(v * v * v) is the psychovisual space in butteraugli. + // This ratio allows quantization to move from jxl's opsin space to + // butteraugli's log-gamma space. + v = ZeroIfNegative(v); + const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul); + const auto kVOffset = Set(d, kSGVOffset * kLog2); + const auto kDenMul = Set(d, kLog2 * kSGmul); + + const auto v2 = v * v; + + const auto num = kNumMul * v2; + const auto den = MulAdd(kDenMul * v, v2, kVOffset); + return invert ? num / den : den / num; +} + +template +static float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) { + using DScalar = HWY_CAPPED(float, 1); + auto vscalar = Load(DScalar(), &v); + return GetLane( + RatioOfDerivativesOfCubicRootToSimpleGamma(DScalar(), vscalar)); +} + +// TODO(veluca): this function computes an approximation of the derivative of +// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or +// exact derivatives. For reference, SimpleGamma was: +/* +template +V SimpleGamma(const D d, V v) { + // A simple HDR compatible gamma function. + const auto mul = Set(d, kSGmul); + const auto kRetMul = Set(d, kSGRetMul); + const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f); + const auto kVOffset = Set(d, kSGVOffset); + + v *= mul; + + // This should happen rarely, but may lead to a NaN, which is rather + // undesirable. Since negative photons don't exist we solve the NaNs by + // clamping here. + // TODO(veluca): with FastLog2f, this no longer leads to NaNs. 
+ v = ZeroIfNegative(v); + return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd; +} +*/ + +template +V GammaModulation(const D d, const size_t x, const size_t y, + const ImageF& xyb_x, const ImageF& xyb_y, const V out_val) { + const float kBias = 0.16f; + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[0]); + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[1]); + JXL_DASSERT(kBias > kOpsinAbsorbanceBias[2]); + auto overall_ratio = Zero(d); + auto bias = Set(d, kBias); + auto half = Set(d, 0.5f); + for (size_t dy = 0; dy < 8; ++dy) { + const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy); + const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy); + for (size_t dx = 0; dx < 8; dx += Lanes(d)) { + const auto iny = Load(d, row_in_y + x + dx) + bias; + const auto inx = Load(d, row_in_x + x + dx); + const auto r = iny - inx; + const auto g = iny + inx; + const auto ratio_r = + RatioOfDerivativesOfCubicRootToSimpleGamma(d, r); + const auto ratio_g = + RatioOfDerivativesOfCubicRootToSimpleGamma(d, g); + const auto avg_ratio = half * (ratio_r + ratio_g); + + overall_ratio += avg_ratio; + } + } + overall_ratio = SumOfLanes(overall_ratio); + overall_ratio *= Set(d, 1.0f / 64); + // ideally -1.0, but likely optimal correction adds some entropy, so slightly + // less than that. + // ln(2) constant folded in because we want std::log but have FastLog2f. + const auto kGam = Set(d, -0.15526878023684174f * 0.693147180559945f); + return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val); +} + +// Change precision in 8x8 blocks that have high frequency content. +template +V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb, + const V out_val) { + // Zero out the invalid differences for the rightmost value per row. 
+ const Rebind du; + HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u, + ~0u, ~0u, ~0u, 0}; + + auto sum = Zero(d); // sum of absolute differences with right and below + + for (size_t dy = 0; dy < 8; ++dy) { + const float* JXL_RESTRICT row_in = xyb.Row(y + dy) + x; + const float* JXL_RESTRICT row_in_next = + dy == 7 ? row_in : xyb.Row(y + dy + 1) + x; + + // In SCALAR, there is no guarantee of having extra row padding. + // Hence, we need to ensure we don't access pixels outside the row itself. + // In SIMD modes, however, rows are padded, so it's safe to access one + // garbage value after the row. The vector then gets masked with kMaskRight + // to remove the influence of that value. +#if HWY_TARGET != HWY_SCALAR + for (size_t dx = 0; dx < 8; dx += Lanes(d)) { +#else + for (size_t dx = 0; dx < 7; dx += Lanes(d)) { +#endif + const auto p = Load(d, row_in + dx); + const auto pr = LoadU(d, row_in + dx + 1); + const auto mask = BitCast(d, Load(du, kMaskRight + dx)); + sum += And(mask, AbsDiff(p, pr)); + + const auto pd = Load(d, row_in_next + dx); + sum += AbsDiff(p, pd); + } + } + + sum = SumOfLanes(sum); + return MulAdd(sum, Set(d, -2.0052193233688884f / 112), out_val); +} + +void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x, + const ImageF& xyb_y, const float scale, + const Rect& rect, ImageF* out) { + JXL_ASSERT(SameSize(xyb_x, xyb_y)); + JXL_ASSERT(DivCeil(xyb_x.xsize(), kBlockDim) == out->xsize()); + JXL_ASSERT(DivCeil(xyb_x.ysize(), kBlockDim) == out->ysize()); + + float base_level = 0.5f * scale; + float kDampenRampStart = 7.0f; + float kDampenRampEnd = 14.0f; + float dampen = 1.0f; + if (butteraugli_target >= kDampenRampStart) { + dampen = 1.0f - ((butteraugli_target - kDampenRampStart) / + (kDampenRampEnd - kDampenRampStart)); + if (dampen < 0) { + dampen = 0; + } + } + const float mul = scale * dampen; + const float add = (1.0f - dampen) * base_level; + for (size_t iy = rect.y0(); iy < rect.y0() + 
rect.ysize(); iy++) { + const size_t y = iy * 8; + float* const JXL_RESTRICT row_out = out->Row(iy); + const HWY_CAPPED(float, kBlockDim) df; + for (size_t ix = rect.x0(); ix < rect.x0() + rect.xsize(); ix++) { + size_t x = ix * 8; + auto out_val = Set(df, row_out[ix]); + out_val = ComputeMask(df, out_val); + out_val = HfModulation(df, x, y, xyb_y, out_val); + out_val = GammaModulation(df, x, y, xyb_x, xyb_y, out_val); + // We want multiplicative quantization field, so everything + // until this point has been modulating the exponent. + row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add; + } + } +} + +template +V MaskingSqrt(const D d, V v) { + static const float kLogOffset = 26.481471032459346f; + static const float kMul = 211.50759899638012f; + const auto mul_v = Set(d, kMul * 1e8); + const auto offset_v = Set(d, kLogOffset); + return Set(d, 0.25f) * Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)); +} + +float MaskingSqrt(const float v) { + using DScalar = HWY_CAPPED(float, 1); + auto vscalar = Load(DScalar(), &v); + return GetLane(MaskingSqrt(DScalar(), vscalar)); +} + +void StoreMin4(const float v, float& min0, float& min1, float& min2, + float& min3) { + if (v < min3) { + if (v < min0) { + min3 = min2; + min2 = min1; + min1 = min0; + min0 = v; + } else if (v < min1) { + min3 = min2; + min2 = min1; + min1 = v; + } else if (v < min2) { + min3 = min2; + min2 = v; + } else { + min3 = v; + } + } +} + +// Look for smooth areas near the area of degradation. +// If the areas are generally smooth, don't do masking. +// Output is downsampled 2x. 
+void FuzzyErosion(const Rect& from_rect, const ImageF& from, + const Rect& to_rect, ImageF* to) { + const size_t xsize = from.xsize(); + const size_t ysize = from.ysize(); + constexpr int kStep = 1; + static_assert(kStep == 1, "Step must be 1"); + JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize()); + JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize()); + for (size_t fy = 0; fy < from_rect.ysize(); ++fy) { + size_t y = fy + from_rect.y0(); + size_t ym1 = y >= kStep ? y - kStep : y; + size_t yp1 = y + kStep < ysize ? y + kStep : y; + const float* rowt = from.Row(ym1); + const float* row = from.Row(y); + const float* rowb = from.Row(yp1); + float* row_out = to_rect.Row(to, fy / 2); + for (size_t fx = 0; fx < from_rect.xsize(); ++fx) { + size_t x = fx + from_rect.x0(); + size_t xm1 = x >= kStep ? x - kStep : x; + size_t xp1 = x + kStep < xsize ? x + kStep : x; + float min0 = row[x]; + float min1 = row[xm1]; + float min2 = row[xp1]; + float min3 = rowt[xm1]; + // Sort the first four values. + if (min0 > min1) std::swap(min0, min1); + if (min0 > min2) std::swap(min0, min2); + if (min0 > min3) std::swap(min0, min3); + if (min1 > min2) std::swap(min1, min2); + if (min1 > min3) std::swap(min1, min3); + if (min2 > min3) std::swap(min2, min3); + // The remaining five values of a 3x3 neighbourhood. 
+ StoreMin4(rowt[x], min0, min1, min2, min3); + StoreMin4(rowt[xp1], min0, min1, min2, min3); + StoreMin4(rowb[xm1], min0, min1, min2, min3); + StoreMin4(rowb[x], min0, min1, min2, min3); + StoreMin4(rowb[xp1], min0, min1, min2, min3); + static const float kMulC = 0.05f; + static const float kMul0 = 0.05f; + static const float kMul1 = 0.05f; + static const float kMul2 = 0.05f; + static const float kMul3 = 0.05f; + float v = kMulC * row[x] + kMul0 * min0 + kMul1 * min1 + kMul2 * min2 + + kMul3 * min3; + if (fx % 2 == 0 && fy % 2 == 0) { + row_out[fx / 2] = v; + } else { + row_out[fx / 2] += v; + } + } + } +} + +struct AdaptiveQuantizationImpl { + void Init(const Image3F& xyb) { + JXL_DASSERT(xyb.xsize() % kBlockDim == 0); + JXL_DASSERT(xyb.ysize() % kBlockDim == 0); + const size_t xsize = xyb.xsize(); + const size_t ysize = xyb.ysize(); + aq_map = ImageF(xsize / kBlockDim, ysize / kBlockDim); + } + void PrepareBuffers(size_t num_threads) { + diff_buffer = ImageF(kEncTileDim + 8, num_threads); + for (size_t i = pre_erosion.size(); i < num_threads; i++) { + pre_erosion.emplace_back(kEncTileDimInBlocks * 2 + 2, + kEncTileDimInBlocks * 2 + 2); + } + } + + void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb, + const Rect& rect, const int thread, ImageF* mask) { + PROFILER_ZONE("aq DiffPrecompute"); + const size_t xsize = xyb.xsize(); + const size_t ysize = xyb.ysize(); + + // The XYB gamma is 3.0 to be able to decode faster with two muls. + // Butteraugli's gamma is matching the gamma of human eye, around 2.6. + // We approximate the gamma difference by adding one cubic root into + // the adaptive quantization. This gives us a total gamma of 2.6666 + // for quantization uses. 
+ const float match_gamma_offset = 0.019; + + const HWY_FULL(float) df; + const float kXMul = 23.426802998210313f; + const auto kXMulv = Set(df, kXMul); + + size_t y_start = rect.y0() * 8; + size_t y_end = y_start + rect.ysize() * 8; + + size_t x0 = rect.x0() * 8; + size_t x1 = x0 + rect.xsize() * 8; + if (x0 != 0) x0 -= 4; + if (x1 != xyb.xsize()) x1 += 4; + if (y_start != 0) y_start -= 4; + if (y_end != xyb.ysize()) y_end += 4; + pre_erosion[thread].ShrinkTo((x1 - x0) / 4, (y_end - y_start) / 4); + + // Computes image (padded to multiple of 8x8) of local pixel differences. + // Subsample both directions by 4. + for (size_t y = y_start; y < y_end; ++y) { + size_t y2 = y + 1 < ysize ? y + 1 : y; + size_t y1 = y > 0 ? y - 1 : y; + + const float* row_in = xyb.PlaneRow(1, y); + const float* row_in1 = xyb.PlaneRow(1, y1); + const float* row_in2 = xyb.PlaneRow(1, y2); + const float* row_x_in = xyb.PlaneRow(0, y); + const float* row_x_in1 = xyb.PlaneRow(0, y1); + const float* row_x_in2 = xyb.PlaneRow(0, y2); + float* JXL_RESTRICT row_out = diff_buffer.Row(thread); + + auto scalar_pixel = [&](size_t x) { + const size_t x2 = x + 1 < xsize ? x + 1 : x; + const size_t x1 = x > 0 ? x - 1 : x; + const float base = + 0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]); + const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma( + row_in[x] + match_gamma_offset); + float diff = gammac * (row_in[x] - base); + diff *= diff; + const float base_x = + 0.25f * (row_x_in2[x] + row_x_in1[x] + row_x_in[x1] + row_x_in[x2]); + float diff_x = gammac * (row_x_in[x] - base_x); + diff_x *= diff_x; + diff += kXMul * diff_x; + diff = MaskingSqrt(diff); + if ((y % 4) != 0) { + row_out[x - x0] += diff; + } else { + row_out[x - x0] = diff; + } + }; + + size_t x = x0; + // First pixel of the row. 
+ if (x0 == 0) { + scalar_pixel(x0); + ++x; + } + // SIMD + const auto match_gamma_offset_v = Set(df, match_gamma_offset); + const auto quarter = Set(df, 0.25f); + for (; x + 1 + Lanes(df) < x1; x += Lanes(df)) { + const auto in = LoadU(df, row_in + x); + const auto in_r = LoadU(df, row_in + x + 1); + const auto in_l = LoadU(df, row_in + x - 1); + const auto in_t = LoadU(df, row_in2 + x); + const auto in_b = LoadU(df, row_in1 + x); + auto base = quarter * (in_r + in_l + in_t + in_b); + auto gammacv = + RatioOfDerivativesOfCubicRootToSimpleGamma( + df, in + match_gamma_offset_v); + auto diff = gammacv * (in - base); + diff *= diff; + + const auto in_x = LoadU(df, row_x_in + x); + const auto in_x_r = LoadU(df, row_x_in + x + 1); + const auto in_x_l = LoadU(df, row_x_in + x - 1); + const auto in_x_t = LoadU(df, row_x_in2 + x); + const auto in_x_b = LoadU(df, row_x_in1 + x); + auto base_x = quarter * (in_x_r + in_x_l + in_x_t + in_x_b); + auto diff_x = gammacv * (in_x - base_x); + diff_x *= diff_x; + diff += kXMulv * diff_x; + diff = MaskingSqrt(df, diff); + if ((y & 3) != 0) { + diff += LoadU(df, row_out + x - x0); + } + StoreU(diff, df, row_out + x - x0); + } + // Scalar + for (; x < x1; ++x) { + scalar_pixel(x); + } + if (y % 4 == 3) { + float* row_dout = pre_erosion[thread].Row((y - y_start) / 4); + for (size_t x = 0; x < (x1 - x0) / 4; x++) { + row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] + + row_out[x * 4 + 2] + row_out[x * 4 + 3]) * + 0.25f; + } + } + } + Rect from_rect(x0 % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 
0 : 1, + rect.xsize() * 2, rect.ysize() * 2); + FuzzyErosion(from_rect, pre_erosion[thread], rect, &aq_map); + for (size_t y = 0; y < rect.ysize(); ++y) { + const float* aq_map_row = rect.ConstRow(aq_map, y); + float* mask_row = rect.Row(mask, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]); + } + } + PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1), scale, + rect, &aq_map); + } + std::vector pre_erosion; + ImageF aq_map; + ImageF diff_buffer; +}; + +ImageF AdaptiveQuantizationMap(const float butteraugli_target, + const Image3F& xyb, + const FrameDimensions& frame_dim, float scale, + ThreadPool* pool, ImageF* mask) { + PROFILER_ZONE("aq AdaptiveQuantMap"); + + AdaptiveQuantizationImpl impl; + impl.Init(xyb); + *mask = ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + RunOnPool( + pool, 0, + DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks) * + DivCeil(frame_dim.ysize_blocks, kEncTileDimInBlocks), + [&](size_t num_threads) { + impl.PrepareBuffers(num_threads); + return true; + }, + [&](const int tid, int thread) { + size_t n_enc_tiles = + DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = + std::min((ty + 1) * kEncTileDimInBlocks, frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = + std::min((tx + 1) * kEncTileDimInBlocks, frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + impl.ComputeTile(butteraugli_target, scale, xyb, r, thread, mask); + }, + "AQ DiffPrecompute"); + + return std::move(impl).aq_map; +} + +} // namespace + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(AdaptiveQuantizationMap); + +namespace { +bool FLAGS_log_search_state = false; +// If true, prints the 
quantization maps at each iteration. +bool FLAGS_dump_quant_state = false; + +void DumpHeatmap(const AuxOut* aux_out, const std::string& label, + const ImageF& image, float good_threshold, + float bad_threshold) { + Image3F heatmap = CreateHeatMapImage(image, good_threshold, bad_threshold); + char filename[200]; + snprintf(filename, sizeof(filename), "%s%05d", label.c_str(), + aux_out->num_butteraugli_iters); + aux_out->DumpImage(filename, heatmap); +} + +void DumpHeatmaps(const AuxOut* aux_out, float ba_target, + const ImageF& quant_field, const ImageF& tile_heatmap, + const ImageF& bt_diffmap) { + if (!WantDebugOutput(aux_out)) return; + ImageF inv_qmap(quant_field.xsize(), quant_field.ysize()); + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* JXL_RESTRICT row_q = quant_field.ConstRow(y); + float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + row_inv_q[x] = 1.0f / row_q[x]; // never zero + } + } + DumpHeatmap(aux_out, "quant_heatmap", inv_qmap, 4.0f * ba_target, + 6.0f * ba_target); + DumpHeatmap(aux_out, "tile_heatmap", tile_heatmap, ba_target, + 1.5f * ba_target); + // matches heat maps produced by the command line tool. 
+ DumpHeatmap(aux_out, "bt_diffmap", bt_diffmap, ButteraugliFuzzyInverse(1.5), + ButteraugliFuzzyInverse(0.5)); +} + +ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin, + const AcStrategyImage& ac_strategy) { + PROFILER_FUNC; + const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size; + const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size; + ImageF tile_distmap(tile_xsize, tile_ysize); + size_t distmap_stride = tile_distmap.PixelsPerRow(); + for (int tile_y = 0; tile_y < tile_ysize; ++tile_y) { + AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(tile_y); + float* JXL_RESTRICT dist_row = tile_distmap.Row(tile_y); + for (int tile_x = 0; tile_x < tile_xsize; ++tile_x) { + AcStrategy acs = ac_strategy_row[tile_x]; + if (!acs.IsFirstBlock()) continue; + int this_tile_xsize = acs.covered_blocks_x() * tile_size; + int this_tile_ysize = acs.covered_blocks_y() * tile_size; + int y_begin = std::max(0, tile_size * tile_y - margin); + int y_end = std::min(distmap.ysize(), + tile_size * tile_y + this_tile_ysize + margin); + int x_begin = std::max(0, tile_size * tile_x - margin); + int x_end = std::min(distmap.xsize(), + tile_size * tile_x + this_tile_xsize + margin); + float dist_norm = 0.0; + double pixels = 0; + for (int y = y_begin; y < y_end; ++y) { + float ymul = 1.0; + constexpr float kBorderMul = 0.98f; + constexpr float kCornerMul = 0.7f; + if (margin != 0 && (y == y_begin || y == y_end - 1)) { + ymul = kBorderMul; + } + const float* const JXL_RESTRICT row = distmap.Row(y); + for (int x = x_begin; x < x_end; ++x) { + float xmul = ymul; + if (margin != 0 && (x == x_begin || x == x_end - 1)) { + if (xmul == 1.0) { + xmul = kBorderMul; + } else { + xmul = kCornerMul; + } + } + float v = row[x]; + v *= v; + v *= v; + v *= v; + v *= v; + dist_norm += xmul * v; + pixels += xmul; + } + } + if (pixels == 0) pixels = 1; + // 16th norm is less than the max norm, we reduce the difference + // with this normalization factor. 
+ constexpr float kTileNorm = 1.2f; + const float tile_dist = + kTileNorm * std::pow(dist_norm / pixels, 1.0f / 16.0f); + dist_row[tile_x] = tile_dist; + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + dist_row[tile_x + distmap_stride * iy + ix] = tile_dist; + } + } + } + } + return tile_distmap; +} + +constexpr float kDcQuantPow = 0.57f; +static const float kDcQuant = 1.12f; +static const float kAcQuant = 0.787f; + +void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin, + PassesEncoderState* enc_state, ThreadPool* pool, + AuxOut* aux_out) { + const CompressParams& cparams = enc_state->cparams; + Quantizer& quantizer = enc_state->shared.quantizer; + ImageI& raw_quant_field = enc_state->shared.raw_quant_field; + ImageF& quant_field = enc_state->initial_quant_field; + + const float butteraugli_target = cparams.butteraugli_distance; + ButteraugliParams params = cparams.ba_params; + params.intensity_target = linear.metadata()->IntensityTarget(); + // Hack the default intensity target value to be 80.0, the intensity + // target of sRGB images and a more reasonable viewing default than + // JPEG XL file format's default. 
+ if (fabs(params.intensity_target - 255.0f) < 1e-3) { + params.intensity_target = 80.0f; + } + JxlButteraugliComparator comparator(params); + JXL_CHECK(comparator.SetReferenceImage(linear)); + bool lower_is_better = + (comparator.GoodQualityScore() < comparator.BadQualityScore()); + const float initial_quant_dc = InitialQuantDC(butteraugli_target); + AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field), + &quant_field); + ImageF tile_distmap; + ImageF initial_quant_field = CopyImage(quant_field); + + float initial_qf_min, initial_qf_max; + ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max); + float initial_qf_ratio = initial_qf_max / initial_qf_min; + float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio); + float asymmetry = 2; + if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low; + float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low); + float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry); + + JXL_ASSERT(qf_higher / qf_lower < 253); + + constexpr int kOriginalComparisonRound = 1; + int iters = cparams.max_butteraugli_iters; + if (iters > 7) { + iters = 7; + } + if (cparams.speed_tier != SpeedTier::kTortoise) { + iters = 2; + } + for (int i = 0; i < iters + 1; ++i) { + if (FLAGS_dump_quant_state) { + printf("\nQuantization field:\n"); + for (size_t y = 0; y < quant_field.ysize(); ++y) { + for (size_t x = 0; x < quant_field.xsize(); ++x) { + printf(" %.5f", quant_field.Row(y)[x]); + } + printf("\n"); + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); + ImageBundle linear = RoundtripImage(opsin, enc_state, pool); + PROFILER_ZONE("enc Butteraugli"); + float score; + ImageF diffmap; + JXL_CHECK(comparator.CompareWith(linear, &diffmap, &score)); + if (!lower_is_better) { + score = -score; + diffmap = ScaleImage(-1.0f, diffmap); + } + tile_distmap = TileDistMap(diffmap, 8, 0, enc_state->shared.ac_strategy); + if (WantDebugOutput(aux_out)) { + 
aux_out->DumpImage(("dec" + ToString(i)).c_str(), *linear.color()); + DumpHeatmaps(aux_out, butteraugli_target, quant_field, tile_distmap, + diffmap); + } + if (aux_out != nullptr) ++aux_out->num_butteraugli_iters; + if (FLAGS_log_search_state) { + float minval, maxval; + ImageMinMax(quant_field, &minval, &maxval); + printf("\nButteraugli iter: %d/%d\n", i, cparams.max_butteraugli_iters); + printf("Butteraugli distance: %f\n", score); + printf("quant range: %f ... %f DC quant: %f\n", minval, maxval, + initial_quant_dc); + if (FLAGS_dump_quant_state) { + quantizer.DumpQuantizationMap(raw_quant_field); + } + } + + if (i == iters) break; + + double kPow[8] = { + 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + }; + double kPowMod[8] = { + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + }; + if (i == kOriginalComparisonRound) { + // Don't allow optimization to make the quant field a lot worse than + // what the initial guess was. This allows the AC field to have enough + // precision to reduce the oscillations due to the dc reconstruction. 
+ double kInitMul = 0.6; + const double kOneMinusInitMul = 1.0 - kInitMul; + for (size_t y = 0; y < quant_field.ysize(); ++y) { + float* const JXL_RESTRICT row_q = quant_field.Row(y); + const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x]; + if (row_q[x] < clamp) { + row_q[x] = clamp; + if (row_q[x] > qf_higher) row_q[x] = qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } + } + + double cur_pow = 0.0; + if (i < 7) { + cur_pow = kPow[i] + (butteraugli_target - 1.0) * kPowMod[i]; + if (cur_pow < 0) { + cur_pow = 0; + } + } + if (cur_pow == 0.0) { + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y); + float* const JXL_RESTRICT row_q = quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + const float diff = row_dist[x] / butteraugli_target; + if (diff > 1.0f) { + float old = row_q[x]; + row_q[x] *= diff; + int qf_old = old * quantizer.InvGlobalScale() + 0.5; + int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5; + if (qf_old == qf_new) { + row_q[x] = old + quantizer.Scale(); + } + } + if (row_q[x] > qf_higher) row_q[x] = qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } else { + for (size_t y = 0; y < quant_field.ysize(); ++y) { + const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y); + float* const JXL_RESTRICT row_q = quant_field.Row(y); + for (size_t x = 0; x < quant_field.xsize(); ++x) { + const float diff = row_dist[x] / butteraugli_target; + if (diff <= 1.0f) { + row_q[x] *= std::pow(diff, cur_pow); + } else { + float old = row_q[x]; + row_q[x] *= diff; + int qf_old = old * quantizer.InvGlobalScale() + 0.5; + int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5; + if (qf_old == qf_new) { + row_q[x] = old + quantizer.Scale(); + } + } + if (row_q[x] > qf_higher) row_q[x] = 
qf_higher; + if (row_q[x] < qf_lower) row_q[x] = qf_lower; + } + } + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); +} + +void FindBestQuantizationMaxError(const Image3F& opsin, + PassesEncoderState* enc_state, + ThreadPool* pool, AuxOut* aux_out) { + // TODO(veluca): this only works if opsin is in XYB. The current encoder does + // not have code paths that produce non-XYB opsin here. + JXL_CHECK(enc_state->shared.frame_header.color_transform == + ColorTransform::kXYB); + const CompressParams& cparams = enc_state->cparams; + Quantizer& quantizer = enc_state->shared.quantizer; + ImageI& raw_quant_field = enc_state->shared.raw_quant_field; + ImageF& quant_field = enc_state->initial_quant_field; + + // TODO(veluca): better choice of this value. + const float initial_quant_dc = + 16 * std::sqrt(0.1f / cparams.butteraugli_distance); + AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field), + &quant_field); + + const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0], + 1.0f / enc_state->cparams.max_error[1], + 1.0f / enc_state->cparams.max_error[2]}; + + for (int i = 0; i < cparams.max_butteraugli_iters + 1; ++i) { + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); + if (aux_out) { + aux_out->DumpXybImage(("ops" + ToString(i)).c_str(), opsin); + } + ImageBundle decoded = RoundtripImage(opsin, enc_state, pool); + if (aux_out) { + aux_out->DumpXybImage(("dec" + ToString(i)).c_str(), *decoded.color()); + } + + for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) { + AcStrategyRow ac_strategy_row = + enc_state->shared.ac_strategy.ConstRow(by); + for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) { + AcStrategy acs = ac_strategy_row[bx]; + if (!acs.IsFirstBlock()) continue; + float max_error = 0; + for (size_t c = 0; c < 3; c++) { + for (size_t y = by * kBlockDim; + y < (by + acs.covered_blocks_y()) * kBlockDim; y++) { + if (y >= decoded.ysize()) 
continue; + const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y); + const float* JXL_RESTRICT dec_row = + decoded.color()->ConstPlaneRow(c, y); + for (size_t x = bx * kBlockDim; + x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) { + if (x >= decoded.xsize()) continue; + max_error = std::max( + std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error); + } + } + } + // Target an error between max_error/2 and max_error. + // If the error in the varblock is above the target, increase the qf to + // compensate. If the error is below the target, decrease the qf. + // However, to avoid an excessive increase of the qf, only do so if the + // error is less than half the maximum allowed error. + const float qf_mul = (max_error < 0.5f) ? max_error * 2.0f + : (max_error > 1.0f) ? max_error + : 1.0f; + for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) { + float* JXL_RESTRICT quant_field_row = quant_field.Row(qy); + for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) { + quant_field_row[qx] *= qf_mul; + } + } + } + } + } + quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field); +} + +} // namespace + +void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect, + ImageF* quant_field) { + // Replace the whole quant_field in non-8x8 blocks with the maximum of each + // 8x8 block. 
+ size_t stride = quant_field->PixelsPerRow(); + for (size_t y = 0; y < rect.ysize(); ++y) { + AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y); + float* JXL_RESTRICT quant_row = rect.Row(quant_field, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + AcStrategy acs = ac_strategy_row[x]; + if (!acs.IsFirstBlock()) continue; + JXL_ASSERT(x + acs.covered_blocks_x() <= quant_field->xsize()); + JXL_ASSERT(y + acs.covered_blocks_y() <= quant_field->ysize()); + float max = quant_row[x]; + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + max = std::max(quant_row[x + ix + iy * stride], max); + } + } + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + quant_row[x + ix + iy * stride] = max; + } + } + } + } +} + +float InitialQuantDC(float butteraugli_target) { + const float kDcMul = 2.9; // Butteraugli target where non-linearity kicks in. + const float butteraugli_target_dc = std::max( + 0.5f * butteraugli_target, + std::min(butteraugli_target, + kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target, + kDcQuantPow))); + // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc. + // The maximum DC value might not be in the kXybRange because of inverse + // gaborish, so we add some slack to the maximum theoretical quant obtained + // this way (64). 
+ return std::min(kDcQuant / butteraugli_target_dc, 50.f); +} + +ImageF InitialQuantField(const float butteraugli_target, const Image3F& opsin, + const FrameDimensions& frame_dim, ThreadPool* pool, + float rescale, ImageF* mask) { + PROFILER_FUNC; + const float quant_ac = kAcQuant / butteraugli_target; + return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)( + butteraugli_target, opsin, frame_dim, quant_ac * rescale, pool, mask); +} + +void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin, + PassesEncoderState* enc_state, ThreadPool* pool, + AuxOut* aux_out, double rescale) { + const CompressParams& cparams = enc_state->cparams; + if (cparams.max_error_mode) { + PROFILER_ZONE("enc find best maxerr"); + FindBestQuantizationMaxError(opsin, enc_state, pool, aux_out); + } else if (cparams.speed_tier <= SpeedTier::kKitten) { + // Normal encoding to a butteraugli score. + PROFILER_ZONE("enc find best2"); + FindBestQuantization(*linear, opsin, enc_state, pool, aux_out); + } +} + +ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state, + ThreadPool* pool) { + PROFILER_ZONE("enc roundtrip"); + std::unique_ptr dec_state = + jxl::make_unique(); + JXL_CHECK(dec_state->output_encoding_info.Set( + *enc_state->shared.metadata, + ColorEncoding::LinearSRGB( + enc_state->shared.metadata->m.color_encoding.IsGray()))); + dec_state->shared = &enc_state->shared; + JXL_ASSERT(opsin.ysize() % kBlockDim == 0); + + const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim); + const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim); + const size_t num_groups = xsize_groups * ysize_groups; + + size_t num_special_frames = enc_state->special_frames.size(); + + std::unique_ptr modular_frame_encoder = + jxl::make_unique(enc_state->shared.frame_header, + enc_state->cparams); + // InitializePassesEncoder(opsin, pool, enc_state, modular_frame_encoder.get(), + // nullptr); + JXL_CHECK(dec_state->Init()); + dec_state->InitForAC(pool); + + ImageBundle 
decoded(&enc_state->shared.metadata->m); + decoded.origin = enc_state->shared.frame_header.frame_origin; + decoded.SetFromImage(Image3F(opsin.xsize(), opsin.ysize()), + dec_state->output_encoding_info.color_encoding); + + // Same as dec_state->shared->frame_header.nonserialized_metadata->m + const ImageMetadata& metadata = *decoded.metadata(); + if (!metadata.extra_channel_info.empty()) { + // Add dummy extra channels to the dec_state: FinalizeFrameDecoding moves + // these extra channels to the ImageBundle, and is required that the amount + // of extra channels matches its metadata()->extra_channel_info.size(). + // Normally we'd place these extra channels in the ImageBundle, but in this + // case FinalizeFrameDecoding is the one that does this. + std::vector extra_channels; + extra_channels.reserve(metadata.extra_channel_info.size()); + for (size_t i = 0; i < metadata.extra_channel_info.size(); i++) { + extra_channels.emplace_back(decoded.xsize(), decoded.ysize()); + // Must initialize the image with data to not affect blending with + // uninitialized memory. 
+ ZeroFillImage(&extra_channels.back()); + } + dec_state->extra_channels = std::move(extra_channels); + } + + hwy::AlignedUniquePtr group_dec_caches; + const auto allocate_storage = [&](size_t num_threads) { + dec_state->EnsureStorage(num_threads); + group_dec_caches = hwy::MakeUniqueAlignedArray(num_threads); + return true; + }; + const auto process_group = [&](const int group_index, const int thread) { + if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) { + ComputeSigma(dec_state->shared->BlockGroupRect(group_index), + dec_state.get()); + } + JXL_CHECK(DecodeGroupForRoundtrip( + enc_state->coeffs, group_index, dec_state.get(), + &group_dec_caches[thread], thread, &decoded, nullptr)); + }; + RunOnPool(pool, 0, num_groups, allocate_storage, process_group, "AQ loop"); + + // Fine to do a JXL_ASSERT instead of error handling, since this only happens + // on the encoder side where we can't be fed with invalid data. + JXL_CHECK(FinalizeFrameDecoding(&decoded, dec_state.get(), pool, + /*force_fir=*/false, /*skip_blending=*/true)); + // Ensure we don't create any new special frames. + enc_state->special_frames.resize(num_special_frames); + + return decoded; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.h new file mode 100644 index 0000000000..d9666f42b1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_adaptive_quantization.h @@ -0,0 +1,65 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_ +#define LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_ + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" + +// Heuristics to find a good quantizer for a given image. InitialQuantField +// produces a quantization field (i.e. relative quantization amounts for each +// block) out of an opsin-space image. `InitialQuantField` uses heuristics, +// `FindBestQuantizer` (in non-fast mode) will run multiple encoding-decoding +// steps and try to improve the given quant field. + +namespace jxl { + +// Computes the decoded image for a given set of compression parameters. Mainly +// used in the FindBestQuantization loops and in some tests. +// TODO(veluca): this doesn't seem the best possible file for this function. +ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state, + ThreadPool* pool); + +// Returns an image subsampled by kBlockDim in each direction. If the value +// at pixel (x,y) in the returned image is greater than 1.0, it means that +// more fine-grained quantization should be used in the corresponding block +// of the input image, while a value less than 1.0 indicates that less +// fine-grained quantization should be enough. Returns a mask, too, which +// can later be used to make better decisions about ac strategy. 
+ImageF InitialQuantField(float butteraugli_target, const Image3F& opsin, + const FrameDimensions& frame_dim, ThreadPool* pool, + float rescale, ImageF* initial_quant_mask); + +float InitialQuantDC(float butteraugli_target); + +void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect, + ImageF* quant_field); + +// Returns a quantizer that uses an adjusted version of the provided +// quant_field. Also computes the dequant_map corresponding to the given +// dequant_float_map and chosen quantization levels. +// `linear` is only used in Kitten mode or slower. +void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin, + PassesEncoderState* enc_state, ThreadPool* pool, + AuxOut* aux_out, double rescale = 1.0); + +} // namespace jxl + +#endif // LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc new file mode 100644 index 0000000000..48bc745f65 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.cc @@ -0,0 +1,1622 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_ans.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ans_common.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_cluster.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_huffman.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/fields.h" + +namespace jxl { + +namespace { + +bool ans_fuzzer_friendly_ = false; + +static const int kMaxNumSymbolsForSmallCode = 4; + +void ANSBuildInfoTable(const ANSHistBin* counts, const AliasTable::Entry* table, + size_t alphabet_size, size_t log_alpha_size, + ANSEncSymbolInfo* info) { + size_t log_entry_size = ANS_LOG_TAB_SIZE - log_alpha_size; + size_t entry_size_minus_1 = (1 << log_entry_size) - 1; + // create valid alias table for empty streams. + for (size_t s = 0; s < std::max(1, alphabet_size); ++s) { + const ANSHistBin freq = s == alphabet_size ? ANS_TAB_SIZE : counts[s]; + info[s].freq_ = static_cast(freq); +#ifdef USE_MULT_BY_RECIPROCAL + if (freq != 0) { + info[s].ifreq_ = + ((1ull << RECIPROCAL_PRECISION) + info[s].freq_ - 1) / info[s].freq_; + } else { + info[s].ifreq_ = 1; // shouldn't matter (symbol shouldn't occur), but... 
+ } +#endif + info[s].reverse_map_.resize(freq); + } + for (int i = 0; i < ANS_TAB_SIZE; i++) { + AliasTable::Symbol s = + AliasTable::Lookup(table, i, log_entry_size, entry_size_minus_1); + info[s.value].reverse_map_[s.offset] = i; + } +} + +float EstimateDataBits(const ANSHistBin* histogram, const ANSHistBin* counts, + size_t len) { + float sum = 0.0f; + int total_histogram = 0; + int total_counts = 0; + for (size_t i = 0; i < len; ++i) { + total_histogram += histogram[i]; + total_counts += counts[i]; + if (histogram[i] > 0) { + JXL_ASSERT(counts[i] > 0); + // += histogram[i] * -log(counts[i]/total_counts) + sum += histogram[i] * + std::max(0.0f, ANS_LOG_TAB_SIZE - FastLog2f(counts[i])); + } + } + if (total_histogram > 0) { + JXL_ASSERT(total_counts == ANS_TAB_SIZE); + } + return sum; +} + +float EstimateDataBitsFlat(const ANSHistBin* histogram, size_t len) { + const float flat_bits = std::max(FastLog2f(len), 0.0f); + int total_histogram = 0; + for (size_t i = 0; i < len; ++i) { + total_histogram += histogram[i]; + } + return total_histogram * flat_bits; +} + +// Static Huffman code for encoding logcounts. The last symbol is used as RLE +// sequence. +static const uint8_t kLogCountBitLengths[ANS_LOG_TAB_SIZE + 2] = { + 5, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 6, 7, 7, +}; +static const uint8_t kLogCountSymbols[ANS_LOG_TAB_SIZE + 2] = { + 17, 11, 15, 3, 9, 7, 4, 2, 5, 6, 0, 33, 1, 65, +}; + +// Returns the difference between largest count that can be represented and is +// smaller than "count" and smallest representable count larger than "count". +static int SmallestIncrement(uint32_t count, uint32_t shift) { + int bits = count == 0 ? -1 : FloorLog2Nonzero(count); + int drop_bits = bits - GetPopulationCountPrecision(bits, shift); + return drop_bits < 0 ? 
1 : (1 << drop_bits); +} + +template +bool RebalanceHistogram(const float* targets, int max_symbol, int table_size, + uint32_t shift, int* omit_pos, ANSHistBin* counts) { + int sum = 0; + float sum_nonrounded = 0.0; + int remainder_pos = 0; // if all of them are handled in first loop + int remainder_log = -1; + for (int n = 0; n < max_symbol; ++n) { + if (targets[n] > 0 && targets[n] < 1.0f) { + counts[n] = 1; + sum_nonrounded += targets[n]; + sum += counts[n]; + } + } + const float discount_ratio = + (table_size - sum) / (table_size - sum_nonrounded); + JXL_ASSERT(discount_ratio > 0); + JXL_ASSERT(discount_ratio <= 1.0f); + // Invariant for minimize_error_of_sum == true: + // abs(sum - sum_nonrounded) + // <= SmallestIncrement(max(targets[])) + max_symbol + for (int n = 0; n < max_symbol; ++n) { + if (targets[n] >= 1.0f) { + sum_nonrounded += targets[n]; + counts[n] = + static_cast(targets[n] * discount_ratio); // truncate + if (counts[n] == 0) counts[n] = 1; + if (counts[n] == table_size) counts[n] = table_size - 1; + // Round the count to the closest nonzero multiple of SmallestIncrement + // (when minimize_error_of_sum is false) or one of two closest so as to + // keep the sum as close as possible to sum_nonrounded. + int inc = SmallestIncrement(counts[n], shift); + counts[n] -= counts[n] & (inc - 1); + // TODO(robryk): Should we rescale targets[n]? + const float target = + minimize_error_of_sum ? (sum_nonrounded - sum) : targets[n]; + if (counts[n] == 0 || + (target > counts[n] + inc / 2 && counts[n] + inc < table_size)) { + counts[n] += inc; + } + sum += counts[n]; + const int count_log = FloorLog2Nonzero(static_cast(counts[n])); + if (count_log > remainder_log) { + remainder_pos = n; + remainder_log = count_log; + } + } + } + JXL_ASSERT(remainder_pos != -1); + // NOTE: This is the only place where counts could go negative. We could + // detect that, return false and make ANSHistBin uint32_t. 
+ counts[remainder_pos] -= sum - table_size; + *omit_pos = remainder_pos; + return counts[remainder_pos] > 0; +} + +Status NormalizeCounts(ANSHistBin* counts, int* omit_pos, const int length, + const int precision_bits, uint32_t shift, + int* num_symbols, int* symbols) { + const int32_t table_size = 1 << precision_bits; // target sum / table size + uint64_t total = 0; + int max_symbol = 0; + int symbol_count = 0; + for (int n = 0; n < length; ++n) { + total += counts[n]; + if (counts[n] > 0) { + if (symbol_count < kMaxNumSymbolsForSmallCode) { + symbols[symbol_count] = n; + } + ++symbol_count; + max_symbol = n + 1; + } + } + *num_symbols = symbol_count; + if (symbol_count == 0) { + return true; + } + if (symbol_count == 1) { + counts[symbols[0]] = table_size; + return true; + } + if (symbol_count > table_size) + return JXL_FAILURE("Too many entries in an ANS histogram"); + + const float norm = 1.f * table_size / total; + std::vector targets(max_symbol); + for (size_t n = 0; n < targets.size(); ++n) { + targets[n] = norm * counts[n]; + } + if (!RebalanceHistogram(&targets[0], max_symbol, table_size, shift, + omit_pos, counts)) { + // Use an alternative rebalancing mechanism if the one above failed + // to create a histogram that is positive wherever the original one was. 
+ if (!RebalanceHistogram(&targets[0], max_symbol, table_size, shift, + omit_pos, counts)) { + return JXL_FAILURE("Logic error: couldn't rebalance a histogram"); + } + } + return true; +} + +struct SizeWriter { + size_t size = 0; + void Write(size_t num, size_t bits) { size += num; } +}; + +template +void StoreVarLenUint8(size_t n, Writer* writer) { + JXL_DASSERT(n <= 255); + if (n == 0) { + writer->Write(1, 0); + } else { + writer->Write(1, 1); + size_t nbits = FloorLog2Nonzero(n); + writer->Write(3, nbits); + writer->Write(nbits, n - (1ULL << nbits)); + } +} + +template +void StoreVarLenUint16(size_t n, Writer* writer) { + JXL_DASSERT(n <= 65535); + if (n == 0) { + writer->Write(1, 0); + } else { + writer->Write(1, 1); + size_t nbits = FloorLog2Nonzero(n); + writer->Write(4, nbits); + writer->Write(nbits, n - (1ULL << nbits)); + } +} + +template +bool EncodeCounts(const ANSHistBin* counts, const int alphabet_size, + const int omit_pos, const int num_symbols, uint32_t shift, + const int* symbols, Writer* writer) { + bool ok = true; + if (num_symbols <= 2) { + // Small tree marker to encode 1-2 symbols. + writer->Write(1, 1); + if (num_symbols == 0) { + writer->Write(1, 0); + StoreVarLenUint8(0, writer); + } else { + writer->Write(1, num_symbols - 1); + for (int i = 0; i < num_symbols; ++i) { + StoreVarLenUint8(symbols[i], writer); + } + } + if (num_symbols == 2) { + writer->Write(ANS_LOG_TAB_SIZE, counts[symbols[0]]); + } + } else { + // Mark non-small tree. + writer->Write(1, 0); + // Mark non-flat histogram. + writer->Write(1, 0); + + // Precompute sequences for RLE encoding. Contains the number of identical + // values starting at a given index. Only contains the value at the first + // element of the series. 
+ std::vector same(alphabet_size, 0); + int last = 0; + for (int i = 1; i < alphabet_size; i++) { + // Store the sequence length once different symbol reached, or we're at + // the end, or the length is longer than we can encode, or we are at + // the omit_pos. We don't support including the omit_pos in an RLE + // sequence because this value may use a different amount of log2 bits + // than standard, it is too complex to handle in the decoder. + if (counts[i] != counts[last] || i + 1 == alphabet_size || + (i - last) >= 255 || i == omit_pos || i == omit_pos + 1) { + same[last] = (i - last); + last = i + 1; + } + } + + int length = 0; + std::vector logcounts(alphabet_size); + int omit_log = 0; + for (int i = 0; i < alphabet_size; ++i) { + JXL_ASSERT(counts[i] <= ANS_TAB_SIZE); + JXL_ASSERT(counts[i] >= 0); + if (i == omit_pos) { + length = i + 1; + } else if (counts[i] > 0) { + logcounts[i] = FloorLog2Nonzero(static_cast(counts[i])) + 1; + length = i + 1; + if (i < omit_pos) { + omit_log = std::max(omit_log, logcounts[i] + 1); + } else { + omit_log = std::max(omit_log, logcounts[i]); + } + } + } + logcounts[omit_pos] = omit_log; + + // Elias gamma-like code for shift. Only difference is that if the number + // of bits to be encoded is equal to FloorLog2(ANS_LOG_TAB_SIZE+1), we skip + // the terminating 0 in unary coding. + int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1); + int log = FloorLog2Nonzero(shift + 1); + writer->Write(log, (1 << log) - 1); + if (log != upper_bound_log) writer->Write(1, 0); + writer->Write(log, ((1 << log) - 1) & (shift + 1)); + + // Since num_symbols >= 3, we know that length >= 3, therefore we encode + // length - 3. + if (length - 3 > 255) { + // Pretend that everything is OK, but complain about correctness later. + StoreVarLenUint8(255, writer); + ok = false; + } else { + StoreVarLenUint8(length - 3, writer); + } + + // The logcount values are encoded with a static Huffman code. 
+ static const size_t kMinReps = 4; + size_t rep = ANS_LOG_TAB_SIZE + 1; + for (int i = 0; i < length; ++i) { + if (i > 0 && same[i - 1] > kMinReps) { + // Encode the RLE symbol and skip the repeated ones. + writer->Write(kLogCountBitLengths[rep], kLogCountSymbols[rep]); + StoreVarLenUint8(same[i - 1] - kMinReps - 1, writer); + i += same[i - 1] - 2; + continue; + } + writer->Write(kLogCountBitLengths[logcounts[i]], + kLogCountSymbols[logcounts[i]]); + } + for (int i = 0; i < length; ++i) { + if (i > 0 && same[i - 1] > kMinReps) { + // Skip symbols encoded by RLE. + i += same[i - 1] - 2; + continue; + } + if (logcounts[i] > 1 && i != omit_pos) { + int bitcount = GetPopulationCountPrecision(logcounts[i] - 1, shift); + int drop_bits = logcounts[i] - 1 - bitcount; + JXL_CHECK((counts[i] & ((1 << drop_bits) - 1)) == 0); + writer->Write(bitcount, (counts[i] >> drop_bits) - (1 << bitcount)); + } + } + } + return ok; +} + +void EncodeFlatHistogram(const int alphabet_size, BitWriter* writer) { + // Mark non-small tree. + writer->Write(1, 0); + // Mark uniform histogram. + writer->Write(1, 1); + JXL_ASSERT(alphabet_size > 0); + // Encode alphabet size. + StoreVarLenUint8(alphabet_size - 1, writer); +} + +float ComputeHistoAndDataCost(const ANSHistBin* histogram, size_t alphabet_size, + uint32_t method) { + if (method == 0) { // Flat code + return ANS_LOG_TAB_SIZE + 2 + + EstimateDataBitsFlat(histogram, alphabet_size); + } + // Non-flat: shift = method-1. + uint32_t shift = method - 1; + std::vector counts(histogram, histogram + alphabet_size); + int omit_pos = 0; + int num_symbols; + int symbols[kMaxNumSymbolsForSmallCode] = {}; + JXL_CHECK(NormalizeCounts(counts.data(), &omit_pos, alphabet_size, + ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols)); + SizeWriter writer; + // Ignore the correctness, no real encoding happens at this stage. 
+ (void)EncodeCounts(counts.data(), alphabet_size, omit_pos, num_symbols, shift, + symbols, &writer); + return writer.size + + EstimateDataBits(histogram, counts.data(), alphabet_size); +} + +uint32_t ComputeBestMethod( + const ANSHistBin* histogram, size_t alphabet_size, float* cost, + HistogramParams::ANSHistogramStrategy ans_histogram_strategy) { + size_t method = 0; + float fcost = ComputeHistoAndDataCost(histogram, alphabet_size, 0); + for (uint32_t shift = 0; shift <= ANS_LOG_TAB_SIZE; + ans_histogram_strategy != HistogramParams::ANSHistogramStrategy::kPrecise + ? shift += 2 + : shift++) { + float c = ComputeHistoAndDataCost(histogram, alphabet_size, shift + 1); + if (c < fcost) { + method = shift + 1; + fcost = c; + } else if (ans_histogram_strategy == + HistogramParams::ANSHistogramStrategy::kFast) { + // do not be as precise if estimating cost. + break; + } + } + *cost = fcost; + return method; +} + +} // namespace + +// Returns an estimate of the cost of encoding this histogram and the +// corresponding data. +size_t BuildAndStoreANSEncodingData( + HistogramParams::ANSHistogramStrategy ans_histogram_strategy, + const ANSHistBin* histogram, size_t alphabet_size, size_t log_alpha_size, + bool use_prefix_code, ANSEncSymbolInfo* info, BitWriter* writer) { + if (use_prefix_code) { + if (alphabet_size <= 1) return 0; + std::vector histo(alphabet_size); + for (size_t i = 0; i < alphabet_size; i++) { + histo[i] = histogram[i]; + JXL_CHECK(histogram[i] >= 0); + } + size_t cost = 0; + { + std::vector depths(alphabet_size); + std::vector bits(alphabet_size); + BitWriter tmp_writer; + BitWriter* w = writer ? 
writer : &tmp_writer; + size_t start = w->BitsWritten(); + BitWriter::Allotment allotment( + w, 8 * alphabet_size + 8); // safe upper bound + BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(), + bits.data(), w); + ReclaimAndCharge(w, &allotment, 0, /*aux_out=*/nullptr); + + for (size_t i = 0; i < alphabet_size; i++) { + info[i].bits = depths[i] == 0 ? 0 : bits[i]; + info[i].depth = depths[i]; + } + cost = w->BitsWritten() - start; + } + // Estimate data cost. + for (size_t i = 0; i < alphabet_size; i++) { + cost += histogram[i] * info[i].depth; + } + return cost; + } + JXL_ASSERT(alphabet_size <= ANS_TAB_SIZE); + // Ensure we ignore trailing zeros in the histogram. + if (alphabet_size != 0) { + size_t largest_symbol = 0; + for (size_t i = 0; i < alphabet_size; i++) { + if (histogram[i] != 0) largest_symbol = i; + } + alphabet_size = largest_symbol + 1; + } + float cost; + uint32_t method = ComputeBestMethod(histogram, alphabet_size, &cost, + ans_histogram_strategy); + JXL_ASSERT(cost >= 0); + int num_symbols; + int symbols[kMaxNumSymbolsForSmallCode] = {}; + std::vector counts(histogram, histogram + alphabet_size); + if (!counts.empty()) { + size_t sum = 0; + for (size_t i = 0; i < counts.size(); i++) { + sum += counts[i]; + } + if (sum == 0) { + counts[0] = ANS_TAB_SIZE; + } + } + if (method == 0) { + counts = CreateFlatHistogram(alphabet_size, ANS_TAB_SIZE); + AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE]; + InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a); + ANSBuildInfoTable(counts.data(), a, alphabet_size, log_alpha_size, info); + if (writer != nullptr) { + EncodeFlatHistogram(alphabet_size, writer); + } + return cost; + } + int omit_pos = 0; + uint32_t shift = method - 1; + JXL_CHECK(NormalizeCounts(counts.data(), &omit_pos, alphabet_size, + ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols)); + AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE]; + InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a); + ANSBuildInfoTable(counts.data(), a, 
alphabet_size, log_alpha_size, info); + if (writer != nullptr) { + bool ok = EncodeCounts(counts.data(), alphabet_size, omit_pos, num_symbols, + shift, symbols, writer); + (void)ok; + JXL_DASSERT(ok); + } + return cost; +} + +float ANSPopulationCost(const ANSHistBin* data, size_t alphabet_size) { + float c; + ComputeBestMethod(data, alphabet_size, &c, + HistogramParams::ANSHistogramStrategy::kFast); + return c; +} + +template +void EncodeUintConfig(const HybridUintConfig uint_config, Writer* writer, + size_t log_alpha_size) { + writer->Write(CeilLog2Nonzero(log_alpha_size + 1), + uint_config.split_exponent); + if (uint_config.split_exponent == log_alpha_size) { + return; // msb/lsb don't matter. + } + size_t nbits = CeilLog2Nonzero(uint_config.split_exponent + 1); + writer->Write(nbits, uint_config.msb_in_token); + nbits = CeilLog2Nonzero(uint_config.split_exponent - + uint_config.msb_in_token + 1); + writer->Write(nbits, uint_config.lsb_in_token); +} +template +void EncodeUintConfigs(const std::vector& uint_config, + Writer* writer, size_t log_alpha_size) { + // TODO(veluca): RLE? + for (size_t i = 0; i < uint_config.size(); i++) { + EncodeUintConfig(uint_config[i], writer, log_alpha_size); + } +} +template void EncodeUintConfigs(const std::vector&, + BitWriter*, size_t); + +namespace { + +void ChooseUintConfigs(const HistogramParams& params, + const std::vector>& tokens, + const std::vector& context_map, + std::vector* clustered_histograms, + EntropyEncodingData* codes, size_t* log_alpha_size) { + codes->uint_config.resize(clustered_histograms->size()); + if (params.uint_method == HistogramParams::HybridUintMethod::kNone) return; + if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) { + codes->uint_config.clear(); + codes->uint_config.resize(clustered_histograms->size(), + HybridUintConfig(2, 0, 1)); + return; + } + + // Brute-force method that tries a few options. 
+ std::vector configs; + if (params.uint_method == HistogramParams::HybridUintMethod::kBest) { + configs = { + HybridUintConfig(4, 2, 0), // default + HybridUintConfig(4, 1, 0), // less precise + HybridUintConfig(4, 2, 1), // add sign + HybridUintConfig(4, 2, 2), // add sign+parity + HybridUintConfig(4, 1, 2), // add parity but less msb + // Same as above, but more direct coding. + HybridUintConfig(5, 2, 0), HybridUintConfig(5, 1, 0), + HybridUintConfig(5, 2, 1), HybridUintConfig(5, 2, 2), + HybridUintConfig(5, 1, 2), + // Same as above, but less direct coding. + HybridUintConfig(3, 2, 0), HybridUintConfig(3, 1, 0), + HybridUintConfig(3, 2, 1), HybridUintConfig(3, 1, 2), + // For near-lossless. + HybridUintConfig(4, 1, 3), HybridUintConfig(5, 1, 4), + HybridUintConfig(5, 2, 3), HybridUintConfig(6, 1, 5), + HybridUintConfig(6, 2, 4), HybridUintConfig(6, 0, 0), + // Other + HybridUintConfig(0, 0, 0), // varlenuint + HybridUintConfig(2, 0, 1), // works well for ctx map + HybridUintConfig(7, 0, 0), // direct coding + HybridUintConfig(8, 0, 0), // direct coding + HybridUintConfig(9, 0, 0), // direct coding + HybridUintConfig(10, 0, 0), // direct coding + HybridUintConfig(11, 0, 0), // direct coding + HybridUintConfig(12, 0, 0), // direct coding + }; + } else if (params.uint_method == HistogramParams::HybridUintMethod::kFast) { + configs = { + HybridUintConfig(4, 2, 0), // default + HybridUintConfig(4, 1, 2), // add parity but less msb + HybridUintConfig(0, 0, 0), // smallest histograms + HybridUintConfig(2, 0, 1), // works well for ctx map + }; + } + + std::vector costs(clustered_histograms->size(), + std::numeric_limits::max()); + std::vector extra_bits(clustered_histograms->size()); + std::vector is_valid(clustered_histograms->size()); + size_t max_alpha = + codes->use_prefix_code ? 
PREFIX_MAX_ALPHABET_SIZE : ANS_MAX_ALPHABET_SIZE; + for (HybridUintConfig cfg : configs) { + std::fill(is_valid.begin(), is_valid.end(), true); + std::fill(extra_bits.begin(), extra_bits.end(), 0); + + for (size_t i = 0; i < clustered_histograms->size(); i++) { + (*clustered_histograms)[i].Clear(); + } + for (size_t i = 0; i < tokens.size(); ++i) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token token = tokens[i][j]; + // TODO(veluca): do not ignore lz77 commands. + if (token.is_lz77_length) continue; + size_t histo = context_map[token.context]; + uint32_t tok, nbits, bits; + cfg.Encode(token.value, &tok, &nbits, &bits); + if (tok >= max_alpha || + (codes->lz77.enabled && tok >= codes->lz77.min_symbol)) { + is_valid[histo] = false; + continue; + } + extra_bits[histo] += nbits; + (*clustered_histograms)[histo].Add(tok); + } + } + + for (size_t i = 0; i < clustered_histograms->size(); i++) { + if (!is_valid[i]) continue; + float cost = (*clustered_histograms)[i].PopulationCost() + extra_bits[i]; + if (cost < costs[i]) { + codes->uint_config[i] = cfg; + costs[i] = cost; + } + } + } + + // Rebuild histograms. + for (size_t i = 0; i < clustered_histograms->size(); i++) { + (*clustered_histograms)[i].Clear(); + } + *log_alpha_size = 4; + for (size_t i = 0; i < tokens.size(); ++i) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token token = tokens[i][j]; + uint32_t tok, nbits, bits; + size_t histo = context_map[token.context]; + (token.is_lz77_length ? codes->lz77.length_uint_config + : codes->uint_config[histo]) + .Encode(token.value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? codes->lz77.min_symbol : 0; + (*clustered_histograms)[histo].Add(tok); + while (tok >= (1u << *log_alpha_size)) (*log_alpha_size)++; + } + } +#if JXL_ENABLE_ASSERT + size_t max_log_alpha_size = codes->use_prefix_code ? 
PREFIX_MAX_BITS : 8; + JXL_ASSERT(*log_alpha_size <= max_log_alpha_size); +#endif +} + +class HistogramBuilder { + public: + explicit HistogramBuilder(const size_t num_contexts) + : histograms_(num_contexts) {} + + void VisitSymbol(int symbol, size_t histo_idx) { + JXL_DASSERT(histo_idx < histograms_.size()); + histograms_[histo_idx].Add(symbol); + } + + // NOTE: `layer` is only for clustered_entropy; caller does ReclaimAndCharge. + size_t BuildAndStoreEntropyCodes( + const HistogramParams& params, + const std::vector>& tokens, EntropyEncodingData* codes, + std::vector* context_map, bool use_prefix_code, + BitWriter* writer, size_t layer, AuxOut* aux_out) const { + size_t cost = 0; + codes->encoding_info.clear(); + std::vector clustered_histograms(histograms_); + context_map->resize(histograms_.size()); + if (histograms_.size() > 1) { + if (!ans_fuzzer_friendly_) { + std::vector histogram_symbols; + ClusterHistograms(params, histograms_, histograms_.size(), + kClustersLimit, &clustered_histograms, + &histogram_symbols); + for (size_t c = 0; c < histograms_.size(); ++c) { + (*context_map)[c] = static_cast(histogram_symbols[c]); + } + } else { + fill(context_map->begin(), context_map->end(), 0); + size_t max_symbol = 0; + for (const Histogram& h : histograms_) { + max_symbol = std::max(h.data_.size(), max_symbol); + } + size_t num_symbols = 1 << CeilLog2Nonzero(max_symbol + 1); + clustered_histograms.resize(1); + clustered_histograms[0].Clear(); + for (size_t i = 0; i < num_symbols; i++) { + clustered_histograms[0].Add(i); + } + } + if (writer != nullptr) { + EncodeContextMap(*context_map, clustered_histograms.size(), writer); + } + } + if (aux_out != nullptr) { + for (size_t i = 0; i < clustered_histograms.size(); ++i) { + aux_out->layers[layer].clustered_entropy += + clustered_histograms[i].ShannonEntropy(); + } + } + codes->use_prefix_code = use_prefix_code; + size_t log_alpha_size = codes->lz77.enabled ? 8 : 7; // Sane default. 
+ if (ans_fuzzer_friendly_) { + codes->uint_config.clear(); + codes->uint_config.resize(1, HybridUintConfig(7, 0, 0)); + } else { + ChooseUintConfigs(params, tokens, *context_map, &clustered_histograms, + codes, &log_alpha_size); + } + if (log_alpha_size < 5) log_alpha_size = 5; + SizeWriter size_writer; // Used if writer == nullptr to estimate costs. + cost += 1; + if (writer) writer->Write(1, use_prefix_code); + + if (use_prefix_code) { + log_alpha_size = PREFIX_MAX_BITS; + } else { + cost += 2; + } + if (writer == nullptr) { + EncodeUintConfigs(codes->uint_config, &size_writer, log_alpha_size); + } else { + if (!use_prefix_code) writer->Write(2, log_alpha_size - 5); + EncodeUintConfigs(codes->uint_config, writer, log_alpha_size); + } + if (use_prefix_code) { + for (size_t c = 0; c < clustered_histograms.size(); ++c) { + size_t num_symbol = 1; + for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) { + if (clustered_histograms[c].data_[i]) num_symbol = i + 1; + } + if (writer) { + StoreVarLenUint16(num_symbol - 1, writer); + } else { + StoreVarLenUint16(num_symbol - 1, &size_writer); + } + } + } + cost += size_writer.size; + for (size_t c = 0; c < clustered_histograms.size(); ++c) { + size_t num_symbol = 1; + for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) { + if (clustered_histograms[c].data_[i]) num_symbol = i + 1; + } + codes->encoding_info.emplace_back(); + codes->encoding_info.back().resize(std::max(1, num_symbol)); + + BitWriter::Allotment allotment(writer, 256 + num_symbol * 24); + cost += BuildAndStoreANSEncodingData( + params.ans_histogram_strategy, clustered_histograms[c].data_.data(), + num_symbol, log_alpha_size, use_prefix_code, + codes->encoding_info.back().data(), writer); + allotment.FinishedHistogram(writer); + ReclaimAndCharge(writer, &allotment, layer, aux_out); + } + return cost; + } + + const Histogram& Histo(size_t i) const { return histograms_[i]; } + + private: + std::vector histograms_; +}; + +class 
SymbolCostEstimator { + public: + SymbolCostEstimator(size_t num_contexts, bool force_huffman, + const std::vector>& tokens, + const LZ77Params& lz77) { + HistogramBuilder builder(num_contexts); + // Build histograms for estimating lz77 savings. + HybridUintConfig uint_config; + for (size_t i = 0; i < tokens.size(); ++i) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token token = tokens[i][j]; + uint32_t tok, nbits, bits; + (token.is_lz77_length ? lz77.length_uint_config : uint_config) + .Encode(token.value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? lz77.min_symbol : 0; + builder.VisitSymbol(tok, token.context); + } + } + max_alphabet_size_ = 0; + for (size_t i = 0; i < num_contexts; i++) { + max_alphabet_size_ = + std::max(max_alphabet_size_, builder.Histo(i).data_.size()); + } + bits_.resize(num_contexts * max_alphabet_size_); + // TODO(veluca): SIMD? + add_symbol_cost_.resize(num_contexts); + for (size_t i = 0; i < num_contexts; i++) { + float inv_total = 1.0f / (builder.Histo(i).total_count_ + 1e-8f); + float total_cost = 0; + for (size_t j = 0; j < builder.Histo(i).data_.size(); j++) { + size_t cnt = builder.Histo(i).data_[j]; + float cost = 0; + if (cnt != 0 && cnt != builder.Histo(i).total_count_) { + cost = -FastLog2f(cnt * inv_total); + if (force_huffman) cost = std::ceil(cost); + } else if (cnt == 0) { + cost = ANS_LOG_TAB_SIZE; // Highest possible cost. + } + bits_[i * max_alphabet_size_ + j] = cost; + total_cost += cost * builder.Histo(i).data_[j]; + } + // Penalty for adding a lz77 symbol to this contest (only used for static + // cost model). Higher penalty for contexts that have a very low + // per-symbol entropy. 
+ add_symbol_cost_[i] = std::max(0.0f, 6.0f - total_cost * inv_total); + } + } + float Bits(size_t ctx, size_t sym) const { + return bits_[ctx * max_alphabet_size_ + sym]; + } + float LenCost(size_t ctx, size_t len, const LZ77Params& lz77) const { + uint32_t nbits, bits, tok; + lz77.length_uint_config.Encode(len, &tok, &nbits, &bits); + tok += lz77.min_symbol; + return nbits + Bits(ctx, tok); + } + float DistCost(size_t len, const LZ77Params& lz77) const { + uint32_t nbits, bits, tok; + HybridUintConfig().Encode(len, &tok, &nbits, &bits); + return nbits + Bits(lz77.nonserialized_distance_context, tok); + } + float AddSymbolCost(size_t idx) const { return add_symbol_cost_[idx]; } + + private: + size_t max_alphabet_size_; + std::vector bits_; + std::vector add_symbol_cost_; +}; + +void ApplyLZ77_RLE(const HistogramParams& params, size_t num_contexts, + const std::vector>& tokens, + LZ77Params& lz77, + std::vector>& tokens_lz77) { + // TODO(veluca): tune heuristics here. + SymbolCostEstimator sce(num_contexts, params.force_huffman, tokens, lz77); + float bit_decrease = 0; + size_t total_symbols = 0; + tokens_lz77.resize(tokens.size()); + std::vector sym_cost; + HybridUintConfig uint_config; + for (size_t stream = 0; stream < tokens.size(); stream++) { + size_t distance_multiplier = + params.image_widths.size() > stream ? params.image_widths[stream] : 0; + const auto& in = tokens[stream]; + auto& out = tokens_lz77[stream]; + total_symbols += in.size(); + // Cumulative sum of bit costs. + sym_cost.resize(in.size() + 1); + for (size_t i = 0; i < in.size(); i++) { + uint32_t tok, nbits, unused_bits; + uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits); + sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i]; + } + out.reserve(in.size()); + for (size_t i = 0; i < in.size(); i++) { + size_t num_to_copy = 0; + size_t distance_symbol = 0; // 1 for RLE. + if (distance_multiplier != 0) { + distance_symbol = 1; // Special distance 1 if enabled. 
+ JXL_DASSERT(kSpecialDistances[1][0] == 1); + JXL_DASSERT(kSpecialDistances[1][1] == 0); + } + if (i > 0) { + for (; i + num_to_copy < in.size(); num_to_copy++) { + if (in[i + num_to_copy].value != in[i - 1].value) { + break; + } + } + } + if (num_to_copy == 0) { + out.push_back(in[i]); + continue; + } + float cost = sym_cost[i + num_to_copy] - sym_cost[i]; + // This subtraction might overflow, but that's OK. + size_t lz77_len = num_to_copy - lz77.min_length; + float lz77_cost = num_to_copy >= lz77.min_length + ? CeilLog2Nonzero(lz77_len + 1) + 1 + : 0; + if (num_to_copy < lz77.min_length || cost <= lz77_cost) { + for (size_t j = 0; j < num_to_copy; j++) { + out.push_back(in[i + j]); + } + i += num_to_copy - 1; + continue; + } + // Output the LZ77 length + out.emplace_back(in[i].context, lz77_len); + out.back().is_lz77_length = true; + i += num_to_copy - 1; + bit_decrease += cost - lz77_cost; + // Output the LZ77 copy distance. + out.emplace_back(lz77.nonserialized_distance_context, distance_symbol); + } + } + + if (bit_decrease > total_symbols * 0.2 + 16) { + lz77.enabled = true; + } +} + +// Hash chain for LZ77 matching +struct HashChain { + size_t size_; + std::vector data_; + + unsigned hash_num_values_ = 32768; + unsigned hash_mask_ = hash_num_values_ - 1; + unsigned hash_shift_ = 5; + + std::vector head; + std::vector chain; + std::vector val; + + // Speed up repetitions of zero + std::vector headz; + std::vector chainz; + std::vector zeros; + uint32_t numzeros = 0; + + size_t window_size_; + size_t window_mask_; + size_t min_length_; + size_t max_length_; + + // Map of special distance codes. 
+ std::unordered_map special_dist_table_; + size_t num_special_distances_ = 0; + + uint32_t maxchainlength = 256; // window_size_ to allow all + + HashChain(const Token* data, size_t size, size_t window_size, + size_t min_length, size_t max_length, size_t distance_multiplier) + : size_(size), + window_size_(window_size), + window_mask_(window_size - 1), + min_length_(min_length), + max_length_(max_length) { + data_.resize(size); + for (size_t i = 0; i < size; i++) { + data_[i] = data[i].value; + } + + head.resize(hash_num_values_, -1); + val.resize(window_size_, -1); + chain.resize(window_size_); + for (uint32_t i = 0; i < window_size_; ++i) { + chain[i] = i; // same value as index indicates uninitialized + } + + zeros.resize(window_size_); + headz.resize(window_size_ + 1, -1); + chainz.resize(window_size_); + for (uint32_t i = 0; i < window_size_; ++i) { + chainz[i] = i; + } + // Translate distance to special distance code. + if (distance_multiplier) { + // Count down, so if due to small distance multiplier multiple distances + // map to the same code, the smallest code will be used in the end. + for (int i = kNumSpecialDistances - 1; i >= 0; --i) { + int xi = kSpecialDistances[i][0]; + int yi = kSpecialDistances[i][1]; + int distance = yi * distance_multiplier + xi; + // Ensure that we map distance 1 to the lowest symbols. + if (distance < 1) distance = 1; + special_dist_table_[distance] = i; + } + num_special_distances_ = kNumSpecialDistances; + } + } + + uint32_t GetHash(size_t pos) const { + uint32_t result = 0; + if (pos + 2 < size_) { + // TODO(lode): take the MSB's of the uint32_t values into account as well, + // given that the hash code itself is less than 32 bits. + result ^= (uint32_t)(data_[pos + 0] << 0u); + result ^= (uint32_t)(data_[pos + 1] << hash_shift_); + result ^= (uint32_t)(data_[pos + 2] << (hash_shift_ * 2)); + } else { + // No need to compute hash of last 2 bytes, the length 2 is too short. 
+ return 0; + } + return result & hash_mask_; + } + + uint32_t CountZeros(size_t pos, uint32_t prevzeros) const { + size_t end = pos + window_size_; + if (end > size_) end = size_; + if (prevzeros > 0) { + if (prevzeros >= window_mask_ && data_[end - 1] == 0 && + end == pos + window_size_) { + return prevzeros; + } else { + return prevzeros - 1; + } + } + uint32_t num = 0; + while (pos + num < end && data_[pos + num] == 0) num++; + return num; + } + + void Update(size_t pos) { + uint32_t hashval = GetHash(pos); + uint32_t wpos = pos & window_mask_; + + val[wpos] = (int)hashval; + if (head[hashval] != -1) chain[wpos] = head[hashval]; + head[hashval] = wpos; + + if (pos > 0 && data_[pos] != data_[pos - 1]) numzeros = 0; + numzeros = CountZeros(pos, numzeros); + + zeros[wpos] = numzeros; + if (headz[numzeros] != -1) chainz[wpos] = headz[numzeros]; + headz[numzeros] = wpos; + } + + void Update(size_t pos, size_t len) { + for (size_t i = 0; i < len; i++) { + Update(pos + i); + } + } + + template + void FindMatches(size_t pos, int max_dist, const CB& found_match) const { + uint32_t wpos = pos & window_mask_; + uint32_t hashval = GetHash(pos); + uint32_t hashpos = chain[wpos]; + + int prev_dist = 0; + int end = std::min(pos + max_length_, size_); + uint32_t chainlength = 0; + uint32_t best_len = 0; + for (;;) { + int dist = (hashpos <= wpos) ? (wpos - hashpos) + : (wpos - hashpos + window_mask_ + 1); + if (dist < prev_dist) break; + prev_dist = dist; + uint32_t len = 0; + if (dist > 0) { + int i = pos; + int j = pos - dist; + if (numzeros > 3) { + int r = std::min(numzeros - 1, zeros[hashpos]); + if (i + r >= end) r = end - i - 1; + i += r; + j += r; + } + while (i < end && data_[i] == data_[j]) { + i++; + j++; + } + len = i - pos; + // This can trigger even if the new length is slightly smaller than the + // best length, because it is possible for a slightly cheaper distance + // symbol to occur. 
+ if (len >= min_length_ && len + 2 >= best_len) { + auto it = special_dist_table_.find(dist); + int dist_symbol = (it == special_dist_table_.end()) + ? (num_special_distances_ + dist - 1) + : it->second; + found_match(len, dist_symbol); + if (len > best_len) best_len = len; + } + } + + chainlength++; + if (chainlength >= maxchainlength) break; + + if (numzeros >= 3 && len > numzeros) { + if (hashpos == chainz[hashpos]) break; + hashpos = chainz[hashpos]; + if (zeros[hashpos] != numzeros) break; + } else { + if (hashpos == chain[hashpos]) break; + hashpos = chain[hashpos]; + if (val[hashpos] != (int)hashval) break; // outdated hash value + } + } + } + void FindMatch(size_t pos, int max_dist, size_t* result_dist_symbol, + size_t* result_len) const { + *result_dist_symbol = 0; + *result_len = 1; + FindMatches(pos, max_dist, [&](size_t len, size_t dist_symbol) { + if (len > *result_len || + (len == *result_len && *result_dist_symbol > dist_symbol)) { + *result_len = len; + *result_dist_symbol = dist_symbol; + } + }); + } +}; + +float LenCost(size_t len) { + uint32_t nbits, bits, tok; + HybridUintConfig(1, 0, 0).Encode(len, &tok, &nbits, &bits); + constexpr float kCostTable[] = { + 2.797667318563126, 3.213177690381199, 2.5706009246743737, + 2.408392498667534, 2.829649191872326, 3.3923087753324577, + 4.029267451554331, 4.415576699706408, 4.509357574741465, + 9.21481543803004, 10.020590190114898, 11.858671627804766, + 12.45853300490526, 11.713105831990857, 12.561996324849314, + 13.775477692278367, 13.174027068768641, + }; + size_t table_size = sizeof kCostTable / sizeof *kCostTable; + if (tok >= table_size) tok = table_size - 1; + return kCostTable[tok] + nbits; +} + +// TODO(veluca): this does not take into account usage or non-usage of distance +// multipliers. 
+float DistCost(size_t dist) { + uint32_t nbits, bits, tok; + HybridUintConfig(7, 0, 0).Encode(dist, &tok, &nbits, &bits); + constexpr float kCostTable[] = { + 6.368282626312716, 5.680793277090298, 8.347404197105247, + 7.641619201599141, 6.914328374119438, 7.959808291537444, + 8.70023120759855, 8.71378518934703, 9.379132523982769, + 9.110472749092708, 9.159029569270908, 9.430936766731973, + 7.278284055315169, 7.8278514904267755, 10.026641158289236, + 9.976049229827066, 9.64351607048908, 9.563403863480442, + 10.171474111762747, 10.45950155077234, 9.994813912104219, + 10.322524683741156, 8.465808729388186, 8.756254166066853, + 10.160930174662234, 10.247329273413435, 10.04090403724809, + 10.129398517544082, 9.342311691539546, 9.07608009102374, + 10.104799540677513, 10.378079384990906, 10.165828974075072, + 10.337595322341553, 7.940557464567944, 10.575665823319431, + 11.023344321751955, 10.736144698831827, 11.118277044595054, + 7.468468230648442, 10.738305230932939, 10.906980780216568, + 10.163468216353817, 10.17805759656433, 11.167283670483565, + 11.147050200274544, 10.517921919244333, 10.651764778156886, + 10.17074446448919, 11.217636876224745, 11.261630721139484, + 11.403140815247259, 10.892472096873417, 11.1859607804481, + 8.017346947551262, 7.895143720278828, 11.036577113822025, + 11.170562110315794, 10.326988722591086, 10.40872184751056, + 11.213498225466386, 11.30580635516863, 10.672272515665442, + 10.768069466228063, 11.145257364153565, 11.64668307145549, + 10.593156194627339, 11.207499484844943, 10.767517766396908, + 10.826629811407042, 10.737764794499988, 10.6200448518045, + 10.191315385198092, 8.468384171390085, 11.731295299170432, + 11.824619886654398, 10.41518844301179, 10.16310536548649, + 10.539423685097576, 10.495136599328031, 10.469112847728267, + 11.72057686174922, 10.910326337834674, 11.378921834673758, + 11.847759036098536, 11.92071647623854, 10.810628276345282, + 11.008601085273893, 11.910326337834674, 11.949212023423133, + 11.298614839104337, 
11.611603659010392, 10.472930394619985, + 11.835564720850282, 11.523267392285337, 12.01055816679611, + 8.413029688994023, 11.895784139536406, 11.984679534970505, + 11.220654278717394, 11.716311684833672, 10.61036646226114, + 10.89849965960364, 10.203762898863669, 10.997560826267238, + 11.484217379438984, 11.792836176993665, 12.24310468755171, + 11.464858097919262, 12.212747017409377, 11.425595666074955, + 11.572048533398757, 12.742093965163013, 11.381874288645637, + 12.191870445817015, 11.683156920035426, 11.152442115262197, + 11.90303691580457, 11.653292787169159, 11.938615382266098, + 16.970641701570223, 16.853602280380002, 17.26240782594733, + 16.644655390108507, 17.14310889757499, 16.910935455445955, + 17.505678976959697, 17.213498225466388, 2.4162310293553024, + 3.494587244462329, 3.5258600986408344, 3.4959806589517095, + 3.098390886949687, 3.343454654302911, 3.588847442290287, + 4.14614790111827, 5.152948641990529, 7.433696808092598, + 9.716311684833672, + }; + size_t table_size = sizeof kCostTable / sizeof *kCostTable; + if (tok >= table_size) tok = table_size - 1; + return kCostTable[tok] + nbits; +} + +void ApplyLZ77_LZ77(const HistogramParams& params, size_t num_contexts, + const std::vector>& tokens, + LZ77Params& lz77, + std::vector>& tokens_lz77) { + // TODO(veluca): tune heuristics here. + SymbolCostEstimator sce(num_contexts, params.force_huffman, tokens, lz77); + float bit_decrease = 0; + size_t total_symbols = 0; + tokens_lz77.resize(tokens.size()); + HybridUintConfig uint_config; + std::vector sym_cost; + for (size_t stream = 0; stream < tokens.size(); stream++) { + size_t distance_multiplier = + params.image_widths.size() > stream ? params.image_widths[stream] : 0; + const auto& in = tokens[stream]; + auto& out = tokens_lz77[stream]; + total_symbols += in.size(); + // Cumulative sum of bit costs. 
+ sym_cost.resize(in.size() + 1); + for (size_t i = 0; i < in.size(); i++) { + uint32_t tok, nbits, unused_bits; + uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits); + sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i]; + } + + out.reserve(in.size()); + size_t max_distance = in.size(); + size_t min_length = lz77.min_length; + JXL_ASSERT(min_length >= 3); + size_t max_length = in.size(); + + // Use next power of two as window size. + size_t window_size = 1; + while (window_size < max_distance && window_size < kWindowSize) { + window_size <<= 1; + } + + HashChain chain(in.data(), in.size(), window_size, min_length, max_length, + distance_multiplier); + size_t len, dist_symbol; + + const size_t max_lazy_match_len = 256; // 0 to disable lazy matching + + // Whether the next symbol was already updated (to test lazy matching) + bool already_updated = false; + for (size_t i = 0; i < in.size(); i++) { + out.push_back(in[i]); + if (!already_updated) chain.Update(i); + already_updated = false; + chain.FindMatch(i, max_distance, &dist_symbol, &len); + if (len >= min_length) { + if (len < max_lazy_match_len && i + 1 < in.size()) { + // Try length at next symbol lazy matching + chain.Update(i + 1); + already_updated = true; + size_t len2, dist_symbol2; + chain.FindMatch(i + 1, max_distance, &dist_symbol2, &len2); + if (len2 > len) { + // Use the lazy match. Add literal, and use the next length starting + // from the next byte. 
+ ++i; + already_updated = false; + len = len2; + dist_symbol = dist_symbol2; + out.push_back(in[i]); + } + } + + float cost = sym_cost[i + len] - sym_cost[i]; + size_t lz77_len = len - lz77.min_length; + float lz77_cost = LenCost(lz77_len) + DistCost(dist_symbol) + + sce.AddSymbolCost(out.back().context); + + if (lz77_cost <= cost) { + out.back().value = len - min_length; + out.back().is_lz77_length = true; + out.emplace_back(lz77.nonserialized_distance_context, dist_symbol); + bit_decrease += cost - lz77_cost; + } else { + // LZ77 match ignored, and symbol already pushed. Push all other + // symbols and skip. + for (size_t j = 1; j < len; j++) { + out.push_back(in[i + j]); + } + } + + if (already_updated) { + chain.Update(i + 2, len - 2); + already_updated = false; + } else { + chain.Update(i + 1, len - 1); + } + i += len - 1; + } else { + // Literal, already pushed + } + } + } + + if (bit_decrease > total_symbols * 0.2 + 16) { + lz77.enabled = true; + } +} + +void ApplyLZ77_Optimal(const HistogramParams& params, size_t num_contexts, + const std::vector>& tokens, + LZ77Params& lz77, + std::vector>& tokens_lz77) { + std::vector> tokens_for_cost_estimate; + ApplyLZ77_LZ77(params, num_contexts, tokens, lz77, tokens_for_cost_estimate); + // If greedy-LZ77 does not give better compression than no-lz77, no reason to + // run the optimal matching. + if (!lz77.enabled) return; + SymbolCostEstimator sce(num_contexts + 1, params.force_huffman, + tokens_for_cost_estimate, lz77); + tokens_lz77.resize(tokens.size()); + HybridUintConfig uint_config; + std::vector sym_cost; + std::vector dist_symbols; + for (size_t stream = 0; stream < tokens.size(); stream++) { + size_t distance_multiplier = + params.image_widths.size() > stream ? params.image_widths[stream] : 0; + const auto& in = tokens[stream]; + auto& out = tokens_lz77[stream]; + // Cumulative sum of bit costs. 
+ sym_cost.resize(in.size() + 1); + for (size_t i = 0; i < in.size(); i++) { + uint32_t tok, nbits, unused_bits; + uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits); + sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i]; + } + + out.reserve(in.size()); + size_t max_distance = in.size(); + size_t min_length = lz77.min_length; + JXL_ASSERT(min_length >= 3); + size_t max_length = in.size(); + + // Use next power of two as window size. + size_t window_size = 1; + while (window_size < max_distance && window_size < kWindowSize) { + window_size <<= 1; + } + + HashChain chain(in.data(), in.size(), window_size, min_length, max_length, + distance_multiplier); + + struct MatchInfo { + uint32_t len; + uint32_t dist_symbol; + uint32_t ctx; + float total_cost = std::numeric_limits::max(); + }; + // Total cost to encode the first N symbols. + std::vector prefix_costs(in.size() + 1); + prefix_costs[0].total_cost = 0; + + size_t rle_length = 0; + size_t skip_lz77 = 0; + for (size_t i = 0; i < in.size(); i++) { + chain.Update(i); + float lit_cost = + prefix_costs[i].total_cost + sym_cost[i + 1] - sym_cost[i]; + if (prefix_costs[i + 1].total_cost > lit_cost) { + prefix_costs[i + 1].dist_symbol = 0; + prefix_costs[i + 1].len = 1; + prefix_costs[i + 1].ctx = in[i].context; + prefix_costs[i + 1].total_cost = lit_cost; + } + if (skip_lz77 > 0) { + skip_lz77--; + continue; + } + dist_symbols.clear(); + chain.FindMatches(i, max_distance, + [&dist_symbols](size_t len, size_t dist_symbol) { + if (dist_symbols.size() <= len) { + dist_symbols.resize(len + 1, dist_symbol); + } + if (dist_symbol < dist_symbols[len]) { + dist_symbols[len] = dist_symbol; + } + }); + if (dist_symbols.size() <= min_length) continue; + { + size_t best_cost = dist_symbols.back(); + for (size_t j = dist_symbols.size() - 1; j >= min_length; j--) { + if (dist_symbols[j] < best_cost) { + best_cost = dist_symbols[j]; + } + dist_symbols[j] = best_cost; + } + } + for (size_t j = min_length; j < 
dist_symbols.size(); j++) { + // Cost model that uses results from lazy LZ77. + float lz77_cost = sce.LenCost(in[i].context, j - min_length, lz77) + + sce.DistCost(dist_symbols[j], lz77); + float cost = prefix_costs[i].total_cost + lz77_cost; + if (prefix_costs[i + j].total_cost > cost) { + prefix_costs[i + j].len = j; + prefix_costs[i + j].dist_symbol = dist_symbols[j] + 1; + prefix_costs[i + j].ctx = in[i].context; + prefix_costs[i + j].total_cost = cost; + } + } + // We are in a RLE sequence: skip all the symbols except the first 8 and + // the last 8. This avoid quadratic costs for sequences with long runs of + // the same symbol. + if ((dist_symbols.back() == 0 && distance_multiplier == 0) || + (dist_symbols.back() == 1 && distance_multiplier != 0)) { + rle_length++; + } else { + rle_length = 0; + } + if (rle_length >= 8 && dist_symbols.size() > 9) { + skip_lz77 = dist_symbols.size() - 10; + rle_length = 0; + } + } + size_t pos = in.size(); + while (pos > 0) { + bool is_lz77_length = prefix_costs[pos].dist_symbol != 0; + if (is_lz77_length) { + size_t dist_symbol = prefix_costs[pos].dist_symbol - 1; + out.emplace_back(lz77.nonserialized_distance_context, dist_symbol); + } + size_t val = is_lz77_length ? 
prefix_costs[pos].len - min_length + : in[pos - 1].value; + out.emplace_back(prefix_costs[pos].ctx, val); + out.back().is_lz77_length = is_lz77_length; + pos -= prefix_costs[pos].len; + } + std::reverse(out.begin(), out.end()); + } +} + +void ApplyLZ77(const HistogramParams& params, size_t num_contexts, + const std::vector>& tokens, LZ77Params& lz77, + std::vector>& tokens_lz77) { + lz77.enabled = false; + if (params.force_huffman) { + lz77.min_symbol = std::min(PREFIX_MAX_ALPHABET_SIZE - 32, 512); + } else { + lz77.min_symbol = 224; + } + if (params.lz77_method == HistogramParams::LZ77Method::kNone) { + return; + } else if (params.lz77_method == HistogramParams::LZ77Method::kRLE) { + ApplyLZ77_RLE(params, num_contexts, tokens, lz77, tokens_lz77); + } else if (params.lz77_method == HistogramParams::LZ77Method::kLZ77) { + ApplyLZ77_LZ77(params, num_contexts, tokens, lz77, tokens_lz77); + } else if (params.lz77_method == HistogramParams::LZ77Method::kOptimal) { + ApplyLZ77_Optimal(params, num_contexts, tokens, lz77, tokens_lz77); + } else { + JXL_ABORT("Not implemented"); + } +} +} // namespace + +size_t BuildAndEncodeHistograms(const HistogramParams& params, + size_t num_contexts, + std::vector>& tokens, + EntropyEncodingData* codes, + std::vector* context_map, + BitWriter* writer, size_t layer, + AuxOut* aux_out) { + size_t total_bits = 0; + codes->lz77.nonserialized_distance_context = num_contexts; + std::vector> tokens_lz77; + ApplyLZ77(params, num_contexts, tokens, codes->lz77, tokens_lz77); + if (ans_fuzzer_friendly_) { + codes->lz77.length_uint_config = HybridUintConfig(10, 0, 0); + codes->lz77.min_symbol = 2048; + } + + const size_t max_contexts = std::min(num_contexts, kClustersLimit); + BitWriter::Allotment allotment(writer, + 128 + num_contexts * 40 + max_contexts * 96); + if (writer) { + JXL_CHECK(Bundle::Write(codes->lz77, writer, layer, aux_out)); + } else { + size_t ebits, bits; + JXL_CHECK(Bundle::CanEncode(codes->lz77, &ebits, &bits)); + total_bits 
+= bits; + } + if (codes->lz77.enabled) { + if (writer) { + size_t b = writer->BitsWritten(); + EncodeUintConfig(codes->lz77.length_uint_config, writer, + /*log_alpha_size=*/8); + total_bits += writer->BitsWritten() - b; + } else { + SizeWriter size_writer; + EncodeUintConfig(codes->lz77.length_uint_config, &size_writer, + /*log_alpha_size=*/8); + total_bits += size_writer.size; + } + num_contexts += 1; + tokens = std::move(tokens_lz77); + } + size_t total_tokens = 0; + // Build histograms. + HistogramBuilder builder(num_contexts); + HybridUintConfig uint_config; // Default config for clustering. + // Unless we are using the kContextMap histogram option. + if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) { + uint_config = HybridUintConfig(2, 0, 1); + } + if (ans_fuzzer_friendly_) { + uint_config = HybridUintConfig(10, 0, 0); + } + for (size_t i = 0; i < tokens.size(); ++i) { + for (size_t j = 0; j < tokens[i].size(); ++j) { + const Token token = tokens[i][j]; + total_tokens++; + uint32_t tok, nbits, bits; + (token.is_lz77_length ? codes->lz77.length_uint_config : uint_config) + .Encode(token.value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? codes->lz77.min_symbol : 0; + builder.VisitSymbol(tok, token.context); + } + } + + bool use_prefix_code = + params.force_huffman || total_tokens < 100 || + params.clustering == HistogramParams::ClusteringType::kFastest || + ans_fuzzer_friendly_; + if (!use_prefix_code) { + bool all_singleton = true; + for (size_t i = 0; i < num_contexts; i++) { + if (builder.Histo(i).ShannonEntropy() >= 1e-5) { + all_singleton = false; + } + } + if (all_singleton) { + use_prefix_code = true; + } + } + + // Encode histograms. 
+ total_bits += builder.BuildAndStoreEntropyCodes(params, tokens, codes, + context_map, use_prefix_code, + writer, layer, aux_out); + allotment.FinishedHistogram(writer); + ReclaimAndCharge(writer, &allotment, layer, aux_out); + + if (aux_out != nullptr) { + aux_out->layers[layer].num_clustered_histograms += + codes->encoding_info.size(); + } + return total_bits; +} + +size_t WriteTokens(const std::vector& tokens, + const EntropyEncodingData& codes, + const std::vector& context_map, BitWriter* writer) { + size_t num_extra_bits = 0; + if (codes.use_prefix_code) { + for (size_t i = 0; i < tokens.size(); i++) { + uint32_t tok, nbits, bits; + const Token& token = tokens[i]; + size_t histo = context_map[token.context]; + (token.is_lz77_length ? codes.lz77.length_uint_config + : codes.uint_config[histo]) + .Encode(token.value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? codes.lz77.min_symbol : 0; + // Combine two calls to the BitWriter. Equivalent to: + // writer->Write(codes.encoding_info[histo][tok].depth, + // codes.encoding_info[histo][tok].bits); + // writer->Write(nbits, bits); + uint64_t data = codes.encoding_info[histo][tok].bits; + data |= bits << codes.encoding_info[histo][tok].depth; + writer->Write(codes.encoding_info[histo][tok].depth + nbits, data); + num_extra_bits += nbits; + } + return num_extra_bits; + } + std::vector out; + std::vector out_nbits; + out.reserve(tokens.size()); + out_nbits.reserve(tokens.size()); + uint64_t allbits = 0; + size_t numallbits = 0; + // Writes in *reversed* order. 
+ auto addbits = [&](size_t bits, size_t nbits) { + JXL_DASSERT(bits >> nbits == 0); + if (JXL_UNLIKELY(numallbits + nbits > BitWriter::kMaxBitsPerCall)) { + out.push_back(allbits); + out_nbits.push_back(numallbits); + numallbits = allbits = 0; + } + allbits <<= nbits; + allbits |= bits; + numallbits += nbits; + }; + const int end = tokens.size(); + ANSCoder ans; + for (int i = end - 1; i >= 0; --i) { + const Token token = tokens[i]; + const uint8_t histo = context_map[token.context]; + uint32_t tok, nbits, bits; + (token.is_lz77_length ? codes.lz77.length_uint_config + : codes.uint_config[histo]) + .Encode(tokens[i].value, &tok, &nbits, &bits); + tok += token.is_lz77_length ? codes.lz77.min_symbol : 0; + const ANSEncSymbolInfo& info = codes.encoding_info[histo][tok]; + // Extra bits first as this is reversed. + addbits(bits, nbits); + num_extra_bits += nbits; + uint8_t ans_nbits = 0; + uint32_t ans_bits = ans.PutSymbol(info, &ans_nbits); + addbits(ans_bits, ans_nbits); + } + const uint32_t state = ans.GetState(); + writer->Write(32, state); + writer->Write(numallbits, allbits); + for (int i = out.size(); i > 0; --i) { + writer->Write(out_nbits[i - 1], out[i - 1]); + } + return num_extra_bits; +} + +void WriteTokens(const std::vector& tokens, + const EntropyEncodingData& codes, + const std::vector& context_map, BitWriter* writer, + size_t layer, AuxOut* aux_out) { + BitWriter::Allotment allotment(writer, 32 * tokens.size() + 32 * 1024 * 4); + size_t num_extra_bits = WriteTokens(tokens, codes, context_map, writer); + ReclaimAndCharge(writer, &allotment, layer, aux_out); + if (aux_out != nullptr) { + aux_out->layers[layer].extra_bits += num_extra_bits; + } +} + +void SetANSFuzzerFriendly(bool ans_fuzzer_friendly) { +#if JXL_IS_DEBUG_BUILD // Guard against accidental / malicious changes. 
+ ans_fuzzer_friendly_ = ans_fuzzer_friendly; +#endif +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.h new file mode 100644 index 0000000000..9614ede9c6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans.h @@ -0,0 +1,142 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_ANS_H_ +#define LIB_JXL_ENC_ANS_H_ + +// Library to encode the ANS population counts to the bit-stream and encode +// symbols based on the respective distributions. + +#include +#include +#include +#include +#include + +#include +#include + +#include "lib/jxl/ans_common.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_ans_params.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/huffman_table.h" + +namespace jxl { + +#define USE_MULT_BY_RECIPROCAL + +// precision must be equal to: #bits(state_) + #bits(freq) +#define RECIPROCAL_PRECISION (32 + ANS_LOG_TAB_SIZE) + +// Data structure representing one element of the encoding table built +// from a distribution. +// TODO(veluca): split this up, or use an union. +struct ANSEncSymbolInfo { + // ANS + uint16_t freq_; + std::vector reverse_map_; +#ifdef USE_MULT_BY_RECIPROCAL + uint64_t ifreq_; +#endif + // Prefix coding. 
+ uint8_t depth; + uint16_t bits; +}; + +class ANSCoder { + public: + ANSCoder() : state_(ANS_SIGNATURE << 16) {} + + uint32_t PutSymbol(const ANSEncSymbolInfo& t, uint8_t* nbits) { + uint32_t bits = 0; + *nbits = 0; + if ((state_ >> (32 - ANS_LOG_TAB_SIZE)) >= t.freq_) { + bits = state_ & 0xffff; + state_ >>= 16; + *nbits = 16; + } +#ifdef USE_MULT_BY_RECIPROCAL + // We use mult-by-reciprocal trick, but that requires 64b calc. + const uint32_t v = (state_ * t.ifreq_) >> RECIPROCAL_PRECISION; + const uint32_t offset = t.reverse_map_[state_ - v * t.freq_]; + state_ = (v << ANS_LOG_TAB_SIZE) + offset; +#else + state_ = ((state_ / t.freq_) << ANS_LOG_TAB_SIZE) + + t.reverse_map_[state_ % t.freq_]; +#endif + return bits; + } + + uint32_t GetState() const { return state_; } + + private: + uint32_t state_; +}; + +// RebalanceHistogram requires a signed type. +using ANSHistBin = int32_t; + +struct EntropyEncodingData { + std::vector> encoding_info; + bool use_prefix_code; + std::vector uint_config; + LZ77Params lz77; +}; + +// Integer to be encoded by an entropy coder, either ANS or Huffman. +struct Token { + Token(uint32_t c, uint32_t value) + : is_lz77_length(false), context(c), value(value) {} + uint32_t is_lz77_length : 1; + uint32_t context : 31; + uint32_t value; +}; + +// Returns an estimate of the number of bits required to encode the given +// histogram (header bits plus data bits). +float ANSPopulationCost(const ANSHistBin* data, size_t alphabet_size); + +// Apply context clustering, compute histograms and encode them. Returns an +// estimate of the total bits used for encoding the stream. If `writer` == +// nullptr, the bit estimate will not take into account the context map (which +// does not get written if `num_contexts` == 1). 
+size_t BuildAndEncodeHistograms(const HistogramParams& params, + size_t num_contexts, + std::vector>& tokens, + EntropyEncodingData* codes, + std::vector* context_map, + BitWriter* writer, size_t layer, + AuxOut* aux_out); + +// Write the tokens to `writer` (a BitWriter); the allotment is created internally. +void WriteTokens(const std::vector& tokens, + const EntropyEncodingData& codes, + const std::vector& context_map, BitWriter* writer, + size_t layer, AuxOut* aux_out); + +// Same as above, but assumes allotment created by caller. +size_t WriteTokens(const std::vector& tokens, + const EntropyEncodingData& codes, + const std::vector& context_map, BitWriter* writer); + +// Exposed for tests; to be used with Writer=BitWriter only. +template +void EncodeUintConfigs(const std::vector& uint_config, + Writer* writer, size_t log_alpha_size); +extern template void EncodeUintConfigs(const std::vector&, + BitWriter*, size_t); + +// Globally set the option to create fuzzer-friendly ANS streams. Negatively +// impacts compression. Not thread-safe. +void SetANSFuzzerFriendly(bool ans_fuzzer_friendly); +} // namespace jxl + +#endif // LIB_JXL_ENC_ANS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans_params.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans_params.h new file mode 100644 index 0000000000..6f7cd897cc --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ans_params.h @@ -0,0 +1,75 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_ANS_PARAMS_H_ +#define LIB_JXL_ENC_ANS_PARAMS_H_ + +// Encoder-only parameters needed for ANS entropy encoding methods. + +#include +#include + +#include "lib/jxl/enc_params.h" + +namespace jxl { + +struct HistogramParams { + enum class ClusteringType { + kFastest, // Only 4 clusters. + kFast, + kBest, + }; + + enum class HybridUintMethod { + kNone, // just use kHybridUint420Config. 
+ kFast, // just try a couple of options. + kContextMap, // fast choice for ctx map. + kBest, + }; + + enum class LZ77Method { + kNone, // do not try lz77. + kRLE, // only try doing RLE. + kLZ77, // try lz77 with backward references. + kOptimal, // optimal-matching LZ77 parsing. + }; + + enum class ANSHistogramStrategy { + kFast, // Only try some methods, early exit. + kApproximate, // Only try some methods. + kPrecise, // Try all methods. + }; + + HistogramParams() = default; + + HistogramParams(SpeedTier tier, size_t num_ctx) { // NOTE(review): num_ctx is not used in this constructor body; only `tier` selects the settings. + if (tier > SpeedTier::kFalcon) { + clustering = ClusteringType::kFastest; + lz77_method = LZ77Method::kNone; + } else if (tier > SpeedTier::kTortoise) { + clustering = ClusteringType::kFast; + } else { + clustering = ClusteringType::kBest; + } + if (tier > SpeedTier::kTortoise) { + uint_method = HybridUintMethod::kNone; + } + if (tier >= SpeedTier::kSquirrel) { + ans_histogram_strategy = ANSHistogramStrategy::kApproximate; + } + } + + ClusteringType clustering = ClusteringType::kBest; + HybridUintMethod uint_method = HybridUintMethod::kBest; + LZ77Method lz77_method = LZ77Method::kRLE; + ANSHistogramStrategy ans_histogram_strategy = ANSHistogramStrategy::kPrecise; + std::vector image_widths; + size_t max_histograms = ~0; + bool force_huffman = false; +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_ANS_PARAMS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc new file mode 100644 index 0000000000..f43340eda4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.cc @@ -0,0 +1,318 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_ar_control_field.h" + +#include +#include + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_ar_control_field.cc" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +void ProcessTile(const Image3F& opsin, PassesEncoderState* enc_state, + const Rect& rect, + ArControlFieldHeuristics::TempImages* temp_image) { + constexpr size_t N = kBlockDim; + ImageB* JXL_RESTRICT epf_sharpness = &enc_state->shared.epf_sharpness; + ImageF* JXL_RESTRICT quant = &enc_state->initial_quant_field; + JXL_ASSERT( + epf_sharpness->xsize() == enc_state->shared.frame_dim.xsize_blocks && + epf_sharpness->ysize() == enc_state->shared.frame_dim.ysize_blocks); + + if (enc_state->cparams.butteraugli_distance < kMinButteraugliForDynamicAR || + enc_state->cparams.speed_tier > SpeedTier::kWombat || + enc_state->shared.frame_header.loop_filter.epf_iters == 0) { + FillPlane(static_cast(4), epf_sharpness, rect); + return; + } + + // Likely better to have a higher X weight, like: + // const float kChannelWeights[3] = {47.0f, 4.35f, 0.287f}; + const float kChannelWeights[3] = {4.35f, 4.35f, 0.287f}; + const float kChannelWeightsLapNeg[3] = {-0.125f * kChannelWeights[0], + -0.125f * kChannelWeights[1], + -0.125f * kChannelWeights[2]}; + const size_t sharpness_stride = + static_cast(epf_sharpness->PixelsPerRow()); + + size_t by0 = rect.y0(); + size_t by1 = rect.y0() + rect.ysize(); + size_t bx0 = rect.x0(); + size_t bx1 = rect.x0() + 
rect.xsize(); + temp_image->InitOnce(); + ImageF& laplacian_sqrsum = temp_image->laplacian_sqrsum; + // Calculate the L2 of the 3x3 Laplacian in an integral transform + // (for example 32x32 dct). This relates to the transform's ability + // to propagate artefacts. + size_t y0 = by0 == 0 ? 2 : 0; + size_t y1 = by1 * N + 4 <= opsin.ysize() + 2 ? (by1 - by0) * N + 4 + : opsin.ysize() + 2 - by0 * N; + size_t x0 = bx0 == 0 ? 2 : 0; + size_t x1 = bx1 * N + 4 <= opsin.xsize() + 2 ? (bx1 - bx0) * N + 4 + : opsin.xsize() + 2 - bx0 * N; + HWY_FULL(float) df; + for (size_t y = y0; y < y1; y++) { + float* JXL_RESTRICT laplacian_sqrsum_row = laplacian_sqrsum.Row(y); + size_t cy = y + by0 * N - 2; + const float* JXL_RESTRICT in_row_t[3]; + const float* JXL_RESTRICT in_row[3]; + const float* JXL_RESTRICT in_row_b[3]; + for (size_t c = 0; c < 3; c++) { + in_row_t[c] = opsin.PlaneRow(c, cy > 0 ? cy - 1 : cy); + in_row[c] = opsin.PlaneRow(c, cy); + in_row_b[c] = opsin.PlaneRow(c, cy + 1 < opsin.ysize() ? cy + 1 : cy); + } + auto compute_laplacian_scalar = [&](size_t x) { + size_t cx = x + bx0 * N - 2; + const size_t prevX = cx >= 1 ? cx - 1 : cx; + const size_t nextX = cx + 1 < opsin.xsize() ? cx + 1 : cx; + float sumsqr = 0; + for (size_t c = 0; c < 3; c++) { + float laplacian = + kChannelWeights[c] * in_row[c][cx] + + kChannelWeightsLapNeg[c] * + (in_row[c][prevX] + in_row[c][nextX] + in_row_b[c][prevX] + + in_row_b[c][cx] + in_row_b[c][nextX] + in_row_t[c][prevX] + + in_row_t[c][cx] + in_row_t[c][nextX]); + sumsqr += laplacian * laplacian; + } + laplacian_sqrsum_row[x] = sumsqr; + }; + size_t x = x0; + for (; x + bx0 * N < 3; x++) { + compute_laplacian_scalar(x); + } + // Interior. One extra pixel of border as the last pixel is special. 
+ for (; x + Lanes(df) <= x1 && x + Lanes(df) + bx0 * N - 1 <= opsin.xsize(); + x += Lanes(df)) { + size_t cx = x + bx0 * N - 2; + auto sumsqr = Zero(df); + for (size_t c = 0; c < 3; c++) { + auto laplacian = + LoadU(df, in_row[c] + cx) * Set(df, kChannelWeights[c]); + auto sum_oth0 = LoadU(df, in_row[c] + cx - 1); + auto sum_oth1 = LoadU(df, in_row[c] + cx + 1); + auto sum_oth2 = LoadU(df, in_row_t[c] + cx - 1); + auto sum_oth3 = LoadU(df, in_row_t[c] + cx); + sum_oth0 += LoadU(df, in_row_t[c] + cx + 1); + sum_oth1 += LoadU(df, in_row_b[c] + cx - 1); + sum_oth2 += LoadU(df, in_row_b[c] + cx); + sum_oth3 += LoadU(df, in_row_b[c] + cx + 1); + sum_oth0 += sum_oth1; + sum_oth2 += sum_oth3; + sum_oth0 += sum_oth2; + laplacian = + MulAdd(Set(df, kChannelWeightsLapNeg[c]), sum_oth0, laplacian); + sumsqr = MulAdd(laplacian, laplacian, sumsqr); + } + StoreU(sumsqr, df, laplacian_sqrsum_row + x); + } + for (; x < x1; x++) { + compute_laplacian_scalar(x); + } + } + HWY_CAPPED(float, 4) df4; + // Calculate the L2 of the 3x3 Laplacian in 4x4 blocks within the area + // of the integral transform. Sample them within the integral transform + // with two offsets (0,0) and (-2, -2) pixels (sqrsum_00 and sqrsum_22, + // respectively). 
+ ImageF& sqrsum_00 = temp_image->sqrsum_00; + size_t sqrsum_00_stride = sqrsum_00.PixelsPerRow(); + float* JXL_RESTRICT sqrsum_00_row = sqrsum_00.Row(0); + for (size_t y = 0; y < (by1 - by0) * 2; y++) { + const float* JXL_RESTRICT rows_in[4]; + for (size_t iy = 0; iy < 4; iy++) { + rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy + 2); + } + float* JXL_RESTRICT row_out = sqrsum_00_row + y * sqrsum_00_stride; + for (size_t x = 0; x < (bx1 - bx0) * 2; x++) { + auto sum = Zero(df4); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix += Lanes(df4)) { + sum += LoadU(df4, rows_in[iy] + x * 4 + ix + 2); + } + } + row_out[x] = GetLane(Sqrt(SumOfLanes(sum))) * (1.0f / 4.0f); + } + } + // Indexing iy and ix is a bit tricky as we include a 2 pixel border + // around the block for evenness calculations. This is similar to what + // we did in guetzli for the observability of artefacts, except there + // the element is a sliding 5x5, not sparsely sampled 4x4 box like here. + ImageF& sqrsum_22 = temp_image->sqrsum_22; + size_t sqrsum_22_stride = sqrsum_22.PixelsPerRow(); + float* JXL_RESTRICT sqrsum_22_row = sqrsum_22.Row(0); + for (size_t y = 0; y < (by1 - by0) * 2 + 1; y++) { + const float* JXL_RESTRICT rows_in[4]; + for (size_t iy = 0; iy < 4; iy++) { + rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy); + } + float* JXL_RESTRICT row_out = sqrsum_22_row + y * sqrsum_22_stride; + // ignore pixels outside the image. + // Y coordinates are relative to by0*8+y*4. + size_t sy = y * 4 + by0 * 8 > 0 ? 0 : 2; + size_t ey = y * 4 + by0 * 8 + 4 <= opsin.ysize() + 2 + ? 4 + : opsin.ysize() - y * 4 - by0 * 8 + 2; + for (size_t x = 0; x < (bx1 - bx0) * 2 + 1; x++) { + // ignore pixels outside the image. + // X coordinates are relative to bx0*8. + size_t sx = x * 4 + bx0 * 8 > 0 ? x * 4 : x * 4 + 2; + size_t ex = x * 4 + bx0 * 8 + 4 <= opsin.xsize() + 2 + ? 
x * 4 + 4 + : opsin.xsize() - bx0 * 8 + 2; + if (ex - sx == 4 && ey - sy == 4) { + auto sum = Zero(df4); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix += Lanes(df4)) { + sum += Load(df4, rows_in[iy] + sx + ix); + } + } + row_out[x] = GetLane(Sqrt(SumOfLanes(sum))) * (1.0f / 4.0f); + } else { + float sum = 0; + for (size_t iy = sy; iy < ey; iy++) { + for (size_t ix = sx; ix < ex; ix++) { + sum += rows_in[iy][ix]; + } + } + row_out[x] = std::sqrt(sum / ((ex - sx) * (ey - sy))); + } + } + } + for (size_t by = by0; by < by1; by++) { + AcStrategyRow acs_row = enc_state->shared.ac_strategy.ConstRow(by); + uint8_t* JXL_RESTRICT out_row = epf_sharpness->Row(by); + float* JXL_RESTRICT quant_row = quant->Row(by); + for (size_t bx = bx0; bx < bx1; bx++) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + // The errors are going to be linear to the quantization value in this + // locality. We only have access to the initial quant field here. + float quant_val = 1.0f / quant_row[bx]; + + const auto sq00 = [&](size_t y, size_t x) { + return sqrsum_00_row[((by - by0) * 2 + y) * sqrsum_00_stride + + (bx - bx0) * 2 + x]; + }; + const auto sq22 = [&](size_t y, size_t x) { + return sqrsum_22_row[((by - by0) * 2 + y) * sqrsum_22_stride + + (bx - bx0) * 2 + x]; + }; + float sqrsum_integral_transform = 0; + for (size_t iy = 0; iy < acs.covered_blocks_y() * 2; iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x() * 2; ix++) { + sqrsum_integral_transform += sq00(iy, ix) * sq00(iy, ix); + } + } + sqrsum_integral_transform /= + 4 * acs.covered_blocks_x() * acs.covered_blocks_y(); + sqrsum_integral_transform = std::sqrt(sqrsum_integral_transform); + // If masking is high or amplitude of the artefacts is low, then no + // smoothing is needed. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + // Five 4x4 blocks for masking estimation, all within the + // 8x8 area. 
+ float minval_1 = std::min(sq00(2 * iy + 0, 2 * ix + 0), + sq00(2 * iy + 0, 2 * ix + 1)); + float minval_2 = std::min(sq00(2 * iy + 1, 2 * ix + 0), + sq00(2 * iy + 1, 2 * ix + 1)); + float minval = std::min(minval_1, minval_2); + minval = std::min(minval, sq22(2 * iy + 1, 2 * ix + 1)); + // Nine more 4x4 blocks for masking estimation, includes + // the 2 pixel area around the 8x8 block being controlled. + float minval2_1 = std::min(sq22(2 * iy + 0, 2 * ix + 0), + sq22(2 * iy + 0, 2 * ix + 1)); + float minval2_2 = std::min(sq22(2 * iy + 0, 2 * ix + 2), + sq22(2 * iy + 1, 2 * ix + 0)); + float minval2_3 = std::min(sq22(2 * iy + 1, 2 * ix + 1), + sq22(2 * iy + 1, 2 * ix + 2)); + float minval2_4 = std::min(sq22(2 * iy + 2, 2 * ix + 0), + sq22(2 * iy + 2, 2 * ix + 1)); + float minval2_5 = std::min(minval2_1, minval2_2); + float minval2_6 = std::min(minval2_3, minval2_4); + float minval2 = std::min(minval2_5, minval2_6); + minval2 = std::min(minval2, sq22(2 * iy + 2, 2 * ix + 2)); + float minval3 = std::min(minval, minval2); + minval *= 0.125f; + minval += 0.625f * minval3; + minval += + 0.125f * std::min(1.5f * minval3, sq22(2 * iy + 1, 2 * ix + 1)); + minval += 0.125f * minval2; + // Larger kBias, less smoothing for low intensity changes. NOTE(review): no kBias exists in this block; presumably this refers to `bias` below - confirm. + float kDeltaLimit = 3.2; + float bias = 0.0625f * quant_val; + float delta = + (sqrsum_integral_transform + (kDeltaLimit + 0.05) * bias) / + (minval + bias); + int out = 4; + if (delta > kDeltaLimit) { + out = 4; // smooth (same value as the initializer above) + } else { + out = 0; + } + // 'threshold' is separate from 'bias' for easier tuning of these + // heuristics. 
+ float threshold = 0.0625f * quant_val; + const float kSmoothLimit = 0.085f; + float smooth = 0.20f * (sq00(2 * iy + 0, 2 * ix + 0) + + sq00(2 * iy + 0, 2 * ix + 1) + + sq00(2 * iy + 1, 2 * ix + 0) + + sq00(2 * iy + 1, 2 * ix + 1) + minval); + if (smooth < kSmoothLimit * threshold) { + out = 4; + } + out_row[bx + sharpness_stride * iy + ix] = out; + } + } + } + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ProcessTile); + +void ArControlFieldHeuristics::RunRect(const Rect& block_rect, + const Image3F& opsin, + PassesEncoderState* enc_state, + size_t thread) { + HWY_DYNAMIC_DISPATCH(ProcessTile) + (opsin, enc_state, block_rect, &temp_images[thread]); +} + +} // namespace jxl + +#endif diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.h new file mode 100644 index 0000000000..ae9d399b92 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_ar_control_field.h @@ -0,0 +1,49 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_AR_CONTROL_FIELD_H_ +#define LIB_JXL_ENC_AR_CONTROL_FIELD_H_ + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +struct ArControlFieldHeuristics { + struct TempImages { + void InitOnce() { + if (laplacian_sqrsum.xsize() != 0) return; + laplacian_sqrsum = ImageF(kEncTileDim + 4, kEncTileDim + 4); + sqrsum_00 = ImageF(kEncTileDim / 4, kEncTileDim / 4); + sqrsum_22 = ImageF(kEncTileDim / 4 + 1, kEncTileDim / 4 + 1); + } + + ImageF laplacian_sqrsum; + ImageF sqrsum_00; + ImageF sqrsum_22; + }; + + void PrepareForThreads(size_t num_threads) { + temp_images.resize(num_threads); + } + + void RunRect(const Rect& block_rect, const Image3F& opsin, + PassesEncoderState* enc_state, size_t thread); + + std::vector temp_images; + ImageB* epf_sharpness; + ImageF* quant; + bool all_default; +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_AR_CONTROL_FIELD_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc new file mode 100644 index 0000000000..50e13f3883 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.cc @@ -0,0 +1,379 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_bit_writer.h" + +#include // memcpy + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/dec_bit_reader.h" + +namespace jxl { + +BitWriter::Allotment::Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits) + : max_bits_(max_bits) { + if (writer == nullptr) return; + prev_bits_written_ = writer->BitsWritten(); + const size_t prev_bytes = writer->storage_.size(); + const size_t next_bytes = DivCeil(max_bits, kBitsPerByte); + writer->storage_.resize(prev_bytes + next_bytes); + parent_ = writer->current_allotment_; + writer->current_allotment_ = this; +} + +BitWriter::Allotment::~Allotment() { + if (!called_) { + // Not calling is a bug - unused storage will not be reclaimed. + JXL_ABORT("Did not call Allotment::ReclaimUnused"); + } +} + +void BitWriter::Allotment::FinishedHistogram(BitWriter* JXL_RESTRICT writer) { + if (writer == nullptr) return; + JXL_ASSERT(!called_); // Call before ReclaimUnused + JXL_ASSERT(histogram_bits_ == 0); // Do not call twice + JXL_ASSERT(writer->BitsWritten() >= prev_bits_written_); + histogram_bits_ = writer->BitsWritten() - prev_bits_written_; +} + +void BitWriter::Allotment::PrivateReclaim(BitWriter* JXL_RESTRICT writer, + size_t* JXL_RESTRICT used_bits, + size_t* JXL_RESTRICT unused_bits) { + JXL_ASSERT(!called_); // Do not call twice + called_ = true; + if (writer == nullptr) return; + + JXL_ASSERT(writer->BitsWritten() >= prev_bits_written_); + *used_bits = writer->BitsWritten() - prev_bits_written_; + JXL_ASSERT(*used_bits <= max_bits_); + *unused_bits = max_bits_ - *used_bits; + + // Reclaim unused whole bytes from writer's allotment. + const size_t unused_bytes = *unused_bits / kBitsPerByte; // truncate + JXL_ASSERT(writer->storage_.size() >= unused_bytes); + writer->storage_.resize(writer->storage_.size() - unused_bytes); + writer->current_allotment_ = parent_; + // Ensure we don't also charge the parent for these bits. 
+ auto parent = parent_; + while (parent != nullptr) { + parent->prev_bits_written_ += *used_bits; + parent = parent->parent_; + } +} + +void BitWriter::AppendByteAligned(const Span& span) { + if (!span.size()) return; + storage_.resize(storage_.size() + span.size() + 1); // extra zero padding + + // Concatenate by copying bytes because both source and destination are bytes. + JXL_ASSERT(BitsWritten() % kBitsPerByte == 0); + size_t pos = BitsWritten() / kBitsPerByte; + memcpy(storage_.data() + pos, span.data(), span.size()); + pos += span.size(); + storage_[pos++] = 0; // for next Write + JXL_ASSERT(pos <= storage_.size()); + bits_written_ += span.size() * kBitsPerByte; +} + +void BitWriter::AppendByteAligned(const BitWriter& other) { + JXL_ASSERT(other.BitsWritten() % kBitsPerByte == 0); + JXL_ASSERT(other.BitsWritten() / kBitsPerByte != 0); + + AppendByteAligned(other.GetSpan()); +} + +void BitWriter::AppendByteAligned(const std::vector& others) { + // Total size to add so we can preallocate + size_t other_bytes = 0; + for (const BitWriter& writer : others) { + JXL_ASSERT(writer.BitsWritten() % kBitsPerByte == 0); + other_bytes += writer.BitsWritten() / kBitsPerByte; + } + if (other_bytes == 0) { + // No bytes to append: this happens for example when creating per-group + // storage for groups, but not writing anything in them for e.g. lossless + // images with no alpha. Do nothing. + return; + } + storage_.resize(storage_.size() + other_bytes + 1); // extra zero padding + + // Concatenate by copying bytes because both source and destination are bytes. 
+ JXL_ASSERT(BitsWritten() % kBitsPerByte == 0); + size_t pos = BitsWritten() / kBitsPerByte; + for (const BitWriter& writer : others) { + const Span span = writer.GetSpan(); + memcpy(storage_.data() + pos, span.data(), span.size()); + pos += span.size(); + } + storage_[pos++] = 0; // for next Write + JXL_ASSERT(pos <= storage_.size()); + bits_written_ += other_bytes * kBitsPerByte; +} + +// TODO(lode): avoid code duplication +void BitWriter::AppendByteAligned( + const std::vector>& others) { + // Total size to add so we can preallocate + size_t other_bytes = 0; + for (const auto& writer : others) { + JXL_ASSERT(writer->BitsWritten() % kBitsPerByte == 0); + other_bytes += writer->BitsWritten() / kBitsPerByte; + } + if (other_bytes == 0) { + // No bytes to append: this happens for example when creating per-group + // storage for groups, but not writing anything in them for e.g. lossless + // images with no alpha. Do nothing. + return; + } + storage_.resize(storage_.size() + other_bytes + 1); // extra zero padding + + // Concatenate by copying bytes because both source and destination are bytes. + JXL_ASSERT(BitsWritten() % kBitsPerByte == 0); + size_t pos = BitsWritten() / kBitsPerByte; + for (const auto& writer : others) { + const Span span = writer->GetSpan(); + memcpy(storage_.data() + pos, span.data(), span.size()); + pos += span.size(); + } + storage_[pos++] = 0; // for next Write + JXL_ASSERT(pos <= storage_.size()); + bits_written_ += other_bytes * kBitsPerByte; +} + +BitWriter& BitWriter::operator+=(const BitWriter& other) { + // Required for correctness, otherwise owned[bits_written_] is out of bounds. + if (other.bits_written_ == 0) return *this; + const size_t other_bytes = DivCeil(other.bits_written_, kBitsPerByte); + const size_t prev_bytes = storage_.size(); + storage_.resize(prev_bytes + other_bytes + 1); // extra zero padding + + if (bits_written_ % kBitsPerByte == 0) { + // Only copy fully-initialized bytes. 
+ const size_t full_bytes = other.bits_written_ / kBitsPerByte; // truncated + memcpy(&storage_[bits_written_ / kBitsPerByte], other.storage_.data(), + full_bytes); + storage_[bits_written_ / kBitsPerByte + full_bytes] = 0; // for next Write + bits_written_ += full_bytes * kBitsPerByte; + + const size_t leftovers = other.bits_written_ % kBitsPerByte; + if (leftovers != 0) { + BitReader reader(Span(other.storage_.data() + full_bytes, + other_bytes - full_bytes)); + Write(leftovers, reader.ReadBits(leftovers)); + JXL_CHECK(reader.Close()); + } + return *this; + } + + constexpr size_t N = kMaxBitsPerCall < BitReader::kMaxBitsPerCall + ? kMaxBitsPerCall + : BitReader::kMaxBitsPerCall; + + // Do not use GetSpan because other may not be byte-aligned. + BitReader reader(other.storage_); + size_t i = 0; + for (; i + N <= other.bits_written_; i += N) { + Write(N, reader.ReadFixedBits()); + } + const size_t leftovers = other.bits_written_ - i; + if (leftovers != 0) { + Write(leftovers, reader.ReadBits(leftovers)); + } + JXL_CHECK(reader.Close()); + return *this; +} + +#ifndef DISABLE_ACC_BIT_WRITER +void BitWriter::init(size_t cnt){ + cur_part = 0; + nbits_streams.resize(cnt); + bits_streams.resize(cnt); +} + +void BitWriter::update_part(size_t cnt){ + cur_part=cnt; +} +#endif +// Example: let's assume that 3 bits (Rs below) have been written already: +// BYTE+0 BYTE+1 BYTE+2 +// 0000 0RRR ???? ???? ???? ???? +// +// Now, we could write up to 5 bits by just shifting them left by 3 bits and +// OR'ing to BYTE-0. +// +// For n > 5 bits, we write the lowest 5 bits as above, then write the next +// lowest bits into BYTE+1 starting from its lower bits and so on. 
+#ifndef DISABLE_ACC_BIT_WRITER +void BitWriter::Write(size_t n_bits, uint64_t bits) { + JXL_DASSERT((bits >> n_bits) == 0); + JXL_DASSERT(n_bits <= kMaxBitsPerCall); + + nbits_streams[cur_part].push(n_bits); + bits_streams[cur_part].push(bits); +/* + uint8_t* p = &storage_[bits_written_ / kBitsPerByte]; + const size_t bits_in_first_byte = bits_written_ % kBitsPerByte; + bits <<= bits_in_first_byte; +#if JXL_BYTE_ORDER_LITTLE + uint64_t v = *p; + // Last (partial) or next byte to write must be zero-initialized! + // PaddedBytes initializes the first, and Write/Append maintain this. + JXL_DASSERT(v >> bits_in_first_byte == 0); + v |= bits; + memcpy(p, &v, sizeof(v)); // Write bytes: possibly more than n_bits/8 +#else + *p++ |= static_cast(bits & 0xFF); + for (size_t bits_left_to_write = n_bits + bits_in_first_byte; + bits_left_to_write >= 9; bits_left_to_write -= 8) { + bits >>= 8; + *p++ = static_cast(bits & 0xFF); + } + *p = 0; +#endif*/ + bits_written_ += n_bits; +} +#else +void BitWriter::Write(size_t n_bits, uint64_t bits) { + JXL_DASSERT((bits >> n_bits) == 0); + JXL_DASSERT(n_bits <= kMaxBitsPerCall); + uint8_t* p = &storage_[bits_written_ / kBitsPerByte]; + const size_t bits_in_first_byte = bits_written_ % kBitsPerByte; + bits <<= bits_in_first_byte; +#if JXL_BYTE_ORDER_LITTLE + uint64_t v = *p; + // Last (partial) or next byte to write must be zero-initialized! + // PaddedBytes initializes the first, and Write/Append maintain this. 
+ JXL_DASSERT(v >> bits_in_first_byte == 0); + v |= bits; + memcpy(p, &v, sizeof(v)); // Write bytes: possibly more than n_bits/8 +#else + *p++ |= static_cast(bits & 0xFF); + for (size_t bits_left_to_write = n_bits + bits_in_first_byte; + bits_left_to_write >= 9; bits_left_to_write -= 8) { + bits >>= 8; + *p++ = static_cast(bits & 0xFF); + } + *p = 0; +#endif + bits_written_ += n_bits; +} +#endif + +#ifndef DISABLE_ACC_BIT_WRITER +void BitWriter::Finalize(std::vector seq){ + int cnt=0; + storage_.resize(bits_written_); // NOTE(review): bits_written_ is a bit count, while other resize() calls in this file are in bytes; looks like an 8x over-allocation - confirm whether it is intentional headroom for the 8-byte memcpy below. + size_t bits_written=old_bits_written_; + for(size_t i=0;i> n_bits) == 0); + JXL_DASSERT(n_bits <= kMaxBitsPerCall); + + uint8_t* p = &storage_[bits_written / kBitsPerByte]; + const size_t bits_in_first_byte = bits_written % kBitsPerByte; + bits <<= bits_in_first_byte; +#if JXL_BYTE_ORDER_LITTLE + uint64_t v = *p; + // Last (partial) or next byte to write must be zero-initialized! + // PaddedBytes initializes the first, and Write/Append maintain this. + JXL_DASSERT(v >> bits_in_first_byte == 0); + v |= bits; + memcpy(p, &v, sizeof(v)); // Write bytes: possibly more than n_bits/8 +#else + *p++ |= static_cast(bits & 0xFF); + for (size_t bits_left_to_write = n_bits + bits_in_first_byte; + bits_left_to_write >= 9; bits_left_to_write -= 8) { + bits >>= 8; + *p++ = static_cast(bits & 0xFF); + } + *p = 0; +#endif + bits_written += n_bits; + cnt++; + } + } + + JXL_DASSERT(bits_written==bits_written_); + old_bits_written_=bits_written_; +} + +void BitWriter::Finalize(){ + int cnt=0; + storage_.resize(bits_written_); // NOTE(review): same as the overload above - resize() is given a bit count, not a byte count; confirm. 
+ size_t bits_written=old_bits_written_; + for(size_t i=0;i> n_bits) == 0); + JXL_DASSERT(n_bits <= kMaxBitsPerCall); + + uint8_t* p = &storage_[bits_written / kBitsPerByte]; + const size_t bits_in_first_byte = bits_written % kBitsPerByte; + bits <<= bits_in_first_byte; +#if JXL_BYTE_ORDER_LITTLE + uint64_t v = *p; + // Last (partial) or next byte to write must be zero-initialized! 
+ // PaddedBytes initializes the first, and Write/Append maintain this. + JXL_DASSERT(v >> bits_in_first_byte == 0); + v |= bits; + memcpy(p, &v, sizeof(v)); // Write bytes: possibly more than n_bits/8 +#else + *p++ |= static_cast(bits & 0xFF); + for (size_t bits_left_to_write = n_bits + bits_in_first_byte; + bits_left_to_write >= 9; bits_left_to_write -= 8) { + bits >>= 8; + *p++ = static_cast(bits & 0xFF); + } + *p = 0; +#endif + bits_written += n_bits; + cnt++; + } + } + + JXL_DASSERT(bits_written==bits_written_); + old_bits_written_=bits_written_; +} +#endif +BitWriter& BitWriter::operator+=(const PaddedBytes& other) { + const size_t other_bytes = other.size(); + // Required for correctness, otherwise owned[bits_written_] is out of bounds. + if (other_bytes == 0) return *this; + const size_t other_bits = other_bytes * kBitsPerByte; + + storage_.resize(storage_.size() + other_bytes + 1); + if (bits_written_ % kBitsPerByte == 0) { + memcpy(&storage_[bits_written_ / kBitsPerByte], other.data(), other_bytes); + storage_[bits_written_ / kBitsPerByte + other_bytes] = 0; // for next Write + bits_written_ += other_bits; + return *this; + } + constexpr size_t N = kMaxBitsPerCall < BitReader::kMaxBitsPerCall + ? kMaxBitsPerCall + : BitReader::kMaxBitsPerCall; + + BitReader reader(other); + size_t i = 0; + for (; i + N <= other_bits; i += N) { + Write(N, reader.ReadFixedBits()); + } + const size_t leftovers = other_bits - i; + Write(leftovers, reader.ReadBits(leftovers)); + JXL_CHECK(reader.Close()); + return *this; +} + +} // namespace jxl \ No newline at end of file diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.h new file mode 100644 index 0000000000..750a12b88e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_bit_writer.h @@ -0,0 +1,182 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_BIT_WRITER_H_ +#define LIB_JXL_ENC_BIT_WRITER_H_ + +// BitWriter class: unbuffered writes using unaligned 64-bit stores. +#include "hls_stream.h" +#include "ap_int.h" +#include + +#include +#include + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" + +#include "xlnx_cfg.h" + +namespace jxl { + +struct BitWriter { + // Upper bound on `n_bits` in each call to Write. We shift a 64-bit word by + // 7 bits (max already valid bits in the last byte) and at least 1 bit is + // needed to zero-initialize the bit-stream ahead (i.e. if 7 bits are valid + // and we write 57 bits, then the next write will access a byte that was not + // yet zero-initialized). + static constexpr size_t kMaxBitsPerCall = 56; + +#ifdef DISABLE_ACC_BIT_WRITER + BitWriter() : bits_written_(0) {} +#else + size_t cur_part; + std::vector > bits_streams; + std::vector > nbits_streams; + + BitWriter() : bits_written_(0) { + cur_part = 0; + old_bits_written_=0; + bits_streams.resize(1); + nbits_streams.resize(1); + } +#endif + + // Disallow copying - may lead to bugs. 
+ BitWriter(const BitWriter&) = delete; + BitWriter& operator=(const BitWriter&) = delete; + BitWriter(BitWriter&&) = default; + BitWriter& operator=(BitWriter&&) = default; + +#ifdef DISABLE_ACC_BIT_WRITER + explicit BitWriter(PaddedBytes&& donor) + : bits_written_(donor.size() * kBitsPerByte), + storage_(std::move(donor)) {} +#else + /* Adopts donor's bytes as the initial content; bits_written_ and old_bits_written_ both start at the adopted bit count. */ + explicit BitWriter(PaddedBytes&& donor) + : bits_written_(donor.size() * kBitsPerByte), + storage_(std::move(donor)) { + /* Same part/stream setup as the default constructor, so cur_part is never read unset. */ + cur_part = 0; + bits_streams.resize(1); + nbits_streams.resize(1); + /* donor was moved from in the init list: reuse the count captured there instead of calling donor.size() again. old_bits_written_ holds no value before this line, so it must not be asserted on. */ + old_bits_written_ = bits_written_; + } +#endif + + size_t BitsWritten() const { return bits_written_; } + + Span GetSpan() const { + // Callers must ensure byte alignment to avoid uninitialized bits. + JXL_ASSERT(bits_written_ % kBitsPerByte == 0); + return Span(storage_.data(), bits_written_ / kBitsPerByte); + } + + // Example usage: bytes = std::move(writer).TakeBytes(); Useful for the + // top-level encoder which returns PaddedBytes, not a BitWriter. + // *this must be an rvalue reference and is invalid afterwards. + PaddedBytes&& TakeBytes() && { + // Callers must ensure byte alignment to avoid uninitialized bits. + JXL_ASSERT(bits_written_ % kBitsPerByte == 0); + storage_.resize(bits_written_ / kBitsPerByte); + return std::move(storage_); + } + + // Must be byte-aligned before calling. + void AppendByteAligned(const Span& span); + // NOTE: no allotment needed, the other BitWriters have already been charged. + void AppendByteAligned(const BitWriter& other); + void AppendByteAligned(const std::vector>& others); + void AppendByteAligned(const std::vector& others); + + class Allotment { + public: + // Expands a BitWriter's storage. Must happen before calling Write or + // ZeroPadToByte. Must call ReclaimUnused after writing to reclaim the + // unused storage so that BitWriter memory use remains tightly bounded.
+ Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits); + ~Allotment(); + + size_t MaxBits() const { return max_bits_; } + + // Call after writing a histogram, but before ReclaimUnused. + void FinishedHistogram(BitWriter* JXL_RESTRICT writer); + + size_t HistogramBits() const { + JXL_ASSERT(called_); + return histogram_bits_; + } + + // Do not call directly - use ::ReclaimAndCharge instead, which ensures + // the bits are charged to a layer. + void PrivateReclaim(BitWriter* JXL_RESTRICT writer, + size_t* JXL_RESTRICT used_bits, + size_t* JXL_RESTRICT unused_bits); + + private: + size_t prev_bits_written_; + const size_t max_bits_; + size_t histogram_bits_ = 0; + bool called_ = false; + Allotment* parent_; + }; + + // WARNING: think twice before using this. Concatenating two BitWriters that + // pad to bytes is NOT the same as one contiguous BitWriter. + BitWriter& operator+=(const BitWriter& other); + + // TODO(janwas): remove once all callers use BitWriter + BitWriter& operator+=(const PaddedBytes& other); + + // Writes bits into bytes in increasing addresses, and within a byte + // least-significant-bit first. + // + // The function can write up to 56 bits in one go. +#ifdef DISABLE_ACC_BIT_WRITER + void Write(size_t n_bits, uint64_t bits); +#else + void init(size_t cnt); + void update_part(size_t cnt); + void Write(size_t n_bits, uint64_t bits); + void Finalize(std::vector seq); + void Finalize(); +#endif + + // This should only rarely be used - e.g. when the current location will be + // referenced via byte offset (TOCs point to groups), or byte-aligned reading + // is required for speed. WARNING: this interacts badly with operator+=, + // see above. + void ZeroPadToByte() { + const size_t remainder_bits = + RoundUpBitsToByteMultiple(bits_written_) - bits_written_; + if (remainder_bits == 0) return; + Write(remainder_bits, 0); + JXL_ASSERT(bits_written_ % kBitsPerByte == 0); + } + + // TODO(janwas): remove? 
only called from ANS + /* Rewinds the write position to bit pos0 and clears the bits at or above pos0 in the byte that contains it. */ + void RewindStorage(const size_t pos0) { + JXL_ASSERT(pos0 <= bits_written_); + bits_written_ = pos0; + static const uint8_t kRewindMasks[8] = {0x0, 0x1, 0x3, 0x7, + 0xf, 0x1f, 0x3f, 0x7f}; + storage_[pos0 >> 3] &= kRewindMasks[pos0 & 7]; + } + + private: + size_t bits_written_; + #ifndef DISABLE_ACC_BIT_WRITER + size_t old_bits_written_; + #endif + PaddedBytes storage_; + Allotment* current_allotment_ = nullptr; +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_BIT_WRITER_H_ \ No newline at end of file diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc new file mode 100644 index 0000000000..e253509466 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.cc @@ -0,0 +1,93 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +#include "lib/jxl/enc_butteraugli_comparator.h" + +#include +#include + +#include "lib/jxl/color_management.h" + +namespace jxl { + +JxlButteraugliComparator::JxlButteraugliComparator( + const ButteraugliParams& params) + : params_(params) {} + +Status JxlButteraugliComparator::SetReferenceImage(const ImageBundle& ref) { + const ImageBundle* ref_linear_srgb; + ImageMetadata metadata = *ref.metadata(); + ImageBundle store(&metadata); + if (!TransformIfNeeded(ref, ColorEncoding::LinearSRGB(ref.IsGray()), + /*pool=*/nullptr, &store, &ref_linear_srgb)) { + return false; + } + + comparator_.reset( + new ButteraugliComparator(ref_linear_srgb->color(), params_)); + xsize_ = ref.xsize(); + ysize_ = ref.ysize(); + return true; +} + +/* Diffs actual against the reference set by SetReferenceImage; fails if none is set or the sizes differ. Writes the score and/or the diffmap only when the corresponding pointer is non-null. */ +Status JxlButteraugliComparator::CompareWith(const ImageBundle& actual, + ImageF* diffmap, float* score) { + if (!comparator_) { + return JXL_FAILURE("Must set reference image first"); + } + if (xsize_ != actual.xsize() || ysize_ != actual.ysize()) { + return JXL_FAILURE("Images must have same size"); + } + + const ImageBundle* actual_linear_srgb; + ImageMetadata metadata = *actual.metadata(); + ImageBundle store(&metadata); + if (!TransformIfNeeded(actual, ColorEncoding::LinearSRGB(actual.IsGray()), + /*pool=*/nullptr, &store, &actual_linear_srgb)) { + return false; + } + + ImageF temp_diffmap(xsize_, ysize_); + comparator_->Diffmap(actual_linear_srgb->color(), temp_diffmap); + + if (score != nullptr) { + *score = ButteraugliScoreFromDiffmap(temp_diffmap, &params_); + } + if (diffmap != nullptr) { + diffmap->Swap(temp_diffmap); + } + + return true; +} + +float JxlButteraugliComparator::GoodQualityScore() const { + return ButteraugliFuzzyInverse(1.5); +} + +float JxlButteraugliComparator::BadQualityScore() const { + return ButteraugliFuzzyInverse(0.5); +} + +float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1, + const ButteraugliParams& params, ImageF* distmap, + ThreadPool* pool) { + JxlButteraugliComparator
comparator(params); + return ComputeScore(rgb0, rgb1, &comparator, distmap, pool); +} + +float ButteraugliDistance(const CodecInOut& rgb0, const CodecInOut& rgb1, + const ButteraugliParams& params, ImageF* distmap, + ThreadPool* pool) { + JxlButteraugliComparator comparator(params); + JXL_ASSERT(rgb0.frames.size() == rgb1.frames.size()); + float max_dist = 0.0f; + for (size_t i = 0; i < rgb0.frames.size(); ++i) { + max_dist = std::max(max_dist, ComputeScore(rgb0.frames[i], rgb1.frames[i], + &comparator, distmap, pool)); + } + return max_dist; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.h new file mode 100644 index 0000000000..48a1d8950e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_comparator.h @@ -0,0 +1,56 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_ +#define LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_ + +#include + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/enc_comparator.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +class JxlButteraugliComparator : public Comparator { + public: + explicit JxlButteraugliComparator(const ButteraugliParams& params); + + Status SetReferenceImage(const ImageBundle& ref) override; + + Status CompareWith(const ImageBundle& actual, ImageF* diffmap, + float* score) override; + + float GoodQualityScore() const override; + float BadQualityScore() const override; + + private: + ButteraugliParams params_; + std::unique_ptr comparator_; + size_t xsize_ = 0; + size_t ysize_ = 0; +}; + +// Returns the butteraugli distance between rgb0 and rgb1. +// If distmap is not null, it must be the same size as rgb0 and rgb1. +float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1, + const ButteraugliParams& params, + ImageF* distmap = nullptr, + ThreadPool* pool = nullptr); + +float ButteraugliDistance(const CodecInOut& rgb0, const CodecInOut& rgb1, + const ButteraugliParams& params, + ImageF* distmap = nullptr, + ThreadPool* pool = nullptr); + +} // namespace jxl + +#endif // LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_pnorm.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_pnorm.cc new file mode 100644 index 0000000000..7c3fb9c287 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_pnorm.cc @@ -0,0 +1,212 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_butteraugli_pnorm.h" + +#include +#include + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_butteraugli_pnorm.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Rebind; + +double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params, + double p) { + PROFILER_FUNC; + // In approximate-border mode, skip pixels on the border likely to be affected + // by FastGauss' zero-valued-boundary behavior. The border is less than half + // the largest-diameter kernel (37x37 pixels), and 0 if the image is tiny. + // NOTE: chosen such that it is vector-aligned. + size_t border = (params.approximate_border) ? 8 : 0; + if (distmap.xsize() <= 2 * border || distmap.ysize() <= 2 * border) { + border = 0; + } + + const double onePerPixels = 1.0 / (distmap.ysize() * distmap.xsize()); + if (std::abs(p - 3.0) < 1E-6) { + double sum1[3] = {0.0}; + +// Prefer double if possible, but otherwise use float rather than scalar. +#if HWY_CAP_FLOAT64 + using T = double; + const Rebind df; +#else + using T = float; +#endif + const HWY_FULL(T) d; + constexpr size_t N = MaxLanes(HWY_FULL(T)()); + // Manually aligned storage to avoid asan crash on clang-7 due to + // unaligned spill. 
+ HWY_ALIGN T sum_totals0[N] = {0}; + HWY_ALIGN T sum_totals1[N] = {0}; + HWY_ALIGN T sum_totals2[N] = {0}; + + for (size_t y = border; y < distmap.ysize() - border; ++y) { + const float* JXL_RESTRICT row = distmap.ConstRow(y); + + auto sums0 = Zero(d); + auto sums1 = Zero(d); + auto sums2 = Zero(d); + + size_t x = border; + for (; x + Lanes(d) <= distmap.xsize() - border; x += Lanes(d)) { +#if HWY_CAP_FLOAT64 + const auto d1 = PromoteTo(d, Load(df, row + x)); +#else + const auto d1 = Load(d, row + x); +#endif + const auto d2 = d1 * d1 * d1; + sums0 += d2; + const auto d3 = d2 * d2; + sums1 += d3; + const auto d4 = d3 * d3; + sums2 += d4; + } + + Store(sums0 + Load(d, sum_totals0), d, sum_totals0); + Store(sums1 + Load(d, sum_totals1), d, sum_totals1); + Store(sums2 + Load(d, sum_totals2), d, sum_totals2); + + for (; x < distmap.xsize() - border; ++x) { + const double d1 = row[x]; + double d2 = d1 * d1 * d1; + sum1[0] += d2; + d2 *= d2; + sum1[1] += d2; + d2 *= d2; + sum1[2] += d2; + } + } + double v = 0; + v += pow( + onePerPixels * (sum1[0] + GetLane(SumOfLanes(Load(d, sum_totals0)))), + 1.0 / (p * 1.0)); + v += pow( + onePerPixels * (sum1[1] + GetLane(SumOfLanes(Load(d, sum_totals1)))), + 1.0 / (p * 2.0)); + v += pow( + onePerPixels * (sum1[2] + GetLane(SumOfLanes(Load(d, sum_totals2)))), + 1.0 / (p * 4.0)); + v /= 3.0; + return v; + } else { + static std::atomic once{0}; + if (once.fetch_add(1, std::memory_order_relaxed) == 0) { + JXL_WARNING("WARNING: using slow ComputeDistanceP"); + } + double sum1[3] = {0.0}; + for (size_t y = border; y < distmap.ysize() - border; ++y) { + const float* JXL_RESTRICT row = distmap.ConstRow(y); + for (size_t x = border; x < distmap.xsize() - border; ++x) { + double d2 = std::pow(row[x], p); + sum1[0] += d2; + d2 *= d2; + sum1[1] += d2; + d2 *= d2; + sum1[2] += d2; + } + } + double v = 0; + for (int i = 0; i < 3; ++i) { + v += pow(onePerPixels * (sum1[i]), 1.0 / (p * (1 << i))); + } + v /= 3.0; + return v; + } +} + +// 
TODO(lode): take alpha into account when needed +/* Converts both inputs to sRGB when they are not already, maps the per-pixel difference through yuvmatrix, and combines the per-channel root sums of squares with weights 6/8, 1/8, 1/8. */ +double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2) { + PROFILER_FUNC; + // Convert to sRGB - closer to perception than linear. + const Image3F* srgb1 = &ib1.color(); + Image3F copy1; + if (!ib1.IsSRGB()) { + JXL_CHECK(ib1.CopyTo(Rect(ib1), ColorEncoding::SRGB(ib1.IsGray()), &copy1)); + srgb1 = &copy1; + } + const Image3F* srgb2 = &ib2.color(); + Image3F copy2; + if (!ib2.IsSRGB()) { + JXL_CHECK(ib2.CopyTo(Rect(ib2), ColorEncoding::SRGB(ib2.IsGray()), &copy2)); + srgb2 = &copy2; + } + + JXL_CHECK(SameSize(*srgb1, *srgb2)); + + // TODO(veluca): SIMD. + float yuvmatrix[3][3] = {{0.299, 0.587, 0.114}, + {-0.14713, -0.28886, 0.436}, + {0.615, -0.51499, -0.10001}}; + double sum_of_squares[3] = {}; + for (size_t y = 0; y < srgb1->ysize(); ++y) { + const float* JXL_RESTRICT row1[3]; + const float* JXL_RESTRICT row2[3]; + for (size_t j = 0; j < 3; j++) { + row1[j] = srgb1->ConstPlaneRow(j, y); + row2[j] = srgb2->ConstPlaneRow(j, y); + } + for (size_t x = 0; x < srgb1->xsize(); ++x) { + float cdiff[3] = {}; + // YUV conversion is linear, so we can run it on the difference. + for (size_t j = 0; j < 3; j++) { + cdiff[j] = row1[j][x] - row2[j][x]; + } + float yuvdiff[3] = {}; + for (size_t j = 0; j < 3; j++) { + for (size_t k = 0; k < 3; k++) { + yuvdiff[j] += yuvmatrix[j][k] * cdiff[k]; + } + } + for (size_t j = 0; j < 3; j++) { + sum_of_squares[j] += yuvdiff[j] * yuvdiff[j]; + } + } + } + // Weighted PSNR as in JPEG-XL: chroma counts 1/8. + const float weights[3] = {6.0f / 8, 1.0f / 8, 1.0f / 8}; + // Avoid squaring the weight - 1/64 is too extreme. + double norm = 0; + for (size_t i = 0; i < 3; i++) { + norm += std::sqrt(sum_of_squares[i]) * weights[i]; + } + // This function returns distance *squared*.
+ return norm * norm; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ComputeDistanceP); +double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params, + double p) { + return HWY_DYNAMIC_DISPATCH(ComputeDistanceP)(distmap, params, p); +} + +HWY_EXPORT(ComputeDistance2); +double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2) { + return HWY_DYNAMIC_DISPATCH(ComputeDistance2)(ib1, ib2); +} + +} // namespace jxl +#endif diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_pnorm.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_pnorm.h new file mode 100644 index 0000000000..5579c0adee --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_butteraugli_pnorm.h @@ -0,0 +1,24 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_ +#define LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_ + +#include + +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Computes p-norm given the butteraugli distmap. +double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params, + double p); + +double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2); + +} // namespace jxl + +#endif // LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.cc new file mode 100644 index 0000000000..038a706d02 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.cc @@ -0,0 +1,198 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_cache.h" + +#include +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/enc_frame.h" +#include "lib/jxl/enc_group.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +void InitializePassesEncoder(const Image3F& opsin, ThreadPool* pool, + PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + AuxOut* aux_out) { + PROFILER_FUNC; + + PassesSharedState& JXL_RESTRICT shared = enc_state->shared; + + enc_state->histogram_idx.resize(shared.frame_dim.num_groups); + + enc_state->x_qm_multiplier = + std::pow(1.25f, shared.frame_header.x_qm_scale - 2.0f); + enc_state->b_qm_multiplier = + std::pow(1.25f, shared.frame_header.b_qm_scale - 2.0f); + + if (enc_state->coeffs.size() < shared.frame_header.passes.num_passes) { + enc_state->coeffs.reserve(shared.frame_header.passes.num_passes); + for (size_t i = enc_state->coeffs.size(); + i < shared.frame_header.passes.num_passes; i++) { + // Allocate enough coefficients for each group on every row. 
+ enc_state->coeffs.emplace_back(make_unique>( + kGroupDim * kGroupDim, shared.frame_dim.num_groups)); + } + } + while (enc_state->coeffs.size() > shared.frame_header.passes.num_passes) { + enc_state->coeffs.pop_back(); + } + + Image3F dc(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + RunOnPool( + pool, 0, shared.frame_dim.num_groups, ThreadPool::SkipInit(), + [&](size_t group_idx, size_t _) { + ComputeCoefficients(group_idx, enc_state, opsin, &dc); + }, + "Compute coeffs"); + + if (shared.frame_header.flags & FrameHeader::kUseDcFrame) { + CompressParams cparams = enc_state->cparams; + // Guess a distance that produces good initial results. + cparams.butteraugli_distance = + std::max(kMinButteraugliDistance, + enc_state->cparams.butteraugli_distance * 0.1f); + cparams.dots = Override::kOff; + cparams.noise = Override::kOff; + cparams.patches = Override::kOff; + cparams.gaborish = Override::kOff; + cparams.epf = 0; + cparams.max_error_mode = true; + cparams.resampling = 1; + cparams.ec_resampling = 1; + for (size_t c = 0; c < 3; c++) { + cparams.max_error[c] = shared.quantizer.MulDC()[c]; + } + JXL_ASSERT(cparams.progressive_dc > 0); + cparams.progressive_dc--; + // The DC frame will have alpha=0. Don't erase its contents. + cparams.keep_invisible = Override::kOn; + // No EPF or Gaborish in DC frames. 
+ cparams.epf = 0; + cparams.gaborish = Override::kOff; + // Use kVarDCT in max_error_mode for intermediate progressive DC, + // and kModular for the smallest DC (first in the bitstream) + if (cparams.progressive_dc == 0) { + cparams.modular_mode = true; + cparams.quality_pair.first = cparams.quality_pair.second = + 99.f - enc_state->cparams.butteraugli_distance * 0.2f; + } + ImageBundle ib(&shared.metadata->m); + // This is a lie - dc is in XYB + // (but EncodeFrame will skip RGB->XYB conversion anyway) + ib.SetFromImage( + std::move(dc), + ColorEncoding::LinearSRGB(shared.metadata->m.color_encoding.IsGray())); + if (!ib.metadata()->extra_channel_info.empty()) { + // Add dummy extra channels to the patch image: dc_level frames do not yet + // support extra channels, but the codec expects that the amount of extra + // channels in frames matches that in the metadata of the codestream. + std::vector extra_channels; + extra_channels.reserve(ib.metadata()->extra_channel_info.size()); + for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) { + extra_channels.emplace_back(ib.xsize(), ib.ysize()); + // Must initialize the image with data to not affect blending with + // uninitialized memory. + // TODO(lode): dc_level must copy and use the real extra channels + // instead. + ZeroFillImage(&extra_channels.back()); + } + ib.SetExtraChannels(std::move(extra_channels)); + } + std::unique_ptr state = + jxl::make_unique(); + + auto special_frame = std::unique_ptr(new BitWriter()); + FrameInfo dc_frame_info; + dc_frame_info.frame_type = FrameType::kDCFrame; + dc_frame_info.dc_level = shared.frame_header.dc_level + 1; + dc_frame_info.ib_needs_color_transform = false; + dc_frame_info.save_before_color_transform = true; // Implicitly true + // TODO(lode): the EncodeFrame / DecodeFrame pair here is likely broken in + // case of dc_level >= 3, since EncodeFrame may output multiple frames + // to the bitwriter, while DecodeFrame reads only one. 
+ JXL_CHECK(EncodeFrame(cparams, dc_frame_info, shared.metadata, ib, + state.get(), pool, special_frame.get(), nullptr)); + const Span encoded = special_frame->GetSpan(); + enc_state->special_frames.emplace_back(std::move(special_frame)); + + BitReader br(encoded); + ImageBundle decoded(&shared.metadata->m); + std::unique_ptr dec_state = + jxl::make_unique(); + JXL_CHECK(dec_state->output_encoding_info.Set( + *shared.metadata, + ColorEncoding::LinearSRGB(shared.metadata->m.color_encoding.IsGray()))); + JXL_CHECK(DecodeFrame({}, dec_state.get(), pool, &br, &decoded, + *shared.metadata, /*constraints=*/nullptr)); + // TODO(lode): shared.frame_header.dc_level should be equal to + // dec_state.shared->frame_header.dc_level - 1 here, since above we set + // dc_frame_info.dc_level = shared.frame_header.dc_level + 1, and + // dc_frame_info.dc_level is used by EncodeFrame. However, if EncodeFrame + // outputs multiple frames, this assumption could be wrong. + shared.dc_storage = + CopyImage(dec_state->shared->dc_frames[shared.frame_header.dc_level]); + ZeroFillImage(&shared.quant_dc); + shared.dc = &shared.dc_storage; + JXL_CHECK(br.Close()); + } else { + auto compute_dc_coeffs = [&](int group_index, int /* thread */) { + modular_frame_encoder->AddVarDCTDC( + dc, group_index, + enc_state->cparams.butteraugli_distance >= 2.0f && + enc_state->cparams.speed_tier < SpeedTier::kFalcon, + enc_state); + }; + RunOnPool(pool, 0, shared.frame_dim.num_dc_groups, ThreadPool::SkipInit(), + compute_dc_coeffs, "Compute DC coeffs"); + // TODO(veluca): this is only useful in tests and if inspection is enabled. 
+ if (!(shared.frame_header.flags & FrameHeader::kSkipAdaptiveDCSmoothing)) { + AdaptiveDCSmoothing(shared.quantizer.MulDC(), &shared.dc_storage, pool); + } + } + auto compute_ac_meta = [&](int group_index, int /* thread */) { + modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/false, + enc_state); + }; + RunOnPool(pool, 0, shared.frame_dim.num_dc_groups, ThreadPool::SkipInit(), + compute_ac_meta, "Compute AC Metadata"); + + if (aux_out != nullptr) { + aux_out->InspectImage3F("compressed_image:InitializeFrameEncCache:dc_dec", + shared.dc_storage); + } +} + +void EncCache::InitOnce() { + PROFILER_FUNC; + + if (num_nzeroes.xsize() == 0) { + num_nzeroes = Image3I(kGroupDimInBlocks, kGroupDimInBlocks); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.h new file mode 100644 index 0000000000..4c78893d75 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cache.h @@ -0,0 +1,116 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_CACHE_H_ +#define LIB_JXL_ENC_CACHE_H_ + +#include +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_heuristics.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/passes_state.h" +#include "lib/jxl/progressive_split.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +// Contains encoder state. 
+struct PassesEncoderState { + PassesSharedState shared; + + ImageF initial_quant_field; // Invalid in Falcon mode. + ImageF initial_quant_masking; // Invalid in Falcon mode. + + // Per-pass DCT coefficients for the image. One row per group. + std::vector> coeffs; + + // Raw data for special (reference+DC) frames. + std::vector> special_frames; + + // For splitting into passes. + ProgressiveSplitter progressive_splitter; + + CompressParams cparams; + + struct PassData { + std::vector> ac_tokens; + std::vector context_map; + EntropyEncodingData codes; + }; + + std::vector passes; + std::vector histogram_idx; + + // Coefficient orders that are non-default. + std::vector used_orders; + + // Multiplier to be applied to the quant matrices of the x channel. + float x_qm_multiplier = 1.0f; + float b_qm_multiplier = 1.0f; + + // Heuristics to be used by the encoder. + std::unique_ptr heuristics = + make_unique(); +}; + +// Initialize per-frame information. +class ModularFrameEncoder; + +// XLNX_MODIFY +/*void InitializePassesEncoder(const Image3F& opsin, ThreadPool* pool, + PassesEncoderState* passes_enc_state, + ModularFrameEncoder* modular_frame_encoder, + AuxOut* aux_out);*/ + +void InitializePassesEncoder(const Image3F& opsin, ThreadPool* pool, + PassesEncoderState* passes_enc_state, + ModularFrameEncoder* modular_frame_encoder, + AuxOut* aux_out, + //==========hls interface======== + size_t xsize, size_t ysize, + std::vector>& dctIDT, + std::vector>& dct2x2, + std::vector>& dct4x4, + std::vector>& dct8x8, + std::vector>& dct16x16, + std::vector>& dct32x32, + + std::vector>& dcIDT, + std::vector>& dc2x2, + std::vector>& dc4x4, + std::vector>& dc8x8, + std::vector>& dc16x16, + std::vector>& dc32x32 + //================================ + ); + +// Working area for ComputeCoefficients (per-group!) +struct EncCache { + // Allocates memory when first called, shrinks images to current group size. 
+ void InitOnce(); + + // TokenizeCoefficients + Image3I num_nzeroes; +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_CACHE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.cc new file mode 100644 index 0000000000..e5c3f38991 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.cc @@ -0,0 +1,375 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_chroma_from_luma.h" + +#include +#include + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_chroma_from_luma.cc" +#include +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_transforms-inl.h" +#include "lib/jxl/enc_transforms-inl.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/quantizer.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +static HWY_FULL(float) df; + +struct CFLFunction { + static constexpr float kCoeff = 1.f / 3; + static constexpr float kThres = 100.0f; + static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; + CFLFunction(const float* values_m, const float* values_s, size_t num, + float base, float distance_mul) + : values_m(values_m), + values_s(values_s), + num(num), + base(base), + distance_mul(distance_mul) {} + + // Returns f'(x), where f is 1/3 * sum ((|color residual| + 1)^2-1) + + // distance_mul * x^2 * num. 
+ float Compute(float x, float eps, float* fpeps, float* fmeps) const { + float first_derivative = 2 * distance_mul * num * x; + float first_derivative_peps = 2 * distance_mul * num * (x + eps); + float first_derivative_meps = 2 * distance_mul * num * (x - eps); + + const auto inv_color_factor = Set(df, kInvColorFactor); + const auto thres = Set(df, kThres); + const auto coeffx2 = Set(df, kCoeff * 2.0f); + const auto one = Set(df, 1.0f); + const auto zero = Set(df, 0.0f); + const auto base_v = Set(df, base); + const auto x_v = Set(df, x); + const auto xpe_v = Set(df, x + eps); + const auto xme_v = Set(df, x - eps); + auto fd_v = Zero(df); + auto fdpe_v = Zero(df); + auto fdme_v = Zero(df); + JXL_ASSERT(num % Lanes(df) == 0); + + for (size_t i = 0; i < num; i += Lanes(df)) { + // color residual = ax + b + const auto a = inv_color_factor * Load(df, values_m + i); + const auto b = base_v * Load(df, values_m + i) - Load(df, values_s + i); + const auto v = a * x_v + b; + const auto vpe = a * xpe_v + b; + const auto vme = a * xme_v + b; + const auto av = Abs(v); + const auto avpe = Abs(vpe); + const auto avme = Abs(vme); + auto d = coeffx2 * (av + one) * a; + auto dpe = coeffx2 * (avpe + one) * a; + auto dme = coeffx2 * (avme + one) * a; + d = IfThenElse(v < zero, zero - d, d); + dpe = IfThenElse(vpe < zero, zero - dpe, dpe); + dme = IfThenElse(vme < zero, zero - dme, dme); + fd_v += IfThenElse(av >= thres, zero, d); + fdpe_v += IfThenElse(av >= thres, zero, dpe); + fdme_v += IfThenElse(av >= thres, zero, dme); + } + + *fpeps = first_derivative_peps + GetLane(SumOfLanes(fdpe_v)); + *fmeps = first_derivative_meps + GetLane(SumOfLanes(fdme_v)); + return first_derivative + GetLane(SumOfLanes(fd_v)); + } + + const float* JXL_RESTRICT values_m; + const float* JXL_RESTRICT values_s; + size_t num; + float base; + float distance_mul; +}; + +int32_t FindBestMultiplier(const float* values_m, const float* values_s, + size_t num, float base, float distance_mul, + bool fast) { + if 
(num == 0) { + return 0; + } + float x; + if (fast) { + static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; + auto ca = Zero(df); + auto cb = Zero(df); + const auto inv_color_factor = Set(df, kInvColorFactor); + const auto base_v = Set(df, base); + for (size_t i = 0; i < num; i += Lanes(df)) { + // color residual = ax + b + const auto a = inv_color_factor * Load(df, values_m + i); + const auto b = base_v * Load(df, values_m + i) - Load(df, values_s + i); + ca = MulAdd(a, a, ca); + cb = MulAdd(a, b, cb); + } + // + distance_mul * x^2 * num + x = -GetLane(SumOfLanes(cb)) / + (GetLane(SumOfLanes(ca)) + num * distance_mul * 0.5f); + } else { + constexpr float eps = 1; + constexpr float kClamp = 20.0f; + CFLFunction fn(values_m, values_s, num, base, distance_mul); + x = 0; + // Up to 20 Newton iterations, with approximate derivatives. + // Derivatives are approximate due to the high amount of noise in the exact + // derivatives. + for (size_t i = 0; i < 20; i++) { + float dfpeps, dfmeps; + float df = fn.Compute(x, eps, &dfpeps, &dfmeps); + float ddf = (dfpeps - dfmeps) / (2 * eps); + float step = df / ddf; + x -= std::min(kClamp, std::max(-kClamp, step)); + if (std::abs(step) < 3e-3) break; + } + } + return std::max(-128.0f, std::min(127.0f, roundf(x))); +} + +void InitDCStorage(size_t num_blocks, ImageF* dc_values) { + // First row: Y channel + // Second row: X channel + // Third row: Y channel + // Fourth row: B channel + *dc_values = ImageF(RoundUpTo(num_blocks, Lanes(df)), 4); + + JXL_ASSERT(dc_values->xsize() != 0); + // Zero-fill the last lanes + for (size_t y = 0; y < 4; y++) { + for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize(); + x++) { + dc_values->Row(y)[x] = 0; + } + } +} + +void ComputeDC(const ImageF& dc_values, bool fast, int* dc_x, int* dc_b) { + constexpr float kDistanceMultiplierDC = 1e-5f; + const float* JXL_RESTRICT dc_values_yx = dc_values.Row(0); + const float* JXL_RESTRICT dc_values_x = dc_values.Row(1); + 
const float* JXL_RESTRICT dc_values_yb = dc_values.Row(2); + const float* JXL_RESTRICT dc_values_b = dc_values.Row(3); + *dc_x = FindBestMultiplier(dc_values_yx, dc_values_x, dc_values.xsize(), 0.0f, + kDistanceMultiplierDC, fast); + *dc_b = FindBestMultiplier(dc_values_yb, dc_values_b, dc_values.xsize(), + kYToBRatio, kDistanceMultiplierDC, fast); +} + +void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, const Quantizer* quantizer, + const Rect& r, bool fast, bool use_dct8, ImageSB* map_x, + ImageSB* map_b, ImageF* dc_values, float* mem) { + static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks, + "Invalid color tile dim"); + size_t xsize_blocks = opsin.xsize() / kBlockDim; + constexpr float kDistanceMultiplierAC = 1e-3f; + + const size_t y0 = r.y0(); + const size_t x0 = r.x0(); + const size_t x1 = r.x0() + r.xsize(); + const size_t y1 = r.y0() + r.ysize(); + + int ty = y0 / kColorTileDimInBlocks; + int tx = x0 / kColorTileDimInBlocks; + + int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty); + int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty); + + float* JXL_RESTRICT dc_values_yx = dc_values->Row(0); + float* JXL_RESTRICT dc_values_x = dc_values->Row(1); + float* JXL_RESTRICT dc_values_yb = dc_values->Row(2); + float* JXL_RESTRICT dc_values_b = dc_values->Row(3); + + // All are aligned. 
+ float* HWY_RESTRICT block_y = mem; + float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT coeffs_yx = block_b + AcStrategy::kMaxCoeffArea; + float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim; + float* HWY_RESTRICT scratch_space = coeffs_b + kColorTileDim * kColorTileDim; + JXL_DASSERT(scratch_space + 2 * AcStrategy::kMaxCoeffArea == + block_y + CfLHeuristics::kItemsPerThread); + + // Small (~256 bytes each) + HWY_ALIGN_MAX float + dc_y[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + HWY_ALIGN_MAX float + dc_x[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + HWY_ALIGN_MAX float + dc_b[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {}; + size_t num_ac = 0; + + for (size_t y = y0; y < y1; ++y) { + const float* JXL_RESTRICT row_y = opsin.ConstPlaneRow(1, y * kBlockDim); + const float* JXL_RESTRICT row_x = opsin.ConstPlaneRow(0, y * kBlockDim); + const float* JXL_RESTRICT row_b = opsin.ConstPlaneRow(2, y * kBlockDim); + size_t stride = opsin.PixelsPerRow(); + + for (size_t x = x0; x < x1; x++) { + AcStrategy acs = use_dct8 + ? 
AcStrategy::FromRawStrategy(AcStrategy::Type::DCT) + : ac_strategy->ConstRow(y)[x]; + if (!acs.IsFirstBlock()) continue; + size_t xs = acs.covered_blocks_x(); + TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride, + block_y, scratch_space); + DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs); + TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride, + block_x, scratch_space); + DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs); + TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride, + block_b, scratch_space); + DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs); + const float* const JXL_RESTRICT qm_x = + dequant.InvMatrix(acs.Strategy(), 0); + const float* const JXL_RESTRICT qm_b = + dequant.InvMatrix(acs.Strategy(), 2); + // Why does a constant seem to work better than + // raw_quant_field->Row(y)[x] ? + float q = use_dct8 ? 1 : quantizer->Scale() * 400.0f; + float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0); + float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2); + + // Copy DCs in dc_values. + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < xs; ix++) { + dc_values_yx[(iy + y) * xsize_blocks + ix + x] = + dc_y[iy * xs + ix] * q_dc_x; + dc_values_x[(iy + y) * xsize_blocks + ix + x] = + dc_x[iy * xs + ix] * q_dc_x; + dc_values_yb[(iy + y) * xsize_blocks + ix + x] = + dc_y[iy * xs + ix] * q_dc_b; + dc_values_b[(iy + y) * xsize_blocks + ix + x] = + dc_b[iy * xs + ix] * q_dc_b; + } + } + + // Do not use this block for computing AC CfL. + if (acs.covered_blocks_x() + x0 > x1 || + acs.covered_blocks_y() + y0 > y1) { + continue; + } + + // Copy AC coefficients in the local block. The order in which + // coefficients get stored does not matter. + size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + CoefficientLayout(&cy, &cx); + // Zero out LFs. 
This introduces terms in the optimization loop that + // don't affect the result, as they are all 0, but allow for simpler + // SIMDfication. + for (size_t iy = 0; iy < cy; iy++) { + for (size_t ix = 0; ix < cx; ix++) { + block_y[cx * kBlockDim * iy + ix] = 0; + block_x[cx * kBlockDim * iy + ix] = 0; + block_b[cx * kBlockDim * iy + ix] = 0; + } + } + const auto qv = Set(df, q); + for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) { + const auto b_y = Load(df, block_y + i); + const auto b_x = Load(df, block_x + i); + const auto b_b = Load(df, block_b + i); + const auto qqm_x = qv * Load(df, qm_x + i); + const auto qqm_b = qv * Load(df, qm_b + i); + Store(b_y * qqm_x, df, coeffs_yx + num_ac); + Store(b_x * qqm_x, df, coeffs_x + num_ac); + Store(b_y * qqm_b, df, coeffs_yb + num_ac); + Store(b_b * qqm_b, df, coeffs_b + num_ac); + num_ac += Lanes(df); + } + } + } + JXL_CHECK(num_ac % Lanes(df) == 0); + row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f, + kDistanceMultiplierAC, fast); + row_out_b[tx] = FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, kYToBRatio, + kDistanceMultiplierAC, fast); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(InitDCStorage); +HWY_EXPORT(ComputeDC); +HWY_EXPORT(ComputeTile); + +void CfLHeuristics::Init(const Image3F& opsin) { + size_t xsize_blocks = opsin.xsize() / kBlockDim; + size_t ysize_blocks = opsin.ysize() / kBlockDim; + HWY_DYNAMIC_DISPATCH(InitDCStorage) + (xsize_blocks * ysize_blocks, &dc_values); +} + +void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin, + const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, + const Quantizer* quantizer, bool fast, + size_t thread, ColorCorrelationMap* cmap) { + bool use_dct8 = ac_strategy == nullptr; + HWY_DYNAMIC_DISPATCH(ComputeTile) + (opsin, dequant, ac_strategy, quantizer, r, fast, use_dct8, 
&cmap->ytox_map, + &cmap->ytob_map, &dc_values, mem.get() + thread * kItemsPerThread); +} + +void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) { + int32_t ytob_dc = 0; + int32_t ytox_dc = 0; + HWY_DYNAMIC_DISPATCH(ComputeDC)(dc_values, fast, &ytox_dc, &ytob_dc); + cmap->SetYToBDC(ytob_dc); + cmap->SetYToXDC(ytox_dc); +} + +void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer, + size_t layer, AuxOut* aux_out) { + float color_factor = map->GetColorFactor(); + float base_correlation_x = map->GetBaseCorrelationX(); + float base_correlation_b = map->GetBaseCorrelationB(); + int32_t ytox_dc = map->GetYToXDC(); + int32_t ytob_dc = map->GetYToBDC(); + + BitWriter::Allotment allotment(writer, 1 + 2 * kBitsPerByte + 12 + 32); + if (ytox_dc == 0 && ytob_dc == 0 && color_factor == kDefaultColorFactor && + base_correlation_x == 0.0f && base_correlation_b == kYToBRatio) { + writer->Write(1, 1); + ReclaimAndCharge(writer, &allotment, layer, aux_out); + return; + } + writer->Write(1, 0); + JXL_CHECK(U32Coder::Write(kColorFactorDist, color_factor, writer)); + JXL_CHECK(F16Coder::Write(base_correlation_x, writer)); + JXL_CHECK(F16Coder::Write(base_correlation_b, writer)); + writer->Write(kBitsPerByte, ytox_dc - std::numeric_limits::min()); + writer->Write(kBitsPerByte, ytob_dc - std::numeric_limits::min()); + ReclaimAndCharge(writer, &allotment, layer, aux_out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.h new file mode 100644 index 0000000000..a097774030 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_chroma_from_luma.h @@ -0,0 +1,67 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ +#define LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ + +// Chroma-from-luma, computed using heuristics to determine the best linear +// model for the X and B channels from the Y channel. + +#include +#include + +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer, + size_t layer, AuxOut* aux_out); + +struct CfLHeuristics { + void Init(const Image3F& opsin); + + void PrepareForThreads(size_t num_threads) { + mem = hwy::AllocateAligned(num_threads * kItemsPerThread); + } + + void ComputeTile(const Rect& r, const Image3F& opsin, + const DequantMatrices& dequant, + const AcStrategyImage* ac_strategy, + const Quantizer* quantizer, bool fast, size_t thread, + ColorCorrelationMap* cmap); + + void ComputeDC(bool fast, ColorCorrelationMap* cmap); + + ImageF dc_values; + hwy::AlignedFreeUniquePtr mem; + + // Working set is too large for stack; allocate dynamically. 
+ constexpr static size_t kItemsPerThread = + AcStrategy::kMaxCoeffArea * 3 // Blocks + + kColorTileDim * kColorTileDim * 4 // AC coeff storage + + AcStrategy::kMaxCoeffArea * 2; // Scratch space +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_CHROMA_FROM_LUMA_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc new file mode 100644 index 0000000000..1f12a29881 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.cc @@ -0,0 +1,310 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_cluster.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_cluster.cc" +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/fast_math-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +template +V Entropy(V count, V inv_total, V total) { + const HWY_CAPPED(float, Histogram::kRounding) d; + const auto zero = Set(d, 0.0f); + return IfThenZeroElse(count == total, + zero - count * FastLog2f(d, inv_total * count)); +} + +void HistogramEntropy(const Histogram& a) { + a.entropy_ = 0.0f; + if (a.total_count_ == 0) return; + + const HWY_CAPPED(float, Histogram::kRounding) df; + const HWY_CAPPED(int32_t, Histogram::kRounding) di; + + const auto inv_tot = Set(df, 1.0f / a.total_count_); + auto entropy_lanes = Zero(df); + auto total = Set(df, a.total_count_); + + for (size_t i = 0; i < a.data_.size(); i += Lanes(di)) { + const auto counts = LoadU(di, &a.data_[i]); + entropy_lanes += Entropy(ConvertTo(df, counts), inv_tot, total); + } + a.entropy_ += GetLane(SumOfLanes(entropy_lanes)); +} + +float HistogramDistance(const Histogram& a, const Histogram& b) { + 
if (a.total_count_ == 0 || b.total_count_ == 0) return 0; + + const HWY_CAPPED(float, Histogram::kRounding) df; + const HWY_CAPPED(int32_t, Histogram::kRounding) di; + + const auto inv_tot = Set(df, 1.0f / (a.total_count_ + b.total_count_)); + auto distance_lanes = Zero(df); + auto total = Set(df, a.total_count_ + b.total_count_); + + for (size_t i = 0; i < std::max(a.data_.size(), b.data_.size()); + i += Lanes(di)) { + const auto a_counts = + a.data_.size() > i ? LoadU(di, &a.data_[i]) : Zero(di); + const auto b_counts = + b.data_.size() > i ? LoadU(di, &b.data_[i]) : Zero(di); + const auto counts = ConvertTo(df, a_counts + b_counts); + distance_lanes += Entropy(counts, inv_tot, total); + } + const float total_distance = GetLane(SumOfLanes(distance_lanes)); + return total_distance - a.entropy_ - b.entropy_; +} + +// First step of a k-means clustering with a fancy distance metric. +void FastClusterHistograms(const std::vector& in, + const size_t num_contexts_in, size_t max_histograms, + float min_distance, std::vector* out, + std::vector* histogram_symbols) { + PROFILER_FUNC; + size_t largest_idx = 0; + std::vector nonempty_histograms; + nonempty_histograms.reserve(in.size()); + for (size_t i = 0; i < num_contexts_in; i++) { + if (in[i].total_count_ == 0) continue; + HistogramEntropy(in[i]); + if (in[i].total_count_ > in[largest_idx].total_count_) { + largest_idx = i; + } + nonempty_histograms.push_back(i); + } + // No symbols. 
+ if (nonempty_histograms.empty()) { + out->resize(1); + histogram_symbols->clear(); + histogram_symbols->resize(in.size(), 0); + return; + } + largest_idx = std::find(nonempty_histograms.begin(), + nonempty_histograms.end(), largest_idx) - + nonempty_histograms.begin(); + size_t num_contexts = nonempty_histograms.size(); + out->clear(); + out->reserve(max_histograms); + std::vector dists(num_contexts, std::numeric_limits::max()); + histogram_symbols->clear(); + histogram_symbols->resize(in.size(), max_histograms); + + while (out->size() < max_histograms && out->size() < num_contexts) { + (*histogram_symbols)[nonempty_histograms[largest_idx]] = out->size(); + out->push_back(in[nonempty_histograms[largest_idx]]); + largest_idx = 0; + for (size_t i = 0; i < num_contexts; i++) { + dists[i] = std::min( + HistogramDistance(in[nonempty_histograms[i]], out->back()), dists[i]); + // Avoid repeating histograms + if ((*histogram_symbols)[nonempty_histograms[i]] != max_histograms) { + continue; + } + if (dists[i] > dists[largest_idx]) largest_idx = i; + } + if (dists[largest_idx] < min_distance) break; + } + + for (size_t i = 0; i < num_contexts_in; i++) { + if ((*histogram_symbols)[i] != max_histograms) continue; + if (in[i].total_count_ == 0) { + (*histogram_symbols)[i] = 0; + continue; + } + size_t best = 0; + float best_dist = HistogramDistance(in[i], (*out)[best]); + for (size_t j = 1; j < out->size(); j++) { + float dist = HistogramDistance(in[i], (*out)[j]); + if (dist < best_dist) { + best = j; + best_dist = dist; + } + } + (*out)[best].AddHistogram(in[i]); + HistogramEntropy((*out)[best]); + (*histogram_symbols)[i] = best; + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(FastClusterHistograms); // Local function +HWY_EXPORT(HistogramEntropy); // Local function + +float Histogram::ShannonEntropy() const { + 
HWY_DYNAMIC_DISPATCH(HistogramEntropy)(*this); + return entropy_; +} + +namespace { +// ----------------------------------------------------------------------------- +// Histogram refinement + +// Reorder histograms in *out so that the new symbols in *symbols come in +// increasing order. +void HistogramReindex(std::vector* out, + std::vector* symbols) { + std::vector tmp(*out); + std::map new_index; + int next_index = 0; + for (uint32_t symbol : *symbols) { + if (new_index.find(symbol) == new_index.end()) { + new_index[symbol] = next_index; + (*out)[next_index] = tmp[symbol]; + ++next_index; + } + } + out->resize(next_index); + for (uint32_t& symbol : *symbols) { + symbol = new_index[symbol]; + } +} + +} // namespace + +// Clusters similar histograms in 'in' together, the selected histograms are +// placed in 'out', and for each index in 'in', *histogram_symbols will +// indicate which of the 'out' histograms is the best approximation. +void ClusterHistograms(const HistogramParams params, + const std::vector& in, + const size_t num_contexts, size_t max_histograms, + std::vector* out, + std::vector* histogram_symbols) { + constexpr float kMinDistanceForDistinctFast = 64.0f; + constexpr float kMinDistanceForDistinctBest = 16.0f; + max_histograms = std::min(max_histograms, params.max_histograms); + if (params.clustering == HistogramParams::ClusteringType::kFastest) { + HWY_DYNAMIC_DISPATCH(FastClusterHistograms) + (in, num_contexts, 4, kMinDistanceForDistinctFast, out, histogram_symbols); + } else if (params.clustering == HistogramParams::ClusteringType::kFast) { + HWY_DYNAMIC_DISPATCH(FastClusterHistograms) + (in, num_contexts, max_histograms, kMinDistanceForDistinctFast, out, + histogram_symbols); + } else { + PROFILER_FUNC; + HWY_DYNAMIC_DISPATCH(FastClusterHistograms) + (in, num_contexts, max_histograms, kMinDistanceForDistinctBest, out, + histogram_symbols); + for (size_t i = 0; i < out->size(); i++) { + (*out)[i].entropy_ = + 
ANSPopulationCost((*out)[i].data_.data(), (*out)[i].data_.size()); + } + uint32_t next_version = 2; + std::vector version(out->size(), 1); + std::vector renumbering(out->size()); + std::iota(renumbering.begin(), renumbering.end(), 0); + + // Try to pair up clusters if doing so reduces the total cost. + + struct HistogramPair { + // validity of a pair: p.version == max(version[i], version[j]) + float cost; + uint32_t first; + uint32_t second; + uint32_t version; + // We use > because priority queues sort in *decreasing* order, but we + // want lower cost elements to appear first. + bool operator<(const HistogramPair& other) const { + return std::make_tuple(cost, first, second, version) > + std::make_tuple(other.cost, other.first, other.second, + other.version); + } + }; + + // Create list of all pairs by increasing merging cost. + std::priority_queue pairs_to_merge; + for (uint32_t i = 0; i < out->size(); i++) { + for (uint32_t j = i + 1; j < out->size(); j++) { + Histogram histo; + histo.AddHistogram((*out)[i]); + histo.AddHistogram((*out)[j]); + float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) - + (*out)[i].entropy_ - (*out)[j].entropy_; + // Avoid enqueueing pairs that are not advantageous to merge. + if (cost >= 0) continue; + pairs_to_merge.push( + HistogramPair{cost, i, j, std::max(version[i], version[j])}); + } + } + + // Merge the best pair to merge, add new pairs that get formed as a + // consequence. 
+ while (!pairs_to_merge.empty()) { + uint32_t first = pairs_to_merge.top().first; + uint32_t second = pairs_to_merge.top().second; + uint32_t ver = pairs_to_merge.top().version; + pairs_to_merge.pop(); + if (ver != std::max(version[first], version[second]) || + version[first] == 0 || version[second] == 0) { + continue; + } + (*out)[first].AddHistogram((*out)[second]); + (*out)[first].entropy_ = ANSPopulationCost((*out)[first].data_.data(), + (*out)[first].data_.size()); + for (size_t i = 0; i < renumbering.size(); i++) { + if (renumbering[i] == second) { + renumbering[i] = first; + } + } + version[second] = 0; + version[first] = next_version++; + for (uint32_t j = 0; j < out->size(); j++) { + if (j == first) continue; + if (version[j] == 0) continue; + Histogram histo; + histo.AddHistogram((*out)[first]); + histo.AddHistogram((*out)[j]); + float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) - + (*out)[first].entropy_ - (*out)[j].entropy_; + // Avoid enqueueing pairs that are not advantageous to merge. + if (cost >= 0) continue; + pairs_to_merge.push( + HistogramPair{cost, std::min(first, j), std::max(first, j), + std::max(version[first], version[j])}); + } + } + std::vector reverse_renumbering(out->size(), -1); + size_t num_alive = 0; + for (size_t i = 0; i < out->size(); i++) { + if (version[i] == 0) continue; + (*out)[num_alive++] = (*out)[i]; + reverse_renumbering[i] = num_alive - 1; + } + out->resize(num_alive); + for (size_t i = 0; i < histogram_symbols->size(); i++) { + (*histogram_symbols)[i] = + reverse_renumbering[renumbering[(*histogram_symbols)[i]]]; + } + } + + // Convert the context map to a canonical form. 
+ HistogramReindex(out, histogram_symbols); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.h new file mode 100644 index 0000000000..622a567950 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_cluster.h @@ -0,0 +1,61 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Functions for clustering similar histograms together. + +#ifndef LIB_JXL_ENC_CLUSTER_H_ +#define LIB_JXL_ENC_CLUSTER_H_ + +#include +#include +#include + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/enc_ans.h" + +namespace jxl { + +struct Histogram { + Histogram() { total_count_ = 0; } + void Clear() { + data_.clear(); + total_count_ = 0; + } + void Add(size_t symbol) { + if (data_.size() <= symbol) { + data_.resize(DivCeil(symbol + 1, kRounding) * kRounding); + } + ++data_[symbol]; + ++total_count_; + } + void AddHistogram(const Histogram& other) { + if (other.data_.size() > data_.size()) { + data_.resize(other.data_.size()); + } + for (size_t i = 0; i < other.data_.size(); ++i) { + data_[i] += other.data_[i]; + } + total_count_ += other.total_count_; + } + float PopulationCost() const { + return ANSPopulationCost(data_.data(), data_.size()); + } + float ShannonEntropy() const; + + std::vector data_; + size_t total_count_; + mutable float entropy_; // WARNING: not kept up-to-date. 
+ static constexpr size_t kRounding = 8; +}; + +void ClusterHistograms(HistogramParams params, const std::vector& in, + size_t num_contexts, size_t max_histograms, + std::vector* out, + std::vector* histogram_symbols); +} // namespace jxl + +#endif // LIB_JXL_ENC_CLUSTER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc new file mode 100644 index 0000000000..81315a0787 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.cc @@ -0,0 +1,274 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/lehmer_code.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +uint32_t ComputeUsedOrders(const SpeedTier speed, + const AcStrategyImage& ac_strategy, + const Rect& rect) { + // Use default orders for small images. + if (ac_strategy.xsize() < 5 && ac_strategy.ysize() < 5) return 0; + + // Only uses DCT8 = 0, so bitfield = 1. + if (speed >= SpeedTier::kFalcon) return 1; + + uint32_t ret = 0; + size_t xsize_blocks = rect.xsize(); + size_t ysize_blocks = rect.ysize(); + // TODO(veluca): precompute when doing DCT. 
+ for (size_t by = 0; by < ysize_blocks; ++by) { + AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by); + for (size_t bx = 0; bx < xsize_blocks; ++bx) { + int ord = kStrategyOrder[acs_row[bx].RawStrategy()]; + // Do not customize coefficient orders for blocks bigger than 32x32. + if (ord > 6) { + continue; + } + ret |= 1u << ord; + } + } + return ret; +} + +void ComputeCoeffOrder(SpeedTier speed, const ACImage& acs, + const AcStrategyImage& ac_strategy, + const FrameDimensions& frame_dim, uint32_t& used_orders, + coeff_order_t* JXL_RESTRICT order) { + std::vector num_zeros(kCoeffOrderMaxSize); + // If compressing at high speed and only using 8x8 DCTs, only consider a + // subset of blocks. + double block_fraction = 1.0f; + // TODO(veluca): figure out why sampling blocks if non-8x8s are used makes + // encoding significantly less dense. + if (speed >= SpeedTier::kSquirrel && used_orders == 1) { + block_fraction = 0.5f; + } + // No need to compute number of zero coefficients if all orders are the + // default. + if (used_orders != 0) { + uint64_t threshold = + (std::numeric_limits::max() >> 32) * block_fraction; + uint64_t s[2] = {0x94D049BB133111EBull, 0xBF58476D1CE4E5B9ull}; + // Xorshift128+ adapted from xorshift128+-inl.h + auto use_sample = [&]() { + auto s1 = s[0]; + const auto s0 = s[1]; + const auto bits = s1 + s0; // b, c + s[0] = s0; + s1 ^= s1 << 23; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s[1] = s1; + return (bits >> 32) <= threshold; + }; + + // Count number of zero coefficients, separately for each DCT band. + // TODO(veluca): precompute when doing DCT. 
+ for (size_t group_index = 0; group_index < frame_dim.num_groups; + group_index++) { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks, + kGroupDimInBlocks, kGroupDimInBlocks, + frame_dim.xsize_blocks, frame_dim.ysize_blocks); + ConstACPtr rows[3]; + ACType type = acs.Type(); + for (size_t c = 0; c < 3; c++) { + rows[c] = acs.PlaneRow(c, group_index, 0); + } + size_t ac_offset = 0; + + // TODO(veluca): SIMDfy. + for (size_t by = 0; by < rect.ysize(); ++by) { + AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by); + for (size_t bx = 0; bx < rect.xsize(); ++bx) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + if (!use_sample()) continue; + size_t size = kDCTBlockSize << acs.log2_covered_blocks(); + for (size_t c = 0; c < 3; ++c) { + const size_t order_offset = + CoeffOrderOffset(kStrategyOrder[acs.RawStrategy()], c); + if (type == ACType::k16) { + for (size_t k = 0; k < size; k++) { + bool is_zero = rows[c].ptr16[ac_offset + k] == 0; + num_zeros[order_offset + k] += is_zero ? 1 : 0; + } + } else { + for (size_t k = 0; k < size; k++) { + bool is_zero = rows[c].ptr32[ac_offset + k] == 0; + num_zeros[order_offset + k] += is_zero ? 1 : 0; + } + } + // Ensure LLFs are first in the order. 
+ size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + CoefficientLayout(&cy, &cx); + for (size_t iy = 0; iy < cy; iy++) { + for (size_t ix = 0; ix < cx; ix++) { + num_zeros[order_offset + iy * kBlockDim * cx + ix] = -1; + } + } + } + ac_offset += size; + } + } + } + } + struct PosAndCount { + uint32_t pos; + uint32_t count; + }; + auto mem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + + uint16_t computed = 0; + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y(); + // Ensure natural coefficient order is not permuted if the order is + // not transmitted. + if ((1 << ord) & ~used_orders) { + for (size_t c = 0; c < 3; c++) { + size_t offset = CoeffOrderOffset(ord, c); + JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz); + SetDefaultOrder(AcStrategy::FromRawStrategy(o), &order[offset]); + } + continue; + } + const coeff_order_t* natural_coeff_order = acs.NaturalCoeffOrder(); + + bool is_nondefault = false; + for (uint8_t c = 0; c < 3; c++) { + // Apply zig-zag order. + PosAndCount* pos_and_val = mem.get(); + size_t offset = CoeffOrderOffset(ord, c); + JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz); + float inv_sqrt_sz = 1.0f / std::sqrt(sz); + for (size_t i = 0; i < sz; ++i) { + size_t pos = natural_coeff_order[i]; + pos_and_val[i].pos = pos; + // We don't care for the exact number -> quantize number of zeros, + // to get less permuted order. + pos_and_val[i].count = num_zeros[offset + pos] * inv_sqrt_sz + 0.1f; + } + + // Stable-sort -> elements with same number of zeros will preserve their + // order. 
+ auto comparator = [](const PosAndCount& a, const PosAndCount& b) -> bool { + return a.count < b.count; + }; + std::stable_sort(pos_and_val, pos_and_val + sz, comparator); + + // Grab indices. + for (size_t i = 0; i < sz; ++i) { + order[offset + i] = pos_and_val[i].pos; + is_nondefault |= natural_coeff_order[i] != pos_and_val[i].pos; + } + } + if (!is_nondefault) { + used_orders &= ~(1 << ord); + } + } +} + +namespace { + +void TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip, + size_t size, std::vector* tokens) { + std::vector lehmer(size); + std::vector temp(size + 1); + ComputeLehmerCode(order, temp.data(), size, lehmer.data()); + size_t end = size; + while (end > skip && lehmer[end - 1] == 0) { + --end; + } + tokens->emplace_back(CoeffOrderContext(size), end - skip); + uint32_t last = 0; + for (size_t i = skip; i < end; ++i) { + tokens->emplace_back(CoeffOrderContext(last), lehmer[i]); + last = lehmer[i]; + } +} + +} // namespace + +void EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip, + size_t size, BitWriter* writer, int layer, + AuxOut* aux_out) { + std::vector> tokens(1); + TokenizePermutation(order, skip, size, &tokens[0]); + std::vector context_map; + EntropyEncodingData codes; + BuildAndEncodeHistograms(HistogramParams(), kPermutationContexts, tokens, + &codes, &context_map, writer, layer, aux_out); + WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out); +} + +namespace { +void EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, AcStrategy acs, + std::vector* tokens, coeff_order_t* order_zigzag) { + const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y(); + const size_t size = kDCTBlockSize * llf; + const coeff_order_t* natural_coeff_order_lut = acs.NaturalCoeffOrderLut(); + for (size_t i = 0; i < size; ++i) { + order_zigzag[i] = natural_coeff_order_lut[order[i]]; + } + TokenizePermutation(order_zigzag, llf, size, tokens); +} +} // namespace + +void EncodeCoeffOrders(uint16_t 
used_orders, + const coeff_order_t* JXL_RESTRICT order, + BitWriter* writer, size_t layer, + AuxOut* JXL_RESTRICT aux_out) { + auto mem = hwy::AllocateAligned(AcStrategy::kMaxCoeffArea); + uint16_t computed = 0; + std::vector> tokens(1); + for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { + uint8_t ord = kStrategyOrder[o]; + if (computed & (1 << ord)) continue; + computed |= 1 << ord; + if ((used_orders & (1 << ord)) == 0) continue; + AcStrategy acs = AcStrategy::FromRawStrategy(o); + for (size_t c = 0; c < 3; c++) { + EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, &tokens[0], + mem.get()); + } + } + // Do not write anything if no order is used. + if (used_orders != 0) { + std::vector context_map; + EntropyEncodingData codes; + BuildAndEncodeHistograms(HistogramParams(), kPermutationContexts, tokens, + &codes, &context_map, writer, layer, aux_out); + WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.h new file mode 100644 index 0000000000..5eee746592 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_coeff_order.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_COEFF_ORDER_H_ +#define LIB_JXL_ENC_COEFF_ORDER_H_ + +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_params.h" + +namespace jxl { + +// Orders that are actually used in part of image. `rect` is in block units. +uint32_t ComputeUsedOrders(SpeedTier speed, const AcStrategyImage& ac_strategy, + const Rect& rect); + +// Modify zig-zag order, so that DCT bands with more zeros go later. +// Order of DCT bands with same number of zeros is untouched, so +// permutation will be cheaper to encode. +void ComputeCoeffOrder(SpeedTier speed, const ACImage& acs, + const AcStrategyImage& ac_strategy, + const FrameDimensions& frame_dim, uint32_t& used_orders, + coeff_order_t* JXL_RESTRICT order); + +void EncodeCoeffOrders(uint16_t used_orders, + const coeff_order_t* JXL_RESTRICT order, + BitWriter* writer, size_t layer, + AuxOut* JXL_RESTRICT aux_out); + +// Encoding/decoding of a single permutation. `size`: number of elements in the +// permutation. `skip`: number of elements to skip from the *beginning* of the +// permutation. +void EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip, + size_t size, BitWriter* writer, int layer, + AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_COEFF_ORDER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc new file mode 100644 index 0000000000..ff7dbe557b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.cc @@ -0,0 +1,886 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Defined by build system; this avoids IDE warnings. Must come before +// color_management.h (affects header definitions). +#ifndef JPEGXL_ENABLE_SKCMS +#define JPEGXL_ENABLE_SKCMS 0 +#endif + +#include "lib/jxl/enc_color_management.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_color_management.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/linalg.h" +#include "lib/jxl/transfer_functions-inl.h" +#if JPEGXL_ENABLE_SKCMS +#include "skcms.h" +#else // JPEGXL_ENABLE_SKCMS +#include "lcms2.h" +#include "lcms2_plugin.h" +#endif // JPEGXL_ENABLE_SKCMS + +#define JXL_CMS_VERBOSE 0 + +// Define these only once. We can't use HWY_ONCE here because it is defined as +// 1 only on the last pass. +#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_CC_ +#define LIB_JXL_ENC_COLOR_MANAGEMENT_CC_ + +namespace jxl { +#if JPEGXL_ENABLE_SKCMS +struct ColorSpaceTransform::SkcmsICC { + // Parsed skcms_ICCProfiles retain pointers to the original data. + PaddedBytes icc_src_, icc_dst_; + skcms_ICCProfile profile_src_, profile_dst_; +}; +#endif // JPEGXL_ENABLE_SKCMS +} // namespace jxl + +#endif // LIB_JXL_ENC_COLOR_MANAGEMENT_CC_ + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +#if JXL_CMS_VERBOSE >= 2 +const size_t kX = 0; // pixel index, multiplied by 3 for RGB +#endif + +// xform_src = UndoGammaCompression(buf_src). 
+void BeforeTransform(ColorSpaceTransform* t, const float* buf_src, + float* xform_src) { + switch (t->preprocess_) { + case ExtraTF::kNone: + JXL_DASSERT(false); // unreachable + break; + + case ExtraTF::kPQ: { + // By default, PQ content has an intensity target of 10000, stored + // exactly. + HWY_FULL(float) df; + const auto multiplier = Set(df, t->intensity_target_ == 10000.f + ? 1.0f + : 10000.f / t->intensity_target_); + for (size_t i = 0; i < t->buf_src_.xsize(); i += Lanes(df)) { + const auto val = Load(df, buf_src + i); + const auto result = multiplier * TF_PQ().DisplayFromEncoded(df, val); + Store(result, df, xform_src + i); + } +#if JXL_CMS_VERBOSE >= 2 + printf("pre in %.4f %.4f %.4f undoPQ %.4f %.4f %.4f\n", buf_src[3 * kX], + buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX], + xform_src[3 * kX + 1], xform_src[3 * kX + 2]); +#endif + break; + } + + case ExtraTF::kHLG: + for (size_t i = 0; i < t->buf_src_.xsize(); ++i) { + xform_src[i] = static_cast( + TF_HLG().DisplayFromEncoded(static_cast(buf_src[i]))); + } +#if JXL_CMS_VERBOSE >= 2 + printf("pre in %.4f %.4f %.4f undoHLG %.4f %.4f %.4f\n", buf_src[3 * kX], + buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX], + xform_src[3 * kX + 1], xform_src[3 * kX + 2]); +#endif + break; + + case ExtraTF::kSRGB: + HWY_FULL(float) df; + for (size_t i = 0; i < t->buf_src_.xsize(); i += Lanes(df)) { + const auto val = Load(df, buf_src + i); + const auto result = TF_SRGB().DisplayFromEncoded(val); + Store(result, df, xform_src + i); + } +#if JXL_CMS_VERBOSE >= 2 + printf("pre in %.4f %.4f %.4f undoSRGB %.4f %.4f %.4f\n", buf_src[3 * kX], + buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX], + xform_src[3 * kX + 1], xform_src[3 * kX + 2]); +#endif + break; + } +} + +// Applies gamma compression in-place. 
+void AfterTransform(ColorSpaceTransform* t, float* JXL_RESTRICT buf_dst) { + switch (t->postprocess_) { + case ExtraTF::kNone: + JXL_DASSERT(false); // unreachable + break; + case ExtraTF::kPQ: { + HWY_FULL(float) df; + const auto multiplier = Set(df, t->intensity_target_ == 10000.f + ? 1.0f + : t->intensity_target_ * 1e-4f); + for (size_t i = 0; i < t->buf_dst_.xsize(); i += Lanes(df)) { + const auto val = Load(df, buf_dst + i); + const auto result = TF_PQ().EncodedFromDisplay(df, multiplier * val); + Store(result, df, buf_dst + i); + } +#if JXL_CMS_VERBOSE >= 2 + printf("after PQ enc %.4f %.4f %.4f\n", buf_dst[3 * kX], + buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]); +#endif + break; + } + case ExtraTF::kHLG: + for (size_t i = 0; i < t->buf_dst_.xsize(); ++i) { + buf_dst[i] = static_cast( + TF_HLG().EncodedFromDisplay(static_cast(buf_dst[i]))); + } +#if JXL_CMS_VERBOSE >= 2 + printf("after HLG enc %.4f %.4f %.4f\n", buf_dst[3 * kX], + buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]); +#endif + break; + case ExtraTF::kSRGB: + HWY_FULL(float) df; + for (size_t i = 0; i < t->buf_dst_.xsize(); i += Lanes(df)) { + const auto val = Load(df, buf_dst + i); + const auto result = + TF_SRGB().EncodedFromDisplay(HWY_FULL(float)(), val); + Store(result, df, buf_dst + i); + } +#if JXL_CMS_VERBOSE >= 2 + printf("after SRGB enc %.4f %.4f %.4f\n", buf_dst[3 * kX], + buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]); +#endif + break; + } +} + +void DoColorSpaceTransform(ColorSpaceTransform* t, const size_t thread, + const float* buf_src, float* buf_dst) { + // No lock needed. + + float* xform_src = const_cast(buf_src); // Read-only. + if (t->preprocess_ != ExtraTF::kNone) { + xform_src = t->buf_src_.Row(thread); // Writable buffer. + BeforeTransform(t, buf_src, xform_src); + } + +#if JXL_CMS_VERBOSE >= 2 + // Save inputs for printing before in-place transforms overwrite them. 
+ const float in0 = xform_src[3 * kX + 0]; + const float in1 = xform_src[3 * kX + 1]; + const float in2 = xform_src[3 * kX + 2]; +#endif + + if (t->skip_lcms_) { + if (buf_dst != xform_src) { + memcpy(buf_dst, xform_src, t->buf_dst_.xsize() * sizeof(*buf_dst)); + } // else: in-place, no need to copy + } else { +#if JPEGXL_ENABLE_SKCMS + JXL_CHECK(skcms_Transform( + xform_src, skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque, + &t->skcms_icc_->profile_src_, buf_dst, skcms_PixelFormat_RGB_fff, + skcms_AlphaFormat_Opaque, &t->skcms_icc_->profile_dst_, t->xsize_)); +#else // JPEGXL_ENABLE_SKCMS + cmsDoTransform(t->lcms_transform_, xform_src, buf_dst, + static_cast(t->xsize_)); +#endif // JPEGXL_ENABLE_SKCMS + } +#if JXL_CMS_VERBOSE >= 2 + printf("xform skip%d: %.4f %.4f %.4f (%p) -> (%p) %.4f %.4f %.4f\n", + t->skip_lcms_, in0, in1, in2, xform_src, buf_dst, buf_dst[3 * kX], + buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]); +#endif + + if (t->postprocess_ != ExtraTF::kNone) { + AfterTransform(t, buf_dst); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(DoColorSpaceTransform); +void DoColorSpaceTransform(ColorSpaceTransform* t, size_t thread, + const float* buf_src, float* buf_dst) { + return HWY_DYNAMIC_DISPATCH(DoColorSpaceTransform)(t, thread, buf_src, + buf_dst); +} + +namespace { + +#define JXL_CMS_OLD_VERSION 0 + +// cms functions (even *THR) are not thread-safe, except cmsDoTransform. +// To ensure all functions are covered without frequent lock-taking nor risk of +// recursive lock, we lock in the top-level APIs. 
+static std::mutex& LcmsMutex() { + static std::mutex m; + return m; +} + +#if JPEGXL_ENABLE_SKCMS + +JXL_MUST_USE_RESULT CIExy CIExyFromXYZ(const float XYZ[3]) { + const float factor = 1.f / (XYZ[0] + XYZ[1] + XYZ[2]); + CIExy xy; + xy.x = XYZ[0] * factor; + xy.y = XYZ[1] * factor; + return xy; +} + +#else // JPEGXL_ENABLE_SKCMS +// (LCMS interface requires xyY but we omit the Y for white points/primaries.) + +JXL_MUST_USE_RESULT CIExy CIExyFromxyY(const cmsCIExyY& xyY) { + CIExy xy; + xy.x = xyY.x; + xy.y = xyY.y; + return xy; +} + +JXL_MUST_USE_RESULT CIExy CIExyFromXYZ(const cmsCIEXYZ& XYZ) { + cmsCIExyY xyY; + cmsXYZ2xyY(/*Dest=*/&xyY, /*Source=*/&XYZ); + return CIExyFromxyY(xyY); +} + +JXL_MUST_USE_RESULT cmsCIEXYZ D50_XYZ() { + // Quantized D50 as stored in ICC profiles. + return {0.96420288, 1.0, 0.82490540}; +} + +JXL_MUST_USE_RESULT cmsCIExyY xyYFromCIExy(const CIExy& xy) { + const cmsCIExyY xyY = {xy.x, xy.y, 1.0}; + return xyY; +} + +// RAII + +struct ProfileDeleter { + void operator()(void* p) { cmsCloseProfile(p); } +}; +using Profile = std::unique_ptr; + +struct TransformDeleter { + void operator()(void* p) { cmsDeleteTransform(p); } +}; +using Transform = std::unique_ptr; + +struct CurveDeleter { + void operator()(cmsToneCurve* p) { cmsFreeToneCurve(p); } +}; +using Curve = std::unique_ptr; + +Status CreateProfileXYZ(const cmsContext context, + Profile* JXL_RESTRICT profile) { + profile->reset(cmsCreateXYZProfileTHR(context)); + if (profile->get() == nullptr) return JXL_FAILURE("Failed to create XYZ"); + return true; +} + +#endif // !JPEGXL_ENABLE_SKCMS + +#if JPEGXL_ENABLE_SKCMS +// IMPORTANT: icc must outlive profile. 
+Status DecodeProfile(const PaddedBytes& icc, skcms_ICCProfile* const profile) { + if (!skcms_Parse(icc.data(), icc.size(), profile)) { + return JXL_FAILURE("Failed to parse ICC profile with %zu bytes", + icc.size()); + } + return true; +} +#else // JPEGXL_ENABLE_SKCMS +Status DecodeProfile(const cmsContext context, const PaddedBytes& icc, + Profile* profile) { + profile->reset(cmsOpenProfileFromMemTHR(context, icc.data(), icc.size())); + if (profile->get() == nullptr) { + return JXL_FAILURE("Failed to decode profile"); + } + + return true; +} +#endif // JPEGXL_ENABLE_SKCMS + +#if JPEGXL_ENABLE_SKCMS + +ColorSpace ColorSpaceFromProfile(const skcms_ICCProfile& profile) { + switch (profile.data_color_space) { + case skcms_Signature_RGB: + return ColorSpace::kRGB; + case skcms_Signature_Gray: + return ColorSpace::kGray; + default: + return ColorSpace::kUnknown; + } +} + +// "profile1" is pre-decoded to save time in DetectTransferFunction. +Status ProfileEquivalentToICC(const skcms_ICCProfile& profile1, + const PaddedBytes& icc) { + skcms_ICCProfile profile2; + JXL_RETURN_IF_ERROR(skcms_Parse(icc.data(), icc.size(), &profile2)); + return skcms_ApproximatelyEqualProfiles(&profile1, &profile2); +} + +// vector_out := matmul(matrix, vector_in) +void MatrixProduct(const skcms_Matrix3x3& matrix, const float vector_in[3], + float vector_out[3]) { + for (int i = 0; i < 3; ++i) { + vector_out[i] = 0; + for (int j = 0; j < 3; ++j) { + vector_out[i] += matrix.vals[i][j] * vector_in[j]; + } + } +} + +// Returns white point that was specified when creating the profile. 
+JXL_MUST_USE_RESULT Status UnadaptedWhitePoint(const skcms_ICCProfile& profile, + CIExy* out) { + float media_white_point_XYZ[3]; + if (!skcms_GetWTPT(&profile, media_white_point_XYZ)) { + return JXL_FAILURE("ICC profile does not contain WhitePoint tag"); + } + skcms_Matrix3x3 CHAD; + if (!skcms_GetCHAD(&profile, &CHAD)) { + // If there is no chromatic adaptation matrix, it means that the white point + // is already unadapted. + *out = CIExyFromXYZ(media_white_point_XYZ); + return true; + } + // Otherwise, it has been adapted to the PCS white point using said matrix, + // and the adaptation needs to be undone. + skcms_Matrix3x3 inverse_CHAD; + if (!skcms_Matrix3x3_invert(&CHAD, &inverse_CHAD)) { + return JXL_FAILURE("Non-invertible ChromaticAdaptation matrix"); + } + float unadapted_white_point_XYZ[3]; + MatrixProduct(inverse_CHAD, media_white_point_XYZ, unadapted_white_point_XYZ); + *out = CIExyFromXYZ(unadapted_white_point_XYZ); + return true; +} + +Status IdentifyPrimaries(const skcms_ICCProfile& profile, + const CIExy& wp_unadapted, ColorEncoding* c) { + if (!c->HasPrimaries()) return true; + + skcms_Matrix3x3 CHAD, inverse_CHAD; + if (skcms_GetCHAD(&profile, &CHAD)) { + JXL_RETURN_IF_ERROR(skcms_Matrix3x3_invert(&CHAD, &inverse_CHAD)); + } else { + static constexpr skcms_Matrix3x3 kLMSFromXYZ = { + {{0.8951, 0.2664, -0.1614}, + {-0.7502, 1.7135, 0.0367}, + {0.0389, -0.0685, 1.0296}}}; + static constexpr skcms_Matrix3x3 kXYZFromLMS = { + {{0.9869929, -0.1470543, 0.1599627}, + {0.4323053, 0.5183603, 0.0492912}, + {-0.0085287, 0.0400428, 0.9684867}}}; + static constexpr float kWpD50XYZ[3] = {0.96420288, 1.0, 0.82490540}; + float wp_unadapted_XYZ[3]; + JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(wp_unadapted, wp_unadapted_XYZ)); + float wp_D50_LMS[3], wp_unadapted_LMS[3]; + MatrixProduct(kLMSFromXYZ, kWpD50XYZ, wp_D50_LMS); + MatrixProduct(kLMSFromXYZ, wp_unadapted_XYZ, wp_unadapted_LMS); + inverse_CHAD = {{{wp_unadapted_LMS[0] / wp_D50_LMS[0], 0, 0}, + {0, 
wp_unadapted_LMS[1] / wp_D50_LMS[1], 0}, + {0, 0, wp_unadapted_LMS[2] / wp_D50_LMS[2]}}}; + inverse_CHAD = skcms_Matrix3x3_concat(&kXYZFromLMS, &inverse_CHAD); + inverse_CHAD = skcms_Matrix3x3_concat(&inverse_CHAD, &kLMSFromXYZ); + } + + float XYZ[3]; + PrimariesCIExy primaries; + CIExy* const chromaticities[] = {&primaries.r, &primaries.g, &primaries.b}; + for (int i = 0; i < 3; ++i) { + float RGB[3] = {}; + RGB[i] = 1; + // skcms_Transform reports failure through its return value; in that case + // XYZ may not have been written, so bail out instead of deriving a + // primary from it. + if (!skcms_Transform(RGB, skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque, + &profile, XYZ, skcms_PixelFormat_RGB_fff, + skcms_AlphaFormat_Opaque, skcms_XYZD50_profile(), 1)) { + return JXL_FAILURE("Failed to transform primaries to XYZ"); + } + float unadapted_XYZ[3]; + MatrixProduct(inverse_CHAD, XYZ, unadapted_XYZ); + *chromaticities[i] = CIExyFromXYZ(unadapted_XYZ); + } + return c->SetPrimaries(primaries); +} + +void DetectTransferFunction(const skcms_ICCProfile& profile, + ColorEncoding* JXL_RESTRICT c) { + if (c->tf.SetImplicit()) return; + + for (TransferFunction tf : Values()) { + // Can only create profile from known transfer function. + if (tf == TransferFunction::kUnknown) continue; + + c->tf.SetTransferFunction(tf); + + skcms_ICCProfile profile_test; + PaddedBytes bytes; + if (MaybeCreateProfile(*c, &bytes) && DecodeProfile(bytes, &profile_test) && + skcms_ApproximatelyEqualProfiles(&profile, &profile_test)) { + return; + } + } + + c->tf.SetTransferFunction(TransferFunction::kUnknown); +} + +#else // JPEGXL_ENABLE_SKCMS + +uint32_t Type32(const ColorEncoding& c) { + if (c.IsGray()) return TYPE_GRAY_FLT; + return TYPE_RGB_FLT; +} + +uint32_t Type64(const ColorEncoding& c) { + if (c.IsGray()) return TYPE_GRAY_DBL; + return TYPE_RGB_DBL; +} + +ColorSpace ColorSpaceFromProfile(const Profile& profile) { + switch (cmsGetColorSpace(profile.get())) { + case cmsSigRgbData: + return ColorSpace::kRGB; + case cmsSigGrayData: + return ColorSpace::kGray; + default: + return ColorSpace::kUnknown; + } +} + +// "profile1" is pre-decoded to save time in DetectTransferFunction.
+Status ProfileEquivalentToICC(const cmsContext context, const Profile& profile1, + const PaddedBytes& icc, const ColorEncoding& c) { + const uint32_t type_src = Type64(c); + + Profile profile2; + JXL_RETURN_IF_ERROR(DecodeProfile(context, icc, &profile2)); + + Profile profile_xyz; + JXL_RETURN_IF_ERROR(CreateProfileXYZ(context, &profile_xyz)); + + const uint32_t intent = INTENT_RELATIVE_COLORIMETRIC; + const uint32_t flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_BLACKPOINTCOMPENSATION | + cmsFLAGS_HIGHRESPRECALC; + Transform xform1(cmsCreateTransformTHR(context, profile1.get(), type_src, + profile_xyz.get(), TYPE_XYZ_DBL, + intent, flags)); + Transform xform2(cmsCreateTransformTHR(context, profile2.get(), type_src, + profile_xyz.get(), TYPE_XYZ_DBL, + intent, flags)); + if (xform1 == nullptr || xform2 == nullptr) { + return JXL_FAILURE("Failed to create transform"); + } + + double in[3]; + double out1[3]; + double out2[3]; + + // Uniformly spaced samples from very dark to almost fully bright. + const double init = 1E-3; + const double step = 0.2; + + if (c.IsGray()) { + // Finer sampling and replicate each component. + for (in[0] = init; in[0] < 1.0; in[0] += step / 8) { + cmsDoTransform(xform1.get(), in, out1, 1); + cmsDoTransform(xform2.get(), in, out2, 1); + if (!ApproxEq(out1[0], out2[0], 2E-4)) { + return false; + } + } + } else { + for (in[0] = init; in[0] < 1.0; in[0] += step) { + for (in[1] = init; in[1] < 1.0; in[1] += step) { + for (in[2] = init; in[2] < 1.0; in[2] += step) { + cmsDoTransform(xform1.get(), in, out1, 1); + cmsDoTransform(xform2.get(), in, out2, 1); + for (size_t i = 0; i < 3; ++i) { + if (!ApproxEq(out1[i], out2[i], 2E-4)) { + return false; + } + } + } + } + } + } + + return true; +} + +// Returns white point that was specified when creating the profile. +// NOTE: we can't just use cmsSigMediaWhitePointTag because its interpretation +// differs between ICC versions. 
+JXL_MUST_USE_RESULT cmsCIEXYZ UnadaptedWhitePoint(const cmsContext context, + const Profile& profile, + const ColorEncoding& c) { + cmsCIEXYZ XYZ = {1.0, 1.0, 1.0}; + + Profile profile_xyz; + if (!CreateProfileXYZ(context, &profile_xyz)) return XYZ; + // Array arguments are one per profile. + cmsHPROFILE profiles[2] = {profile.get(), profile_xyz.get()}; + // Leave white point unchanged - that is what we're trying to extract. + cmsUInt32Number intents[2] = {INTENT_ABSOLUTE_COLORIMETRIC, + INTENT_ABSOLUTE_COLORIMETRIC}; + cmsBool black_compensation[2] = {0, 0}; + cmsFloat64Number adaption[2] = {0.0, 0.0}; + // Only transforming a single pixel, so skip expensive optimizations. + cmsUInt32Number flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_HIGHRESPRECALC; + Transform xform(cmsCreateExtendedTransform( + context, 2, profiles, black_compensation, intents, adaption, nullptr, 0, + Type64(c), TYPE_XYZ_DBL, flags)); + if (!xform) return XYZ; // TODO(lode): return error + + // xy are relative, so magnitude does not matter if we ignore output Y. + const cmsFloat64Number in[3] = {1.0, 1.0, 1.0}; + cmsDoTransform(xform.get(), in, &XYZ.X, 1); + return XYZ; +} + +Status IdentifyPrimaries(const Profile& profile, const cmsCIEXYZ& wp_unadapted, + ColorEncoding* c) { + if (!c->HasPrimaries()) return true; + if (ColorSpaceFromProfile(profile) == ColorSpace::kUnknown) return true; + + // These were adapted to the profile illuminant before storing in the profile. + const cmsCIEXYZ* adapted_r = static_cast( + cmsReadTag(profile.get(), cmsSigRedColorantTag)); + const cmsCIEXYZ* adapted_g = static_cast( + cmsReadTag(profile.get(), cmsSigGreenColorantTag)); + const cmsCIEXYZ* adapted_b = static_cast( + cmsReadTag(profile.get(), cmsSigBlueColorantTag)); + if (adapted_r == nullptr || adapted_g == nullptr || adapted_b == nullptr) { + return JXL_FAILURE("Failed to retrieve colorants"); + } + + // TODO(janwas): no longer assume Bradford and D50. + // Undo the chromatic adaptation. 
+ const cmsCIEXYZ d50 = D50_XYZ(); + + cmsCIEXYZ r, g, b; + cmsAdaptToIlluminant(&r, &d50, &wp_unadapted, adapted_r); + cmsAdaptToIlluminant(&g, &d50, &wp_unadapted, adapted_g); + cmsAdaptToIlluminant(&b, &d50, &wp_unadapted, adapted_b); + + const PrimariesCIExy rgb = {CIExyFromXYZ(r), CIExyFromXYZ(g), + CIExyFromXYZ(b)}; + return c->SetPrimaries(rgb); +} + +void DetectTransferFunction(const cmsContext context, const Profile& profile, + ColorEncoding* JXL_RESTRICT c) { + if (c->tf.SetImplicit()) return; + + for (TransferFunction tf : Values()) { + // Can only create profile from known transfer function. + if (tf == TransferFunction::kUnknown) continue; + + c->tf.SetTransferFunction(tf); + + PaddedBytes icc_test; + if (MaybeCreateProfile(*c, &icc_test) && + ProfileEquivalentToICC(context, profile, icc_test, *c)) { + return; + } + } + + c->tf.SetTransferFunction(TransferFunction::kUnknown); +} + +void ErrorHandler(cmsContext context, cmsUInt32Number code, const char* text) { + JXL_WARNING("LCMS error %u: %s", code, text); +} + +// Returns a context for the current thread, creating it if necessary. +cmsContext GetContext() { + static thread_local void* context_; + if (context_ == nullptr) { + context_ = cmsCreateContext(nullptr, nullptr); + JXL_ASSERT(context_ != nullptr); + + cmsSetLogErrorHandlerTHR(static_cast(context_), &ErrorHandler); + } + return static_cast(context_); +} + +#endif // JPEGXL_ENABLE_SKCMS + +} // namespace + +// All functions that call lcms directly (except ColorSpaceTransform::Run) must +// lock LcmsMutex(). + +Status ColorEncoding::SetFieldsFromICC() { + // In case parsing fails, mark the ColorEncoding as invalid. 
+ SetColorSpace(ColorSpace::kUnknown); + tf.SetTransferFunction(TransferFunction::kUnknown); + + if (icc_.empty()) return JXL_FAILURE("Empty ICC profile"); + +#if JPEGXL_ENABLE_SKCMS + if (icc_.size() < 128) { + return JXL_FAILURE("ICC file too small"); + } + + skcms_ICCProfile profile; + JXL_RETURN_IF_ERROR(skcms_Parse(icc_.data(), icc_.size(), &profile)); + + // skcms does not return the rendering intent, so get it from the file. It + // is encoded as big-endian 32-bit integer in bytes 64..67. + uint32_t rendering_intent32 = icc_[67]; + if (rendering_intent32 > 3 || icc_[64] != 0 || icc_[65] != 0 || + icc_[66] != 0) { + return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32); + } + + SetColorSpace(ColorSpaceFromProfile(profile)); + + CIExy wp_unadapted; + JXL_RETURN_IF_ERROR(UnadaptedWhitePoint(profile, &wp_unadapted)); + JXL_RETURN_IF_ERROR(SetWhitePoint(wp_unadapted)); + + // Relies on color_space. + JXL_RETURN_IF_ERROR(IdentifyPrimaries(profile, wp_unadapted, this)); + + // Relies on color_space/white point/primaries being set already. + DetectTransferFunction(profile, this); + // ICC and RenderingIntent have the same values (0..3). + rendering_intent = static_cast(rendering_intent32); +#else // JPEGXL_ENABLE_SKCMS + + std::lock_guard guard(LcmsMutex()); + const cmsContext context = GetContext(); + + Profile profile; + JXL_RETURN_IF_ERROR(DecodeProfile(context, icc_, &profile)); + + const cmsUInt32Number rendering_intent32 = + cmsGetHeaderRenderingIntent(profile.get()); + if (rendering_intent32 > 3) { + return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32); + } + + SetColorSpace(ColorSpaceFromProfile(profile)); + + const cmsCIEXYZ wp_unadapted = UnadaptedWhitePoint(context, profile, *this); + JXL_RETURN_IF_ERROR(SetWhitePoint(CIExyFromXYZ(wp_unadapted))); + + // Relies on color_space. + JXL_RETURN_IF_ERROR(IdentifyPrimaries(profile, wp_unadapted, this)); + + // Relies on color_space/white point/primaries being set already.
+ DetectTransferFunction(context, profile, this); + + // ICC and RenderingIntent have the same values (0..3). + rendering_intent = static_cast(rendering_intent32); +#endif // JPEGXL_ENABLE_SKCMS + + return true; +} + +void ColorEncoding::DecideIfWantICC() { + PaddedBytes icc_new; + bool equivalent; +#if JPEGXL_ENABLE_SKCMS + skcms_ICCProfile profile; + if (!DecodeProfile(ICC(), &profile)) return; + if (!MaybeCreateProfile(*this, &icc_new)) return; + equivalent = ProfileEquivalentToICC(profile, icc_new); +#else // JPEGXL_ENABLE_SKCMS + const cmsContext context = GetContext(); + Profile profile; + if (!DecodeProfile(context, ICC(), &profile)) return; + if (!MaybeCreateProfile(*this, &icc_new)) return; + equivalent = ProfileEquivalentToICC(context, profile, icc_new, *this); +#endif // JPEGXL_ENABLE_SKCMS + + // Successfully created a profile => reconstruction should be equivalent. + JXL_ASSERT(equivalent); + want_icc_ = false; +} + +// Releases the LCMS transform, if Init created one. +ColorSpaceTransform::~ColorSpaceTransform() { +#if !JPEGXL_ENABLE_SKCMS + // Init may never have run, or may have returned before creating the + // transform; there is nothing to free then. + if (lcms_transform_ == nullptr) return; + std::lock_guard guard(LcmsMutex()); + TransformDeleter()(lcms_transform_); +#endif +} + +ColorSpaceTransform::ColorSpaceTransform() +#if JPEGXL_ENABLE_SKCMS + : skcms_icc_(new SkcmsICC()) +#else // JPEGXL_ENABLE_SKCMS + // Start out null so the destructor can tell whether a transform exists. + : lcms_transform_(nullptr) +#endif // JPEGXL_ENABLE_SKCMS +{ +} + +Status ColorSpaceTransform::Init(const ColorEncoding& c_src, + const ColorEncoding& c_dst, + float intensity_target, size_t xsize, + const size_t num_threads) { + std::lock_guard guard(LcmsMutex()); +#if JXL_CMS_VERBOSE + printf("%s -> %s\n", Description(c_src).c_str(), Description(c_dst).c_str()); +#endif + +#if JPEGXL_ENABLE_SKCMS + skcms_icc_->icc_src_ = c_src.ICC(); + skcms_icc_->icc_dst_ = c_dst.ICC(); + JXL_RETURN_IF_ERROR( + DecodeProfile(skcms_icc_->icc_src_, &skcms_icc_->profile_src_)); + JXL_RETURN_IF_ERROR( + DecodeProfile(skcms_icc_->icc_dst_, &skcms_icc_->profile_dst_)); +#else // JPEGXL_ENABLE_SKCMS + const cmsContext context = GetContext(); + Profile profile_src, profile_dst; + JXL_RETURN_IF_ERROR(DecodeProfile(context,
c_src.ICC(), &profile_src)); + JXL_RETURN_IF_ERROR(DecodeProfile(context, c_dst.ICC(), &profile_dst)); +#endif // JPEGXL_ENABLE_SKCMS + + skip_lcms_ = false; + if (c_src.SameColorEncoding(c_dst)) { + skip_lcms_ = true; +#if JXL_CMS_VERBOSE + printf("Skip CMS\n"); +#endif + } + + // Special-case for BT.2100 HLG/PQ and SRGB <=> linear: + // NOTE: test the `intensity_target` argument here; the intensity_target_ + // member is only assigned at the end of Init and is not yet valid. + const bool src_linear = c_src.tf.IsLinear(); + const bool dst_linear = c_dst.tf.IsLinear(); + if (((c_src.tf.IsPQ() || c_src.tf.IsHLG()) && dst_linear) || + ((c_dst.tf.IsPQ() || c_dst.tf.IsHLG()) && src_linear) || + ((c_src.tf.IsPQ() != c_dst.tf.IsPQ()) && intensity_target != 10000) || + (c_src.tf.IsSRGB() && dst_linear) || (c_dst.tf.IsSRGB() && src_linear)) { + // Construct new profiles as if the data were already/still linear. + ColorEncoding c_linear_src = c_src; + ColorEncoding c_linear_dst = c_dst; + c_linear_src.tf.SetTransferFunction(TransferFunction::kLinear); + c_linear_dst.tf.SetTransferFunction(TransferFunction::kLinear); + PaddedBytes icc_src, icc_dst; +#if JPEGXL_ENABLE_SKCMS + skcms_ICCProfile new_src, new_dst; +#else // JPEGXL_ENABLE_SKCMS + Profile new_src, new_dst; +#endif // JPEGXL_ENABLE_SKCMS + // Only enable ExtraTF if profile creation succeeded.
+ if (MaybeCreateProfile(c_linear_src, &icc_src) && + MaybeCreateProfile(c_linear_dst, &icc_dst) && +#if JPEGXL_ENABLE_SKCMS + DecodeProfile(icc_src, &new_src) && DecodeProfile(icc_dst, &new_dst)) { +#else // JPEGXL_ENABLE_SKCMS + DecodeProfile(context, icc_src, &new_src) && + DecodeProfile(context, icc_dst, &new_dst)) { +#endif // JPEGXL_ENABLE_SKCMS + if (c_src.SameColorSpace(c_dst)) { + skip_lcms_ = true; + } +#if JXL_CMS_VERBOSE + printf("Special linear <-> HLG/PQ/sRGB; skip=%d\n", skip_lcms_); +#endif +#if JPEGXL_ENABLE_SKCMS + skcms_icc_->icc_src_ = PaddedBytes(); + skcms_icc_->profile_src_ = new_src; + skcms_icc_->icc_dst_ = PaddedBytes(); + skcms_icc_->profile_dst_ = new_dst; +#else // JPEGXL_ENABLE_SKCMS + profile_src.swap(new_src); + profile_dst.swap(new_dst); +#endif // JPEGXL_ENABLE_SKCMS + if (!c_src.tf.IsLinear()) { + preprocess_ = c_src.tf.IsSRGB() + ? ExtraTF::kSRGB + : (c_src.tf.IsPQ() ? ExtraTF::kPQ : ExtraTF::kHLG); + } + if (!c_dst.tf.IsLinear()) { + postprocess_ = c_dst.tf.IsSRGB() + ? ExtraTF::kSRGB + : (c_dst.tf.IsPQ() ? ExtraTF::kPQ : ExtraTF::kHLG); + } + } else { + JXL_WARNING("Failed to create extra linear profiles"); + } + } + +#if JPEGXL_ENABLE_SKCMS + if (!skcms_MakeUsableAsDestination(&skcms_icc_->profile_dst_)) { + return JXL_FAILURE( + "Failed to make %s usable as a color transform destination", + Description(c_dst).c_str()); + } +#endif // JPEGXL_ENABLE_SKCMS + + // Not including alpha channel (copied separately). + const size_t channels_src = c_src.Channels(); + const size_t channels_dst = c_dst.Channels(); + JXL_CHECK(channels_src == channels_dst); +#if JXL_CMS_VERBOSE + printf("Channels: %zu; Threads: %zu\n", channels_src, num_threads); +#endif + +#if !JPEGXL_ENABLE_SKCMS + // Type includes color space (XYZ vs RGB), so can be different. 
+ const uint32_t type_src = Type32(c_src); + const uint32_t type_dst = Type32(c_dst); + const uint32_t intent = static_cast(c_dst.rendering_intent); + // Use cmsFLAGS_NOCACHE to disable the 1-pixel cache and make calling + // cmsDoTransform() thread-safe. + const uint32_t flags = cmsFLAGS_NOCACHE | cmsFLAGS_BLACKPOINTCOMPENSATION | + cmsFLAGS_HIGHRESPRECALC; + lcms_transform_ = + cmsCreateTransformTHR(context, profile_src.get(), type_src, + profile_dst.get(), type_dst, intent, flags); + if (lcms_transform_ == nullptr) { + return JXL_FAILURE("Failed to create transform"); + } +#endif // !JPEGXL_ENABLE_SKCMS + + // Ideally LCMS would convert directly from External to Image3. However, + // cmsDoTransformLineStride only accepts 32-bit BytesPerPlaneIn, whereas our + // planes can be more than 4 GiB apart. Hence, transform inputs/outputs must + // be interleaved. Calling cmsDoTransform for each pixel is expensive + // (indirect call). We therefore transform rows, which requires per-thread + // buffers. To avoid separate allocations, we use the rows of an image. + // Because LCMS apparently also cannot handle <= 16 bit inputs and 32-bit + // outputs (or vice versa), we use floating point input/output. +#if JPEGXL_ENABLE_SKCMS + // SkiaCMS doesn't support grayscale float buffers, so we create space for RGB + // float buffers anyway. 
+ buf_src_ = ImageF(xsize * 3, num_threads); + buf_dst_ = ImageF(xsize * 3, num_threads); +#else + buf_src_ = ImageF(xsize * channels_src, num_threads); + buf_dst_ = ImageF(xsize * channels_dst, num_threads); +#endif + intensity_target_ = intensity_target; + xsize_ = xsize; + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.h new file mode 100644 index 0000000000..9dbce855bf --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_color_management.h @@ -0,0 +1,70 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_H_ +#define LIB_JXL_ENC_COLOR_MANAGEMENT_H_ + +// ICC profiles and color space conversions. + +#include +#include + +#include + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// Run is thread-safe. +class ColorSpaceTransform { + public: + ColorSpaceTransform(); + ~ColorSpaceTransform(); + + // Cannot copy (transforms_ holds pointers). + ColorSpaceTransform(const ColorSpaceTransform&) = delete; + ColorSpaceTransform& operator=(const ColorSpaceTransform&) = delete; + + // "Constructor"; allocates for up to `num_threads`, or returns false. + // `intensity_target` is used for conversion to and from PQ, which is absolute + // (1 always represents 10000 cd/m²) and thus needs scaling in linear space if + // 1 is to represent another luminance level instead. 
+ Status Init(const ColorEncoding& c_src, const ColorEncoding& c_dst, + float intensity_target, size_t xsize, size_t num_threads); + + float* BufSrc(const size_t thread) { return buf_src_.Row(thread); } + + float* BufDst(const size_t thread) { return buf_dst_.Row(thread); } + +#if JPEGXL_ENABLE_SKCMS + struct SkcmsICC; + std::unique_ptr skcms_icc_; +#else + void* lcms_transform_; +#endif + + ImageF buf_src_; + ImageF buf_dst_; + float intensity_target_; + size_t xsize_; + bool skip_lcms_ = false; + ExtraTF preprocess_ = ExtraTF::kNone; + ExtraTF postprocess_ = ExtraTF::kNone; +}; + +// buf_X can either be from BufX() or caller-allocated, interleaved storage. +// `thread` must be less than the `num_threads` passed to Init. +// `t` is non-const because buf_* may be modified. +void DoColorSpaceTransform(ColorSpaceTransform* t, size_t thread, + const float* buf_src, float* buf_dst); + +} // namespace jxl + +#endif // LIB_JXL_ENC_COLOR_MANAGEMENT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc new file mode 100644 index 0000000000..f5b25f876a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.cc @@ -0,0 +1,140 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_comparator.h" + +#include +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/enc_gamma_correct.h" + +namespace jxl { +namespace { + +// color is linear, but blending happens in gamma-compressed space using +// (gamma-compressed) grayscale background color, alpha image represents +// weights of the sRGB colors in the [0 .. (1 << bit_depth) - 1] interval, +// output image is in linear space. 
+void AlphaBlend(const Image3F& in, const size_t c, float background_linear, + const ImageF& alpha, Image3F* out) { + const float background = LinearToSrgb8Direct(background_linear); + + for (size_t y = 0; y < out->ysize(); ++y) { + const float* JXL_RESTRICT row_a = alpha.ConstRow(y); + const float* JXL_RESTRICT row_i = in.ConstPlaneRow(c, y); + float* JXL_RESTRICT row_o = out->PlaneRow(c, y); + for (size_t x = 0; x < out->xsize(); ++x) { + const float a = row_a[x]; + if (a <= 0.f) { + row_o[x] = background_linear; + } else if (a >= 1.f) { + row_o[x] = row_i[x]; + } else { + const float w_fg = a; + const float w_bg = 1.0f - w_fg; + const float fg = w_fg * LinearToSrgb8Direct(row_i[x]); + const float bg = w_bg * background; + row_o[x] = Srgb8ToLinearDirect(fg + bg); + } + } + } +} + +const Image3F* AlphaBlend(const ImageBundle& ib, const Image3F& linear, + float background_linear, Image3F* copy) { + // No alpha => all opaque. + if (!ib.HasAlpha()) return &linear; + + *copy = Image3F(linear.xsize(), linear.ysize()); + for (size_t c = 0; c < 3; ++c) { + AlphaBlend(linear, c, background_linear, ib.alpha(), copy); + } + return copy; +} + +void AlphaBlend(float background_linear, ImageBundle* io_linear_srgb) { + // No alpha => all opaque. 
+ if (!io_linear_srgb->HasAlpha()) return; + + for (size_t c = 0; c < 3; ++c) { + AlphaBlend(*io_linear_srgb->color(), c, background_linear, + *io_linear_srgb->alpha(), io_linear_srgb->color()); + } +} + +float ComputeScoreImpl(const ImageBundle& rgb0, const ImageBundle& rgb1, + Comparator* comparator, ImageF* distmap) { + JXL_CHECK(comparator->SetReferenceImage(rgb0)); + float score; + JXL_CHECK(comparator->CompareWith(rgb1, distmap, &score)); + return score; +} + +} // namespace + +float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1, + Comparator* comparator, ImageF* diffmap, ThreadPool* pool) { + PROFILER_FUNC; + // Convert to linear sRGB (unless already in that space) + ImageMetadata metadata0 = *rgb0.metadata(); + ImageBundle store0(&metadata0); + const ImageBundle* linear_srgb0; + JXL_CHECK(TransformIfNeeded(rgb0, ColorEncoding::LinearSRGB(rgb0.IsGray()), + pool, &store0, &linear_srgb0)); + ImageMetadata metadata1 = *rgb1.metadata(); + ImageBundle store1(&metadata1); + const ImageBundle* linear_srgb1; + JXL_CHECK(TransformIfNeeded(rgb1, ColorEncoding::LinearSRGB(rgb1.IsGray()), + pool, &store1, &linear_srgb1)); + + // No alpha: skip blending, only need a single call to Butteraugli. 
+ if (!rgb0.HasAlpha() && !rgb1.HasAlpha()) { + return ComputeScoreImpl(*linear_srgb0, *linear_srgb1, comparator, diffmap); + } + + // Blend on black and white backgrounds + + const float black = 0.0f; + ImageBundle blended_black0 = linear_srgb0->Copy(); + ImageBundle blended_black1 = linear_srgb1->Copy(); + AlphaBlend(black, &blended_black0); + AlphaBlend(black, &blended_black1); + + const float white = 1.0f; + ImageBundle blended_white0 = linear_srgb0->Copy(); + ImageBundle blended_white1 = linear_srgb1->Copy(); + + AlphaBlend(white, &blended_white0); + AlphaBlend(white, &blended_white1); + + ImageF diffmap_black, diffmap_white; + const float dist_black = ComputeScoreImpl(blended_black0, blended_black1, + comparator, &diffmap_black); + const float dist_white = ComputeScoreImpl(blended_white0, blended_white1, + comparator, &diffmap_white); + + // diffmap and return values are the max of diffmap_black/white. + if (diffmap != nullptr) { + const size_t xsize = rgb0.xsize(); + const size_t ysize = rgb0.ysize(); + *diffmap = ImageF(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + const float* JXL_RESTRICT row_black = diffmap_black.ConstRow(y); + const float* JXL_RESTRICT row_white = diffmap_white.ConstRow(y); + float* JXL_RESTRICT row_out = diffmap->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = std::max(row_black[x], row_white[x]); + } + } + } + return std::max(dist_black, dist_white); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.h new file mode 100644 index 0000000000..e348a4e8eb --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_comparator.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_COMPARATOR_H_ +#define LIB_JXL_ENC_COMPARATOR_H_ + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +class Comparator { + public: + virtual ~Comparator() = default; + + // Sets the reference image, the first to compare + // Image must be in linear sRGB (gamma expanded) in range 0.0f-1.0f as + // the range from standard black point to standard white point, but values + // outside permitted. + virtual Status SetReferenceImage(const ImageBundle& ref) = 0; + + // Sets the actual image (with loss), the second to compare + // Image must be in linear sRGB (gamma expanded) in range 0.0f-1.0f as + // the range from standard black point to standard white point, but values + // outside permitted. + // In diffmap it outputs the local score per pixel, while in score it outputs + // a single score. Any one may be set to nullptr to not compute it. + virtual Status CompareWith(const ImageBundle& actual, ImageF* diffmap, + float* score) = 0; + + // Quality thresholds for diffmap and score values. + // The good score must represent a value where the images are considered to + // be perceptually indistinguishable (but not identical) + // The bad value must be larger than good to indicate "lower means better" + // and smaller than good to indicate "higher means better" + virtual float GoodQualityScore() const = 0; + virtual float BadQualityScore() const = 0; +}; + +// Computes the score given images in any RGB color model, optionally with +// alpha channel. 
+float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1, + Comparator* comparator, ImageF* diffmap = nullptr, + ThreadPool* pool = nullptr); + +} // namespace jxl + +#endif // LIB_JXL_ENC_COMPARATOR_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc new file mode 100644 index 0000000000..d7ae8e4a6b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.cc @@ -0,0 +1,139 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Library to encode the context map. + +#include "lib/jxl/enc_context_map.h" + +#include + +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/entropy_coder.h" + +namespace jxl { + +namespace { + +size_t IndexOf(const std::vector& v, uint8_t value) { + size_t i = 0; + for (; i < v.size(); ++i) { + if (v[i] == value) return i; + } + return i; +} + +void MoveToFront(std::vector* v, size_t index) { + uint8_t value = (*v)[index]; + for (size_t i = index; i != 0; --i) { + (*v)[i] = (*v)[i - 1]; + } + (*v)[0] = value; +} + +std::vector MoveToFrontTransform(const std::vector& v) { + if (v.empty()) return v; + uint8_t max_value = *std::max_element(v.begin(), v.end()); + std::vector mtf(max_value + 1); + for (size_t i = 0; i <= max_value; ++i) mtf[i] = i; + std::vector result(v.size()); + for (size_t i = 0; i < v.size(); ++i) { + size_t index = IndexOf(mtf, v[i]); + JXL_ASSERT(index < mtf.size()); + result[i] = static_cast(index); + MoveToFront(&mtf, index); + } + return result; +} + +} // namespace + +void EncodeContextMap(const std::vector& context_map, + size_t num_histograms, BitWriter* writer) { + if (num_histograms == 1) { + // Simple code + writer->Write(1, 1); + // 0 bits per entry. 
+ writer->Write(2, 0); + return; + } + + std::vector transformed_symbols = MoveToFrontTransform(context_map); + std::vector> tokens(1), mtf_tokens(1); + EntropyEncodingData codes; + std::vector dummy_context_map; + for (size_t i = 0; i < context_map.size(); i++) { + tokens[0].emplace_back(0, context_map[i]); + } + for (size_t i = 0; i < transformed_symbols.size(); i++) { + mtf_tokens[0].emplace_back(0, transformed_symbols[i]); + } + HistogramParams params; + params.uint_method = HistogramParams::HybridUintMethod::kContextMap; + size_t ans_cost = BuildAndEncodeHistograms( + params, 1, tokens, &codes, &dummy_context_map, nullptr, 0, nullptr); + size_t mtf_cost = BuildAndEncodeHistograms( + params, 1, mtf_tokens, &codes, &dummy_context_map, nullptr, 0, nullptr); + bool use_mtf = mtf_cost < ans_cost; + // Rebuild token list. + tokens[0].clear(); + for (size_t i = 0; i < transformed_symbols.size(); i++) { + tokens[0].emplace_back(0, + use_mtf ? transformed_symbols[i] : context_map[i]); + } + size_t entry_bits = CeilLog2Nonzero(num_histograms); + size_t simple_cost = entry_bits * context_map.size(); + if (entry_bits < 4 && simple_cost < ans_cost && simple_cost < mtf_cost) { + writer->Write(1, 1); + writer->Write(2, entry_bits); + for (size_t i = 0; i < context_map.size(); i++) { + writer->Write(entry_bits, context_map[i]); + } + } else { + writer->Write(1, 0); + writer->Write(1, use_mtf); // Use/don't use MTF. 
+ BuildAndEncodeHistograms(params, 1, tokens, &codes, &dummy_context_map, + writer, 0, nullptr); + WriteTokens(tokens[0], codes, dummy_context_map, writer); + } +} + +void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer, + AuxOut* aux_out) { + auto& dct = block_ctx_map.dc_thresholds; + auto& qft = block_ctx_map.qf_thresholds; + auto& ctx_map = block_ctx_map.ctx_map; + BitWriter::Allotment allotment( + writer, + (dct[0].size() + dct[1].size() + dct[2].size() + qft.size()) * 34 + 1 + + 4 + 4 + ctx_map.size() * 10 + 1024); + if (dct[0].empty() && dct[1].empty() && dct[2].empty() && qft.empty() && + ctx_map.size() == 21 && + std::equal(ctx_map.begin(), ctx_map.end(), jxl::kDefaultCtxMap)) { + writer->Write(1, 1); // default + ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out); + return; + } + writer->Write(1, 0); + for (int j : {0, 1, 2}) { + writer->Write(4, dct[j].size()); + for (int i : dct[j]) { + JXL_CHECK(U32Coder::Write(kDCThresholdDist, PackSigned(i), writer)); + } + } + writer->Write(4, qft.size()); + for (uint32_t i : qft) { + JXL_CHECK(U32Coder::Write(kQFThresholdDist, i - 1, writer)); + } + EncodeContextMap(ctx_map, block_ctx_map.num_ctxs, writer); + ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.h new file mode 100644 index 0000000000..7f6c624380 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_context_map.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_CONTEXT_MAP_H_ +#define LIB_JXL_ENC_CONTEXT_MAP_H_ + +#include +#include + +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +// Max limit is 255 because encoding assumes numbers < 255 +// More clusters can help compression, but makes encode/decode somewhat slower +static const size_t kClustersLimit = 128; + +// Encodes the given context map to the bit stream. The number of different +// histogram ids is given by num_histograms. +void EncodeContextMap(const std::vector& context_map, + size_t num_histograms, BitWriter* writer); + +void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer, + AuxOut* aux_out); +} // namespace jxl + +#endif // LIB_JXL_ENC_CONTEXT_MAP_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc new file mode 100644 index 0000000000..a2285df362 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.cc @@ -0,0 +1,620 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_detect_dots.h" + +#include + +#include +#include +#include +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_detect_dots.cc" +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/linalg.h" +#include "lib/jxl/optimize.h" + +// Set JXL_DEBUG_DOT_DETECT to 1 to enable debugging. 
+#ifndef JXL_DEBUG_DOT_DETECT +#define JXL_DEBUG_DOT_DETECT 0 +#endif + +#if JXL_DEBUG_DOT_DETECT +#include "lib/jxl/aux_out.h" +#endif + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +ImageF SumOfSquareDifferences(const Image3F& forig, const Image3F& smooth, + ThreadPool* pool) { + const HWY_FULL(float) d; + const auto color_coef0 = Set(d, 0.0f); + const auto color_coef1 = Set(d, 10.0f); + const auto color_coef2 = Set(d, 0.0f); + + ImageF sum_of_squares(forig.xsize(), forig.ysize()); + RunOnPool( + pool, 0, forig.ysize(), ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t y = static_cast(task); + const float* JXL_RESTRICT orig_row0 = forig.Plane(0).ConstRow(y); + const float* JXL_RESTRICT orig_row1 = forig.Plane(1).ConstRow(y); + const float* JXL_RESTRICT orig_row2 = forig.Plane(2).ConstRow(y); + const float* JXL_RESTRICT smooth_row0 = smooth.Plane(0).ConstRow(y); + const float* JXL_RESTRICT smooth_row1 = smooth.Plane(1).ConstRow(y); + const float* JXL_RESTRICT smooth_row2 = smooth.Plane(2).ConstRow(y); + float* JXL_RESTRICT sos_row = sum_of_squares.Row(y); + + for (size_t x = 0; x < forig.xsize(); x += Lanes(d)) { + auto v0 = Load(d, orig_row0 + x) - Load(d, smooth_row0 + x); + auto v1 = Load(d, orig_row1 + x) - Load(d, smooth_row1 + x); + auto v2 = Load(d, orig_row2 + x) - Load(d, smooth_row2 + x); + v0 *= v0; + v1 *= v1; + v2 *= v2; + v0 *= color_coef0; // FMA doesn't help here. 
+ v1 *= color_coef1; + v2 *= color_coef2; + const auto sos = v0 + v1 + v2; // weighted sum of square diffs + Store(sos, d, sos_row + x); + } + }, + "ComputeEnergyImage"); + return sum_of_squares; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(SumOfSquareDifferences); // Local function + +const int kEllipseWindowSize = 5; + +namespace { +struct GaussianEllipse { + double x; // position in x + double y; // position in y + double sigma_x; // scale in x + double sigma_y; // scale in y + double angle; // ellipse rotation in radians + std::array intensity; // intensity in each channel + + // The following variables do not need to be encoded + double l2_loss; // error after the Gaussian was fit + double l1_loss; + double ridge_loss; // the l2_loss plus regularization term + double custom_loss; // experimental custom loss + std::array bgColor; // best background color + size_t neg_pixels; // number of negative pixels when subtracting dot + std::array neg_value; // debt due to channel truncation +}; +double DotGaussianModel(double dx, double dy, double ct, double st, + double sigma_x, double sigma_y, double intensity) { + double rx = ct * dx + st * dy; + double ry = -st * dx + ct * dy; + double md = (rx * rx / sigma_x) + (ry * ry / sigma_y); + double value = intensity * exp(-0.5 * md); + return value; +} + +constexpr bool kOptimizeBackground = true; + +// Gaussian that smooths noise but preserves dots +const WeightsSeparable5& WeightsSeparable5Gaussian0_65() { + constexpr float w0 = 0.558311f; + constexpr float w1 = 0.210395f; + constexpr float w2 = 0.010449f; + static constexpr WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}}; + return weights; +} + +// (Iterated) Gaussian that removes dots. 
+const WeightsSeparable5& WeightsSeparable5Gaussian3() { + constexpr float w0 = 0.222338f; + constexpr float w1 = 0.210431f; + constexpr float w2 = 0.1784f; + static constexpr WeightsSeparable5 weights = { + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}, + {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}}; + return weights; +} + +ImageF ComputeEnergyImage(const Image3F& orig, Image3F* smooth, + ThreadPool* pool) { + PROFILER_FUNC; + + // Prepare guidance images for dot selection. + Image3F forig(orig.xsize(), orig.ysize()); + Image3F tmp(orig.xsize(), orig.ysize()); + *smooth = Image3F(orig.xsize(), orig.ysize()); + + const auto& weights1 = WeightsSeparable5Gaussian0_65(); + const auto& weights3 = WeightsSeparable5Gaussian3(); + + Separable5_3(orig, Rect(orig), weights1, pool, &forig); + + Separable5_3(orig, Rect(orig), weights3, pool, &tmp); + Separable5_3(tmp, Rect(tmp), weights3, pool, smooth); + +#if JXL_DEBUG_DOT_DETECT + AuxOut aux; + aux.debug_prefix = "/tmp/sebastian/"; + aux.DumpImage("filtered", forig); + aux.DumpImage("sm", *smooth); +#endif + + return HWY_DYNAMIC_DISPATCH(SumOfSquareDifferences)(forig, *smooth, pool); +} + +struct Pixel { + int x; + int y; +}; + +Pixel operator+(const Pixel& a, const Pixel& b) { + return Pixel{a.x + b.x, a.y + b.y}; +} + +// Maximum area in pixels of a ellipse +const size_t kMaxCCSize = 1000; + +// Extracts a connected component from a Binary image where seed is part +// of the component +bool ExtractComponent(ImageF* img, std::vector* pixels, + const Pixel& seed, double threshold) { + PROFILER_FUNC; + static const std::vector neighbors{{1, -1}, {1, 0}, {1, 1}, {0, -1}, + {0, 1}, {-1, -1}, {-1, 1}, {1, 0}}; + std::vector q{seed}; + while (!q.empty()) { + Pixel current = q.back(); + q.pop_back(); + pixels->push_back(current); + if (pixels->size() > kMaxCCSize) return false; + for (const Pixel& delta : neighbors) { + Pixel child = current + delta; + if (child.x >= 0 && static_cast(child.x) < img->xsize() && + child.y >= 0 && 
static_cast(child.y) < img->ysize()) { + float* value = &img->Row(child.y)[child.x]; + if (*value > threshold) { + *value = 0.0; + q.push_back(child); + } + } + } + } + return true; +} + +inline bool PointInRect(const Rect& r, const Pixel& p) { + return (static_cast(p.x) >= r.x0() && + static_cast(p.x) < (r.x0() + r.xsize()) && + static_cast(p.y) >= r.y0() && + static_cast(p.y) < (r.y0() + r.ysize())); +} + +struct ConnectedComponent { + ConnectedComponent(const Rect& bounds, const std::vector&& pixels) + : bounds(bounds), pixels(pixels) {} + Rect bounds; + std::vector pixels; + float maxEnergy; + float meanEnergy; + float varEnergy; + float meanBg; + float varBg; + float score; + Pixel mode; + + void CompStats(const ImageF& energy, int extra) { + PROFILER_FUNC; + maxEnergy = 0.0; + meanEnergy = 0.0; + varEnergy = 0.0; + meanBg = 0.0; + varBg = 0.0; + int nIn = 0; + int nOut = 0; + mode.x = 0; + mode.y = 0; + for (int sy = -extra; sy < (static_cast(bounds.ysize()) + extra); + sy++) { + int y = sy + static_cast(bounds.y0()); + if (y < 0 || static_cast(y) >= energy.ysize()) continue; + const float* JXL_RESTRICT erow = energy.ConstRow(y); + for (int sx = -extra; sx < (static_cast(bounds.xsize()) + extra); + sx++) { + int x = sx + static_cast(bounds.x0()); + if (x < 0 || static_cast(x) >= energy.xsize()) continue; + if (erow[x] > maxEnergy) { + maxEnergy = erow[x]; + mode.x = x; + mode.y = y; + } + if (PointInRect(bounds, Pixel{x, y})) { + meanEnergy += erow[x]; + varEnergy += erow[x] * erow[x]; + nIn++; + } else { + meanBg += erow[x]; + varBg += erow[x] * erow[x]; + nOut++; + } + } + } + meanEnergy = meanEnergy / nIn; + meanBg = meanBg / nOut; + varEnergy = (varEnergy / nIn) - meanEnergy * meanEnergy; + varBg = (varBg / nOut) - meanBg * meanBg; + score = (meanEnergy - meanBg) / std::sqrt(varBg); + } +}; + +Rect BoundingRectangle(const std::vector& pixels) { + PROFILER_FUNC; + JXL_ASSERT(!pixels.empty()); + int low_x, high_x, low_y, high_y; + low_x = high_x = 
pixels[0].x; + low_y = high_y = pixels[0].y; + for (const Pixel& p : pixels) { + low_x = std::min(low_x, p.x); + high_x = std::max(high_x, p.x); + low_y = std::min(low_y, p.y); + high_y = std::max(high_y, p.y); + } + return Rect(low_x, low_y, high_x - low_x + 1, high_y - low_y + 1); +} + +std::vector FindCC(const ImageF& energy, double t_low, + double t_high, uint32_t maxWindow, + double minScore) { + PROFILER_FUNC; + const int kExtraRect = 4; + ImageF img = CopyImage(energy); + std::vector ans; + for (size_t y = 0; y < img.ysize(); y++) { + float* JXL_RESTRICT row = img.Row(y); + for (size_t x = 0; x < img.xsize(); x++) { + if (row[x] > t_high) { + std::vector pixels; + row[x] = 0.0; + bool success = ExtractComponent( + &img, &pixels, Pixel{static_cast(x), static_cast(y)}, + t_low); + if (!success) continue; +#if JXL_DEBUG_DOT_DETECT + for (size_t i = 0; i < pixels.size(); i++) { + fprintf(stderr, "(%d,%d) ", pixels[i].x, pixels[i].y); + } + fprintf(stderr, "\n"); +#endif // JXL_DEBUG_DOT_DETECT + Rect bounds = BoundingRectangle(pixels); + if (bounds.xsize() < maxWindow && bounds.ysize() < maxWindow) { + ConnectedComponent cc{bounds, std::move(pixels)}; + cc.CompStats(energy, kExtraRect); + if (cc.score < minScore) continue; + JXL_DEBUG(JXL_DEBUG_DOT_DETECT, + "cc mode: (%d,%d), max: %f, bgMean: %f bgVar: " + "%f bound:(%zu,%zu,%zu,%zu)\n", + cc.mode.x, cc.mode.y, cc.maxEnergy, cc.meanEnergy, + cc.varEnergy, cc.bounds.x0(), cc.bounds.y0(), + cc.bounds.xsize(), cc.bounds.ysize()); + ans.push_back(cc); + } + } + } + } + return ans; +} + +// TODO (sggonzalez): Adapt this function for the different color spaces or +// remove it if the color space with the best performance does not need it +void ComputeDotLosses(GaussianEllipse* ellipse, const ConnectedComponent& cc, + const Image3F& img, const Image3F& background) { + PROFILER_FUNC; + const int rectBounds = 2; + const double kIntensityR = 0.0; // 0.015; + const double kSigmaR = 0.0; // 0.01; + const double 
kZeroEpsilon = 0.1; // Tolerance to consider a value negative + double ct = cos(ellipse->angle), st = sin(ellipse->angle); + const std::array channelGains{1.0, 1.0, 1.0}; + int N = 0; + ellipse->l1_loss = 0.0; + ellipse->l2_loss = 0.0; + ellipse->neg_pixels = 0; + ellipse->neg_value.fill(0.0); + double distMeanModeSq = (cc.mode.x - ellipse->x) * (cc.mode.x - ellipse->x) + + (cc.mode.y - ellipse->y) * (cc.mode.y - ellipse->y); + ellipse->custom_loss = 0.0; + for (int c = 0; c < 3; c++) { + for (int sy = -rectBounds; + sy < (static_cast(cc.bounds.ysize()) + rectBounds); sy++) { + int y = sy + cc.bounds.y0(); + if (y < 0 || static_cast(y) >= img.ysize()) continue; + const float* JXL_RESTRICT row = img.ConstPlaneRow(c, y); + // bgrow is only used if kOptimizeBackground is false. + // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) + const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(c, y); + for (int sx = -rectBounds; + sx < (static_cast(cc.bounds.xsize()) + rectBounds); sx++) { + int x = sx + cc.bounds.x0(); + if (x < 0 || static_cast(x) >= img.xsize()) continue; + double target = row[x]; + double dotDelta = DotGaussianModel( + x - ellipse->x, y - ellipse->y, ct, st, ellipse->sigma_x, + ellipse->sigma_y, ellipse->intensity[c]); + if (dotDelta > target + kZeroEpsilon) { + ellipse->neg_pixels++; + ellipse->neg_value[c] += dotDelta - target; + } + double bkg = kOptimizeBackground ? 
ellipse->bgColor[c] : bgrow[x]; + double pred = bkg + dotDelta; + double diff = target - pred; + double l2 = channelGains[c] * diff * diff; + double l1 = channelGains[c] * std::fabs(diff); + ellipse->l2_loss += l2; + ellipse->l1_loss += l1; + double w = DotGaussianModel(x - cc.mode.x, y - cc.mode.y, 1.0, 0.0, + 1.0 + ellipse->sigma_x, + 1.0 + ellipse->sigma_y, 1.0); + ellipse->custom_loss += w * l2; + N++; + } + } + } + ellipse->l2_loss /= N; + ellipse->custom_loss /= N; + ellipse->custom_loss += 20.0 * distMeanModeSq + ellipse->neg_value[1]; + ellipse->l1_loss /= N; + double ridgeTerm = kSigmaR * ellipse->sigma_x + kSigmaR * ellipse->sigma_y; + for (int c = 0; c < 3; c++) { + ridgeTerm += kIntensityR * ellipse->intensity[c] * ellipse->intensity[c]; + } + ellipse->ridge_loss = ellipse->l2_loss + ridgeTerm; +} + +GaussianEllipse FitGaussianFast(const ConnectedComponent& cc, + const ImageF& energy, const Image3F& img, + const Image3F& background) { + PROFILER_FUNC; + constexpr bool leastSqIntensity = true; + constexpr double kEpsilon = 1e-6; + GaussianEllipse ans; + constexpr int kRectBounds = (kEllipseWindowSize >> 1); + + // Compute the 1st and 2nd moments of the CC + double sum = 0.0; + int N = 0; + std::array m1{0.0, 0.0, 0.0}; + std::array m2{0.0, 0.0, 0.0}; + std::array color{0.0, 0.0, 0.0}; + std::array bgColor{0.0, 0.0, 0.0}; + + JXL_DEBUG(JXL_DEBUG_DOT_DETECT, "%zu %zu %zu %zu\n", cc.bounds.x0(), + cc.bounds.y0(), cc.bounds.xsize(), cc.bounds.ysize()); + for (int c = 0; c < 3; c++) { + color[c] = img.ConstPlaneRow(c, cc.mode.y)[cc.mode.x] - + background.ConstPlaneRow(c, cc.mode.y)[cc.mode.x]; + } + double sign = (color[1] > 0) ? 
1 : -1; + for (int sy = -kRectBounds; sy <= kRectBounds; sy++) { + int y = sy + cc.mode.y; + if (y < 0 || static_cast(y) >= energy.ysize()) continue; + const float* JXL_RESTRICT row = img.ConstPlaneRow(1, y); + const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(1, y); + for (int sx = -kRectBounds; sx <= kRectBounds; sx++) { + int x = sx + cc.mode.x; + if (x < 0 || static_cast(x) >= energy.xsize()) continue; + double w = std::max(kEpsilon, sign * (row[x] - bgrow[x])); + sum += w; + + m1[0] += w * x; + m1[1] += w * y; + m2[0] += w * x * x; + m2[1] += w * x * y; + m2[2] += w * y * y; + for (int c = 0; c < 3; c++) { + bgColor[c] += background.ConstPlaneRow(c, y)[x]; + } + N++; + } + } + JXL_CHECK(N > 0); + + for (int i = 0; i < 3; i++) { + m1[i] /= sum; + m2[i] /= sum; + bgColor[i] /= N; + } + + // Some magic constants + constexpr double kSigmaMult = 1.0; + constexpr std::array kScaleMult{1.1, 1.1, 1.1}; + + // Now set the parameters of the Gaussian + ans.x = m1[0]; + ans.y = m1[1]; + for (int j = 0; j < 3; j++) { + ans.intensity[j] = kScaleMult[j] * color[j]; + } + + ImageD Sigma(2, 2), D(1, 2), U(2, 2); + Sigma.Row(0)[0] = m2[0] - m1[0] * m1[0]; + Sigma.Row(1)[1] = m2[2] - m1[1] * m1[1]; + Sigma.Row(0)[1] = Sigma.Row(1)[0] = m2[1] - m1[0] * m1[1]; + ConvertToDiagonal(Sigma, &D, &U); + const double* JXL_RESTRICT d = D.ConstRow(0); + const double* JXL_RESTRICT u = U.ConstRow(1); + int p1 = 0, p2 = 1; + if (d[0] < d[1]) std::swap(p1, p2); + ans.sigma_x = kSigmaMult * d[p1]; + ans.sigma_y = kSigmaMult * d[p2]; + ans.angle = std::atan2(u[p1], u[p2]); + ans.l2_loss = 0.0; + ans.bgColor = bgColor; + if (leastSqIntensity) { + GaussianEllipse* ellipse = &ans; + double ct = cos(ans.angle), st = sin(ans.angle); + // Estimate intensity with least squares (fixed background) + for (int c = 0; c < 3; c++) { + double gg = 0.0; + double gd = 0.0; + int yc = static_cast(cc.mode.y); + int xc = static_cast(cc.mode.x); + for (int y = yc - kRectBounds; y <= yc + kRectBounds; y++) 
{ + if (y < 0 || static_cast(y) >= img.ysize()) continue; + const float* JXL_RESTRICT row = img.ConstPlaneRow(c, y); + const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(c, y); + for (int x = xc - kRectBounds; x <= xc + kRectBounds; x++) { + if (x < 0 || static_cast(x) >= img.xsize()) continue; + double target = row[x] - bgrow[x]; + double gaussian = + DotGaussianModel(x - ellipse->x, y - ellipse->y, ct, st, + ellipse->sigma_x, ellipse->sigma_y, 1.0); + gg += gaussian * gaussian; + gd += gaussian * target; + } + } + ans.intensity[c] = gd / (gg + 1e-6); // Regularized least squares + } + } + ComputeDotLosses(&ans, cc, img, background); + return ans; +} + +GaussianEllipse FitGaussian(const ConnectedComponent& cc, const ImageF& energy, + const Image3F& img, const Image3F& background) { + auto ellipse = FitGaussianFast(cc, energy, img, background); + if (ellipse.sigma_x < ellipse.sigma_y) { + std::swap(ellipse.sigma_x, ellipse.sigma_y); + ellipse.angle += kPi / 2.0; + } + ellipse.angle -= kPi * std::floor(ellipse.angle / kPi); + if (fabs(ellipse.angle - kPi) < 1e-6 || fabs(ellipse.angle) < 1e-6) { + ellipse.angle = 0.0; + } + JXL_CHECK(ellipse.angle >= 0 && ellipse.angle <= kPi && + ellipse.sigma_x >= ellipse.sigma_y); + JXL_DEBUG(JXL_DEBUG_DOT_DETECT, + "Ellipse mu=(%lf,%lf) sigma=(%lf,%lf) angle=%lf " + "intensity=(%lf,%lf,%lf) bg=(%lf,%lf,%lf) l2_loss=%lf " + "custom_loss=%lf, neg_pix=%zu, neg_v=(%lf,%lf,%lf)\n", + ellipse.x, ellipse.y, ellipse.sigma_x, ellipse.sigma_y, + ellipse.angle, ellipse.intensity[0], ellipse.intensity[1], + ellipse.intensity[2], ellipse.bgColor[0], ellipse.bgColor[1], + ellipse.bgColor[2], ellipse.l2_loss, ellipse.custom_loss, + ellipse.neg_pixels, ellipse.neg_value[0], ellipse.neg_value[1], + ellipse.neg_value[2]); + return ellipse; +} + +} // namespace + +std::vector DetectGaussianEllipses( + const Image3F& opsin, const GaussianDetectParams& params, + const EllipseQuantParams& qParams, ThreadPool* pool) { + PROFILER_FUNC; + 
std::vector dots; + Image3F smooth(opsin.xsize(), opsin.ysize()); + ImageF energy = ComputeEnergyImage(opsin, &smooth, pool); +#if JXL_DEBUG_DOT_DETECT + AuxOut aux; + aux.debug_prefix = "/tmp/sebastian/"; + aux.DumpXybImage("smooth", smooth); + aux.DumpPlaneNormalized("energy", energy); +#endif // JXL_DEBUG_DOT_DETECT + std::vector components = FindCC( + energy, params.t_low, params.t_high, params.maxWinSize, params.minScore); + size_t numCC = + std::min(params.maxCC, (components.size() * params.percCC) / 100); + if (components.size() > numCC) { + std::sort( + components.begin(), components.end(), + [](const ConnectedComponent& a, const ConnectedComponent& b) -> bool { + return a.score > b.score; + }); + components.erase(components.begin() + numCC, components.end()); + } + for (const auto& cc : components) { + GaussianEllipse ellipse = FitGaussian(cc, energy, opsin, smooth); + if (ellipse.x < 0.0 || + std::ceil(ellipse.x) >= static_cast(opsin.xsize()) || + ellipse.y < 0.0 || + std::ceil(ellipse.y) >= static_cast(opsin.ysize())) { + continue; + } + if (ellipse.neg_pixels > params.maxNegPixels) continue; + double intensity = 0.21 * ellipse.intensity[0] + + 0.72 * ellipse.intensity[1] + + 0.07 * ellipse.intensity[2]; + double intensitySq = intensity * intensity; + // for (int c = 0; c < 3; c++) { + // intensitySq += ellipse.intensity[c] * ellipse.intensity[c]; + //} + double sqDistMeanMode = (ellipse.x - cc.mode.x) * (ellipse.x - cc.mode.x) + + (ellipse.y - cc.mode.y) * (ellipse.y - cc.mode.y); + if (ellipse.l2_loss < params.maxL2Loss && + ellipse.custom_loss < params.maxCustomLoss && + intensitySq > (params.minIntensity * params.minIntensity) && + sqDistMeanMode < params.maxDistMeanMode * params.maxDistMeanMode) { + size_t x0 = cc.bounds.x0(); + size_t y0 = cc.bounds.y0(); + dots.emplace_back(); + dots.back().second.emplace_back(x0, y0); + QuantizedPatch& patch = dots.back().first; + patch.xsize = cc.bounds.xsize(); + patch.ysize = cc.bounds.ysize(); + for (size_t y 
= 0; y < patch.ysize; y++) { + for (size_t x = 0; x < patch.xsize; x++) { + for (size_t c = 0; c < 3; c++) { + patch.fpixels[c][y * patch.xsize + x] = + opsin.ConstPlaneRow(c, y0 + y)[x0 + x] - + smooth.ConstPlaneRow(c, y0 + y)[x0 + x]; + } + } + } + } + } +#if JXL_DEBUG_DOT_DETECT + JXL_DEBUG(JXL_DEBUG_DOT_DETECT, "Candidates: %zu, Dots: %zu\n", + components.size(), dots.size()); + ApplyGaussianEllipses(&smooth, dots, 1.0); + aux.DumpXybImage("draw", smooth); + ApplyGaussianEllipses(&smooth, dots, -1.0); + + auto qdots = QuantizeGaussianEllipses(dots, qParams); + auto deq = DequantizeGaussianEllipses(qdots, qParams); + ApplyGaussianEllipses(&smooth, deq, 1.0); + aux.DumpXybImage("qdraw", smooth); + ApplyGaussianEllipses(&smooth, deq, -1.0); +#endif // JXL_DEBUG_DOT_DETECT + return dots; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.h new file mode 100644 index 0000000000..6e06a164fd --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_detect_dots.h @@ -0,0 +1,66 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// We attempt to remove dots, or speckle from images using Gaussian blur. 
+#ifndef LIB_JXL_ENC_DETECT_DOTS_H_ +#define LIB_JXL_ENC_DETECT_DOTS_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/image.h" + +namespace jxl { + +struct GaussianDetectParams { + double t_high = 0; // at least one pixel must have larger energy than t_high + double t_low = 0; // all pixels must have a larger energy than tLow + uint32_t maxWinSize = 0; // discard dots larger than this containing window + double maxL2Loss = 0; + double maxCustomLoss = 0; + double minIntensity = 0; // If the intensity is too low, discard it + double maxDistMeanMode = 0; // The mean and the mode must be close + size_t maxNegPixels = 0; // Maximum number of negative pixel + size_t minScore = 0; + size_t maxCC = 50; // Maximum number of CC to keep + size_t percCC = 15; // Percentage in [0,100] of CC to keep +}; + +// Ellipse Quantization Params +struct EllipseQuantParams { + size_t xsize; // Image size in x + size_t ysize; // Image size in y + size_t qPosition; // Position quantization delta + // Quantization for the Gaussian sigma parameters + double minSigma; + double maxSigma; + size_t qSigma; // number of quantization levels + // Quantization for the rotation angle (between -pi and pi) + size_t qAngle; + // Quantization for the intensity + std::array minIntensity; + std::array maxIntensity; + std::array qIntensity; // number of quantization levels + // Extra parameters for the encoding + bool subtractQuantized; // Should we subtract quantized or detected dots? + float ytox; + float ytob; + + void QuantPositionSize(size_t* xsize, size_t* ysize) const; +}; + +// Detects dots in XYB image. 
+std::vector DetectGaussianEllipses( + const Image3F& opsin, const GaussianDetectParams& params, + const EllipseQuantParams& qParams, ThreadPool* pool); + +} // namespace jxl + +#endif // LIB_JXL_ENC_DETECT_DOTS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc new file mode 100644 index 0000000000..40440c4aa4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.cc @@ -0,0 +1,72 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_dot_dictionary.h" + +#include +#include + +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_detect_dots.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// Private implementation of Dictionary Encode/Decode +namespace { + +/* Quantization constants for Ellipse dots */ +const size_t kEllipsePosQ = 2; // Quantization level for the position +const double kEllipseMinSigma = 0.1; // Minimum sigma value +const double kEllipseMaxSigma = 3.1; // Maximum Sigma value +const size_t kEllipseSigmaQ = 16; // Number of quantization levels for sigma +const size_t kEllipseAngleQ = 8; // Quantization level for the angle +// TODO: fix these values. 
+const std::array kEllipseMinIntensity{-0.05, 0.0, -0.5}; +const std::array kEllipseMaxIntensity{0.05, 1.0, 0.4}; +const std::array kEllipseIntensityQ{10, 36, 10}; +} // namespace + +std::vector FindDotDictionary(const CompressParams& cparams, + const Image3F& opsin, + const ColorCorrelationMap& cmap, + ThreadPool* pool) { + if (ApplyOverride(cparams.dots, + cparams.butteraugli_distance >= kMinButteraugliForDots)) { + GaussianDetectParams ellipse_params; + ellipse_params.t_high = 0.04; + ellipse_params.t_low = 0.02; + ellipse_params.maxWinSize = 5; + ellipse_params.maxL2Loss = 0.005; + ellipse_params.maxCustomLoss = 300; + ellipse_params.minIntensity = 0.12; + ellipse_params.maxDistMeanMode = 1.0; + ellipse_params.maxNegPixels = 0; + ellipse_params.minScore = 12.0; + ellipse_params.maxCC = 100; + ellipse_params.percCC = 100; + EllipseQuantParams qParams{ + opsin.xsize(), opsin.ysize(), kEllipsePosQ, + kEllipseMinSigma, kEllipseMaxSigma, kEllipseSigmaQ, + kEllipseAngleQ, kEllipseMinIntensity, kEllipseMaxIntensity, + kEllipseIntensityQ, kEllipsePosQ <= 5, cmap.YtoXRatio(0), + cmap.YtoBRatio(0)}; + + return DetectGaussianEllipses(opsin, ellipse_params, qParams, pool); + } + return {}; +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.h new file mode 100644 index 0000000000..f89791e4b1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_dot_dictionary.h @@ -0,0 +1,35 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_DOT_DICTIONARY_H_ +#define LIB_JXL_ENC_DOT_DICTIONARY_H_ + +// Dots are stored in a dictionary to avoid storing similar dots multiple +// times. 
+ +#include + +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" + +namespace jxl { + +std::vector FindDotDictionary(const CompressParams& cparams, + const Image3F& opsin, + const ColorCorrelationMap& cmap, + ThreadPool* pool); + +} // namespace jxl + +#endif // LIB_JXL_ENC_DOT_DICTIONARY_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc new file mode 100644 index 0000000000..0946300972 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.cc @@ -0,0 +1,268 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_entropy_coder.h" + +#include +#include + +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_entropy_coder.cc" +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_context_map.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/epf.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// Returns number of non-zero coefficients (but skip LLF). 
+// We cannot rely on block[] being all-zero bits, so first truncate to integer. +// Also writes the per-8x8 block nzeros starting at nzeros_pos. +int32_t NumNonZeroExceptLLF(const size_t cx, const size_t cy, + const AcStrategy acs, const size_t covered_blocks, + const size_t log2_covered_blocks, + const int32_t* JXL_RESTRICT block, + const size_t nzeros_stride, + int32_t* JXL_RESTRICT nzeros_pos) { + const HWY_CAPPED(int32_t, kBlockDim) di; + + const auto zero = Zero(di); + // Add FF..FF for every zero coefficient, negate to get #zeros. + auto neg_sum_zero = zero; + + { + // Mask sufficient for one row of coefficients. + HWY_ALIGN const int32_t + llf_mask_lanes[AcStrategy::kMaxCoeffBlocks * (1 + kBlockDim)] = { + -1, -1, -1, -1}; + // First cx=1,2,4 elements are FF..FF, others 0. + const int32_t* llf_mask_pos = + llf_mask_lanes + AcStrategy::kMaxCoeffBlocks - cx; + + // Rows with LLF: mask out the LLF + for (size_t y = 0; y < cy; y++) { + for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) { + const auto llf_mask = LoadU(di, llf_mask_pos + x); + + // LLF counts as zero so we don't include it in nzeros. + const auto coef = + AndNot(llf_mask, Load(di, &block[y * cx * kBlockDim + x])); + + neg_sum_zero += VecFromMask(di, coef == zero); + } + } + } + + // Remaining rows: no mask + for (size_t y = cy; y < cy * kBlockDim; y++) { + for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) { + const auto coef = Load(di, &block[y * cx * kBlockDim + x]); + neg_sum_zero += VecFromMask(di, coef == zero); + } + } + + // We want area - sum_zero, add because neg_sum_zero is already negated. + const int32_t nzeros = + int32_t(cx * cy * kDCTBlockSize) + GetLane(SumOfLanes(neg_sum_zero)); + + const int32_t shifted_nzeros = static_cast( + (nzeros + covered_blocks - 1) >> log2_covered_blocks); + // Need non-canonicalized dimensions! 
+ for (size_t y = 0; y < acs.covered_blocks_y(); y++) { + for (size_t x = 0; x < acs.covered_blocks_x(); x++) { + nzeros_pos[x + y * nzeros_stride] = shifted_nzeros; + } + } + + return nzeros; +} + +// Specialization for 8x8, where only top-left is LLF/DC. +// About 1% overall speedup vs. NumNonZeroExceptLLF. +int32_t NumNonZero8x8ExceptDC(const int32_t* JXL_RESTRICT block, + int32_t* JXL_RESTRICT nzeros_pos) { + const HWY_CAPPED(int32_t, kBlockDim) di; + + const auto zero = Zero(di); + // Add FF..FF for every zero coefficient, negate to get #zeros. + auto neg_sum_zero = zero; + + { + // First row has DC, so mask + const size_t y = 0; + HWY_ALIGN const int32_t dc_mask_lanes[kBlockDim] = {-1}; + + for (size_t x = 0; x < kBlockDim; x += Lanes(di)) { + const auto dc_mask = Load(di, dc_mask_lanes + x); + + // DC counts as zero so we don't include it in nzeros. + const auto coef = AndNot(dc_mask, Load(di, &block[y * kBlockDim + x])); + + neg_sum_zero += VecFromMask(di, coef == zero); + } + } + + // Remaining rows: no mask + for (size_t y = 1; y < kBlockDim; y++) { + for (size_t x = 0; x < kBlockDim; x += Lanes(di)) { + const auto coef = Load(di, &block[y * kBlockDim + x]); + neg_sum_zero += VecFromMask(di, coef == zero); + } + } + + // We want 64 - sum_zero, add because neg_sum_zero is already negated. + const int32_t nzeros = + int32_t(kDCTBlockSize) + GetLane(SumOfLanes(neg_sum_zero)); + + *nzeros_pos = nzeros; + + return nzeros; +} + +// The number of nonzeros of each block is predicted from the top and the left +// blocks, with opportune scaling to take into account the number of blocks of +// each strategy. The predicted number of nonzeros divided by two is used as a +// context; if this number is above 63, a specific context is used. If the +// number of nonzeros of a strategy is above 63, it is written directly using a +// fixed number of bits (that depends on the size of the strategy). 
+void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders, + const Rect& rect, + const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows, + const AcStrategyImage& ac_strategy, + YCbCrChromaSubsampling cs, + Image3I* JXL_RESTRICT tmp_num_nzeroes, + std::vector* JXL_RESTRICT output, + const ImageB& qdc, const ImageI& qf, + const BlockCtxMap& block_ctx_map) { + const size_t xsize_blocks = rect.xsize(); + const size_t ysize_blocks = rect.ysize(); + + // TODO(user): update the estimate: usually less coefficients are used. + output->reserve(output->size() + + 3 * xsize_blocks * ysize_blocks * kDCTBlockSize); + + size_t offset[3] = {}; + const size_t nzeros_stride = tmp_num_nzeroes->PixelsPerRow(); + for (size_t by = 0; by < ysize_blocks; ++by) { + size_t sby[3] = {by >> cs.VShift(0), by >> cs.VShift(1), + by >> cs.VShift(2)}; + int32_t* JXL_RESTRICT row_nzeros[3] = { + tmp_num_nzeroes->PlaneRow(0, sby[0]), + tmp_num_nzeroes->PlaneRow(1, sby[1]), + tmp_num_nzeroes->PlaneRow(2, sby[2]), + }; + const int32_t* JXL_RESTRICT row_nzeros_top[3] = { + sby[0] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(0, sby[0] - 1), + sby[1] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(1, sby[1] - 1), + sby[2] == 0 ? 
nullptr : tmp_num_nzeroes->ConstPlaneRow(2, sby[2] - 1), + }; + const uint8_t* JXL_RESTRICT row_qdc = + qdc.ConstRow(rect.y0() + by) + rect.x0(); + const int32_t* JXL_RESTRICT row_qf = rect.ConstRow(qf, by); + AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by); + for (size_t bx = 0; bx < xsize_blocks; ++bx) { + AcStrategy acs = acs_row[bx]; + if (!acs.IsFirstBlock()) continue; + size_t sbx[3] = {bx >> cs.HShift(0), bx >> cs.HShift(1), + bx >> cs.HShift(2)}; + size_t cx = acs.covered_blocks_x(); + size_t cy = acs.covered_blocks_y(); + const size_t covered_blocks = cx * cy; // = #LLF coefficients + const size_t log2_covered_blocks = + Num0BitsBelowLS1Bit_Nonzero(covered_blocks); + const size_t size = covered_blocks * kDCTBlockSize; + + CoefficientLayout(&cy, &cx); // swap cx/cy to canonical order + + for (int c : {1, 0, 2}) { + if (sbx[c] << cs.HShift(c) != bx) continue; + if (sby[c] << cs.VShift(c) != by) continue; + const int32_t* JXL_RESTRICT block = ac_rows[c] + offset[c]; + + int32_t nzeros = + (covered_blocks == 1) + ? NumNonZero8x8ExceptDC(block, row_nzeros[c] + sbx[c]) + : NumNonZeroExceptLLF(cx, cy, acs, covered_blocks, + log2_covered_blocks, block, nzeros_stride, + row_nzeros[c] + sbx[c]); + + int ord = kStrategyOrder[acs.RawStrategy()]; + const coeff_order_t* JXL_RESTRICT order = + &orders[CoeffOrderOffset(ord, c)]; + + int32_t predicted_nzeros = + PredictFromTopAndLeft(row_nzeros_top[c], row_nzeros[c], sbx[c], 32); + size_t block_ctx = + block_ctx_map.Context(row_qdc[bx], row_qf[sbx[c]], ord, c); + const int32_t nzero_ctx = + block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx); + + output->emplace_back(nzero_ctx, nzeros); + const size_t histo_offset = + block_ctx_map.ZeroDensityContextsOffset(block_ctx); + // Skip LLF. + size_t prev = (nzeros > static_cast(size / 16) ? 
0 : 1); + for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) { + int32_t coeff = block[order[k]]; + size_t ctx = + histo_offset + ZeroDensityContext(nzeros, k, covered_blocks, + log2_covered_blocks, prev); + uint32_t u_coeff = PackSigned(coeff); + output->emplace_back(ctx, u_coeff); + prev = coeff != 0; + nzeros -= prev; + } + JXL_DASSERT(nzeros == 0); + offset[c] += size; + } + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(TokenizeCoefficients); +void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders, + const Rect& rect, + const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows, + const AcStrategyImage& ac_strategy, + YCbCrChromaSubsampling cs, + Image3I* JXL_RESTRICT tmp_num_nzeroes, + std::vector* JXL_RESTRICT output, + const ImageB& qdc, const ImageI& qf, + const BlockCtxMap& block_ctx_map) { + return HWY_DYNAMIC_DISPATCH(TokenizeCoefficients)( + orders, rect, ac_rows, ac_strategy, cs, tmp_num_nzeroes, output, qdc, qf, + block_ctx_map); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.h new file mode 100644 index 0000000000..7dfc71c726 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_entropy_coder.h @@ -0,0 +1,46 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ENC_ENTROPY_CODER_H_ +#define LIB_JXL_ENC_ENTROPY_CODER_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/ac_context.h" // BlockCtxMap +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/frame_header.h" // YCbCrChromaSubsampling +#include "lib/jxl/image.h" + +// Entropy coding and context modeling of DC and AC coefficients, as well as AC +// strategy and quantization field. + +namespace jxl { + +// Generate DCT NxN quantized AC values tokens. +// Only the subset "rect" [in units of blocks] within all images. +// See also DecodeACVarBlock. +void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders, + const Rect& rect, + const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows, + const AcStrategyImage& ac_strategy, + YCbCrChromaSubsampling cs, + Image3I* JXL_RESTRICT tmp_num_nzeroes, + std::vector* JXL_RESTRICT output, + const ImageB& qdc, const ImageI& qf, + const BlockCtxMap& block_ctx_map); + +} // namespace jxl + +#endif // LIB_JXL_ENC_ENTROPY_CODER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc new file mode 100644 index 0000000000..f1eb155e71 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.cc @@ -0,0 +1,283 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_external_image.h" + +#include + +#include +#include +#include +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" + +namespace jxl { +namespace { + +// Loads a float in big endian +float LoadBEFloat(const uint8_t* p) { + float value; + const uint32_t u = LoadBE32(p); + memcpy(&value, &u, 4); + return value; +} + +// Loads a float in little endian +float LoadLEFloat(const uint8_t* p) { + float value; + const uint32_t u = LoadLE32(p); + memcpy(&value, &u, 4); + return value; +} + +typedef uint32_t(LoadFuncType)(const uint8_t* p); +template +void JXL_INLINE LoadFloatRow(float* JXL_RESTRICT row_out, const uint8_t* in, + float mul, size_t xsize, size_t bytes_per_pixel) { + size_t i = 0; + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = mul * LoadFunc(in + i); + i += bytes_per_pixel; + } +} + +uint32_t JXL_INLINE Load8(const uint8_t* p) { return *p; } + +} // namespace + +Status ConvertFromExternal(Span bytes, size_t xsize, + size_t ysize, const ColorEncoding& c_current, + bool has_alpha, bool alpha_is_premultiplied, + size_t bits_per_sample, JxlEndianness endianness, + bool flipped_y, ThreadPool* pool, ImageBundle* ib) { + if (bits_per_sample < 1 || bits_per_sample > 32) { + return JXL_FAILURE("Invalid bits_per_sample value."); + } + // TODO(deymo): Implement 1-bit per sample as 8 samples per byte. In + // any other case we use DivCeil(bits_per_sample, 8) bytes per pixel per + // channel. + if (bits_per_sample == 1) { + return JXL_FAILURE("packed 1-bit per sample is not yet supported"); + } + + const size_t color_channels = c_current.Channels(); + const size_t channels = color_channels + has_alpha; + + // bytes_per_channel and bytes_per_pixel are only valid for + // bits_per_sample > 1. 
+ const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte); + const size_t bytes_per_pixel = channels * bytes_per_channel; + + const size_t row_size = xsize * bytes_per_pixel; + if (ysize && bytes.size() / ysize < row_size) { + return JXL_FAILURE("Buffer size is too small"); + } + + const bool little_endian = + endianness == JXL_LITTLE_ENDIAN || + (endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()); + + const uint8_t* const in = bytes.data(); + + Image3F color(xsize, ysize); + ImageF alpha; + if (has_alpha) { + alpha = ImageF(xsize, ysize); + } + + // Matches the old behavior of PackedImage. + // TODO(sboukortt): make this a parameter. + const bool float_in = bits_per_sample == 32; + + const auto get_y = [flipped_y, ysize](const size_t y) { + return flipped_y ? ysize - 1 - y : y; + }; + + if (float_in) { + if (bits_per_sample != 32) { + return JXL_FAILURE("non-32-bit float not supported"); + } + for (size_t c = 0; c < color_channels; ++c) { + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const size_t y = get_y(task); + size_t i = + row_size * task + (c * bits_per_sample / jxl::kBitsPerByte); + float* JXL_RESTRICT row_out = color.PlaneRow(c, y); + if (little_endian) { + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = LoadLEFloat(in + i); + i += bytes_per_pixel; + } + } else { + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = LoadBEFloat(in + i); + i += bytes_per_pixel; + } + } + }, + "ConvertRGBFloat"); + } + } else { + // Multiplier to convert from the integer range to floating point 0-1 range. + float mul = 1. 
/ ((1ull << bits_per_sample) - 1); + for (size_t c = 0; c < color_channels; ++c) { + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const size_t y = get_y(task); + size_t i = row_size * task + c * bytes_per_channel; + float* JXL_RESTRICT row_out = color.PlaneRow(c, y); + // TODO(deymo): add bits_per_sample == 1 case here. Also maybe + // implement masking if bits_per_sample is not a multiple of 8. + if (bits_per_sample <= 8) { + LoadFloatRow(row_out, in + i, mul, xsize, bytes_per_pixel); + } else if (bits_per_sample <= 16) { + if (little_endian) { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } else { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } + } else if (bits_per_sample <= 24) { + if (little_endian) { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } else { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } + } else { + if (little_endian) { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } else { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } + } + }, + "ConvertRGBUint"); + } + } + + if (color_channels == 1) { + CopyImageTo(color.Plane(0), &color.Plane(1)); + CopyImageTo(color.Plane(0), &color.Plane(2)); + } + + ib->SetFromImage(std::move(color), c_current); + + if (has_alpha) { + if (float_in) { + if (bits_per_sample != 32) { + return JXL_FAILURE("non-32-bit float not supported"); + } + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const size_t y = get_y(task); + size_t i = row_size * task + + (color_channels * bits_per_sample / jxl::kBitsPerByte); + float* JXL_RESTRICT row_out = alpha.Row(y); + if (little_endian) { + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = LoadLEFloat(in + i); + i += bytes_per_pixel; + } + } else { + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = LoadBEFloat(in + i); + i += bytes_per_pixel; 
+ } + } + }, + "ConvertAlphaFloat"); + } else { + float mul = 1. / ((1ull << bits_per_sample) - 1); + RunOnPool( + pool, 0, static_cast(ysize), ThreadPool::SkipInit(), + [&](const int task, int /*thread*/) { + const size_t y = get_y(task); + size_t i = row_size * task + color_channels * bytes_per_channel; + float* JXL_RESTRICT row_out = alpha.Row(y); + // TODO(deymo): add bits_per_sample == 1 case here. Also maybe + // implement masking if bits_per_sample is not a multiple of 8. + if (bits_per_sample <= 8) { + LoadFloatRow(row_out, in + i, mul, xsize, bytes_per_pixel); + } else if (bits_per_sample <= 16) { + if (little_endian) { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } else { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } + } else if (bits_per_sample <= 24) { + if (little_endian) { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } else { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } + } else { + if (little_endian) { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } else { + LoadFloatRow(row_out, in + i, mul, xsize, + bytes_per_pixel); + } + } + }, + "ConvertAlphaUint"); + } + + ib->SetAlpha(std::move(alpha), alpha_is_premultiplied); + } + + return true; +} + +Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize, + uint32_t ysize, const void* buffer, size_t size, + jxl::ThreadPool* pool, + const jxl::ColorEncoding& c_current, + jxl::ImageBundle* ib) { + size_t bitdepth; + + // TODO(zond): Make this accept more than float and uint8/16. 
+ if (pixel_format.data_type == JXL_TYPE_FLOAT) { + bitdepth = 32; + } else if (pixel_format.data_type == JXL_TYPE_UINT8) { + bitdepth = 8; + } else if (pixel_format.data_type == JXL_TYPE_UINT16) { + bitdepth = 16; + } else { + return JXL_FAILURE("unsupported bitdepth"); + } + + JXL_RETURN_IF_ERROR(ConvertFromExternal( + jxl::Span(static_cast(const_cast(buffer)), + size), + xsize, ysize, c_current, + /*has_alpha=*/pixel_format.num_channels == 2 || + pixel_format.num_channels == 4, + /*alpha_is_premultiplied=*/false, bitdepth, pixel_format.endianness, + /*flipped_y=*/false, pool, ib)); + ib->VerifyMetadata(); + + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.h new file mode 100644 index 0000000000..f943fc54ef --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image.h @@ -0,0 +1,50 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_EXTERNAL_IMAGE_H_ +#define LIB_JXL_ENC_EXTERNAL_IMAGE_H_ + +// Interleaved image for color transforms and Codec. + +#include +#include + +#include "jxl/types.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Return the size in bytes of a given xsize, channels and bits_per_sample +// interleaved image. +constexpr size_t RowSize(size_t xsize, size_t channels, + size_t bits_per_sample) { + return bits_per_sample == 1 + ? DivCeil(xsize, kBitsPerByte) + : xsize * channels * DivCeil(bits_per_sample, kBitsPerByte); +} + +// Convert an interleaved pixel buffer to the internal ImageBundle +// representation. 
This is the opposite of ConvertToExternal(). +Status ConvertFromExternal(Span bytes, size_t xsize, + size_t ysize, const ColorEncoding& c_current, + bool has_alpha, bool alpha_is_premultiplied, + size_t bits_per_sample, JxlEndianness endianness, + bool flipped_y, ThreadPool* pool, ImageBundle* ib); + +Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize, + uint32_t ysize, const void* buffer, size_t size, + jxl::ThreadPool* pool, + const jxl::ColorEncoding& c_current, + jxl::ImageBundle* ib); + +} // namespace jxl + +#endif // LIB_JXL_ENC_EXTERNAL_IMAGE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image_gbench.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image_gbench.cc new file mode 100644 index 0000000000..2af942b7f5 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image_gbench.cc @@ -0,0 +1,49 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark/benchmark.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { +namespace { + +// Encoder case, deinterleaves a buffer. +void BM_EncExternalImage_ConvertImageRGBA(benchmark::State& state) { + const size_t kNumIter = 5; + size_t xsize = state.range(); + size_t ysize = state.range(); + + ImageMetadata im; + im.SetAlphaBits(8); + ImageBundle ib(&im); + + std::vector interleaved(xsize * ysize * 4); + + for (auto _ : state) { + for (size_t i = 0; i < kNumIter; ++i) { + JXL_CHECK(ConvertFromExternal( + Span(interleaved.data(), interleaved.size()), xsize, + ysize, + /*c_current=*/ColorEncoding::SRGB(), + /*has_alpha=*/true, + /*alpha_is_premultiplied=*/false, + /*bits_per_sample=*/8, JXL_NATIVE_ENDIAN, + /*flipped_y=*/false, + /*pool=*/nullptr, &ib)); + } + } + + // Pixels per second. 
+ state.SetItemsProcessed(kNumIter * state.iterations() * xsize * ysize); + state.SetBytesProcessed(kNumIter * state.iterations() * interleaved.size()); +} + +BENCHMARK(BM_EncExternalImage_ConvertImageRGBA) + ->RangeMultiplier(2) + ->Range(256, 2048); + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image_test.cc new file mode 100644 index 0000000000..3f3ac8988c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_external_image_test.cc @@ -0,0 +1,49 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_external_image.h" + +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" + +namespace jxl { +namespace { + +#if !defined(JXL_CRASH_ON_ERROR) +TEST(ExternalImageTest, InvalidSize) { + ImageMetadata im; + im.SetAlphaBits(8); + ImageBundle ib(&im); + + const uint8_t buf[10 * 100 * 8] = {}; + EXPECT_FALSE(ConvertFromExternal( + Span(buf, 10), /*xsize=*/10, /*ysize=*/100, + /*c_current=*/ColorEncoding::SRGB(), /*has_alpha=*/true, + /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN, + /*flipped_y=*/false, nullptr, &ib)); + EXPECT_FALSE(ConvertFromExternal( + Span(buf, sizeof(buf) - 1), /*xsize=*/10, /*ysize=*/100, + /*c_current=*/ColorEncoding::SRGB(), /*has_alpha=*/true, + /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN, + /*flipped_y=*/false, nullptr, &ib)); + EXPECT_TRUE( + ConvertFromExternal(Span(buf, sizeof(buf)), /*xsize=*/10, + /*ysize=*/100, /*c_current=*/ColorEncoding::SRGB(), + 
/*has_alpha=*/true, /*alpha_is_premultiplied=*/false, + /*bits_per_sample=*/16, JXL_BIG_ENDIAN, + /*flipped_y=*/false, nullptr, &ib)); +} +#endif + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc new file mode 100644 index 0000000000..16f7670c1a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_fast_heuristics.cc @@ -0,0 +1,361 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include +#include + +#include "lib/jxl/convolve.h" +#include "lib/jxl/enc_ac_strategy.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ar_control_field.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_heuristics.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/gauss_blur.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_fast_heuristics.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { +using DF4 = HWY_CAPPED(float, 4); +DF4 df4; +HWY_FULL(float) df; + +Status Heuristics(PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* linear, Image3F* opsin, ThreadPool* pool, + AuxOut* aux_out) { + PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented"); + CompressParams& cparams = enc_state->cparams; + PassesSharedState& shared = enc_state->shared; + const FrameDimensions& frame_dim = enc_state->shared.frame_dim; + JXL_CHECK(cparams.butteraugli_distance > 0); + + // TODO(veluca): make this tiled. + if (shared.frame_header.loop_filter.gab) { + GaborishInverse(opsin, 0.9908511000000001f, pool); + } + // Compute image of high frequencies by removing a blurred version. 
+ // TODO(veluca): certainly can be made faster, and use less memory... + constexpr size_t pad = 16; + Image3F padded = PadImageMirror(*opsin, pad, pad); + // Make the image (X, Y, B-Y) + // TODO(veluca): SubtractFrom is not parallel *and* not SIMD-fied. + SubtractFrom(padded.Plane(1), &padded.Plane(2)); + // Ensure that OOB access for CfL does nothing. Not necessary if doing things + // properly... + Image3F hf(padded.xsize() + 64, padded.ysize()); + ZeroFillImage(&hf); + hf.ShrinkTo(padded.xsize(), padded.ysize()); + ImageF temp(padded.xsize(), padded.ysize()); + // TODO(veluca): consider some faster blurring method. + auto g = CreateRecursiveGaussian(11.415258091746161); + for (size_t c = 0; c < 3; c++) { + FastGaussian(g, padded.Plane(c), pool, &temp, &hf.Plane(c)); + SubtractFrom(padded.Plane(c), &hf.Plane(c)); + } + // TODO(veluca): DC CfL? + size_t xcolortiles = DivCeil(frame_dim.xsize_blocks, kColorTileDimInBlocks); + size_t ycolortiles = DivCeil(frame_dim.ysize_blocks, kColorTileDimInBlocks); + RunOnPool( + pool, 0, xcolortiles * ycolortiles, ThreadPool::SkipInit(), + [&](size_t tile_id, size_t _) { + size_t tx = tile_id % xcolortiles; + size_t ty = tile_id / xcolortiles; + size_t x0 = tx * kColorTileDim; + size_t x1 = std::min(x0 + kColorTileDim, hf.xsize()); + size_t y0 = ty * kColorTileDim; + size_t y1 = std::min(y0 + kColorTileDim, hf.ysize()); + for (size_t c : {0, 2}) { + static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; + auto ca = Zero(df); + auto cb = Zero(df); + const auto inv_color_factor = Set(df, kInvColorFactor); + for (size_t y = y0; y < y1; y++) { + const float* row_m = hf.PlaneRow(1, y); + const float* row_s = hf.PlaneRow(c, y); + for (size_t x = x0; x < x1; x += Lanes(df)) { + // color residual = ax + b + const auto a = inv_color_factor * Load(df, row_m + x); + const auto b = Zero(df) - Load(df, row_s + x); + ca = MulAdd(a, a, ca); + cb = MulAdd(a, b, cb); + } + } + float best = + -GetLane(SumOfLanes(cb)) / 
(GetLane(SumOfLanes(ca)) + 1e-9f); + int8_t& res = (c == 0 ? shared.cmap.ytox_map : shared.cmap.ytob_map) + .Row(ty)[tx]; + res = std::max(-128.0f, std::min(127.0f, roundf(best))); + } + }, + "CfL"); + Image3F pooled(frame_dim.xsize_padded / 4, frame_dim.ysize_padded / 4); + Image3F summed(frame_dim.xsize_padded / 4, frame_dim.ysize_padded / 4); + RunOnPool( + pool, 0, frame_dim.ysize_padded / 4, ThreadPool::SkipInit(), + [&](size_t y, size_t _) { + for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT row_out = pooled.PlaneRow(c, y); + float* JXL_RESTRICT row_out_avg = summed.PlaneRow(c, y); + const float* JXL_RESTRICT row_in[4]; + for (size_t iy = 0; iy < 4; iy++) { + row_in[iy] = hf.PlaneRow(c, 4 * y + pad + iy); + } + for (size_t x = 0; x < frame_dim.xsize_padded / 4; x++) { + auto max = Zero(df4); + auto sum = Zero(df4); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix += Lanes(df4)) { + const auto nn = Abs(Load(df4, row_in[iy] + x * 4 + ix + pad)); + sum += nn; + max = IfThenElse(max > nn, max, nn); + } + } + row_out_avg[x] = GetLane(SumOfLanes(sum)); + row_out[x] = GetLane(MaxOfLanes(max)); + } + } + }, + "MaxPool"); + // TODO(veluca): better handling of the border + // TODO(veluca): consider some faster blurring method. + // TODO(veluca): parallelize. + // Remove noise from the resulting image. 
+ auto g2 = CreateRecursiveGaussian(2.0849544429861884); + constexpr size_t pad2 = 16; + Image3F summed_pad = PadImageMirror(summed, pad2, pad2); + ImageF tmp_out(summed_pad.xsize(), summed_pad.ysize()); + ImageF tmp2(summed_pad.xsize(), summed_pad.ysize()); + Image3F pooled_pad = PadImageMirror(pooled, pad2, pad2); + for (size_t c = 0; c < 3; c++) { + FastGaussian(g2, summed_pad.Plane(c), pool, &tmp2, &tmp_out); + const auto unblurred_multiplier = Set(df, 0.5f); + for (size_t y = 0; y < summed.ysize(); y++) { + float* row = summed.PlaneRow(c, y); + const float* row_blur = tmp_out.Row(y + pad2); + for (size_t x = 0; x < summed.xsize(); x += Lanes(df)) { + const auto b = Load(df, row_blur + x + pad2); + const auto o = Load(df, row + x) * unblurred_multiplier; + const auto m = IfThenElse(b > o, b, o); + Store(m, df, row + x); + } + } + } + for (size_t c = 0; c < 3; c++) { + FastGaussian(g2, pooled_pad.Plane(c), pool, &tmp2, &tmp_out); + const auto unblurred_multiplier = Set(df, 0.5f); + for (size_t y = 0; y < pooled.ysize(); y++) { + float* row = pooled.PlaneRow(c, y); + const float* row_blur = tmp_out.Row(y + pad2); + for (size_t x = 0; x < pooled.xsize(); x += Lanes(df)) { + const auto b = Load(df, row_blur + x + pad2); + const auto o = Load(df, row + x) * unblurred_multiplier; + const auto m = IfThenElse(b > o, b, o); + Store(m, df, row + x); + } + } + } + const static float kChannelMul[3] = { + 7.9644294909680253f, + 0.5700000183257159f, + 0.20267448837597055f, + }; + ImageF pooledhf44(pooled.xsize(), pooled.ysize()); + for (size_t y = 0; y < pooled.ysize(); y++) { + const float* row_in_x = pooled.ConstPlaneRow(0, y); + const float* row_in_y = pooled.ConstPlaneRow(1, y); + const float* row_in_b = pooled.ConstPlaneRow(2, y); + float* row_out = pooledhf44.Row(y); + for (size_t x = 0; x < pooled.xsize(); x += Lanes(df)) { + auto v = Set(df, kChannelMul[0]) * Load(df, row_in_x + x); + v = MulAdd(Set(df, kChannelMul[1]), Load(df, row_in_y + x), v); + v = 
MulAdd(Set(df, kChannelMul[2]), Load(df, row_in_b + x), v); + Store(v, df, row_out + x); + } + } + ImageF summedhf44(summed.xsize(), summed.ysize()); + for (size_t y = 0; y < summed.ysize(); y++) { + const float* row_in_x = summed.ConstPlaneRow(0, y); + const float* row_in_y = summed.ConstPlaneRow(1, y); + const float* row_in_b = summed.ConstPlaneRow(2, y); + float* row_out = summedhf44.Row(y); + for (size_t x = 0; x < summed.xsize(); x += Lanes(df)) { + auto v = Set(df, kChannelMul[0]) * Load(df, row_in_x + x); + v = MulAdd(Set(df, kChannelMul[1]), Load(df, row_in_y + x), v); + v = MulAdd(Set(df, kChannelMul[2]), Load(df, row_in_b + x), v); + Store(v, df, row_out + x); + } + } + aux_out->DumpPlaneNormalized("pooledhf44", pooledhf44); + aux_out->DumpPlaneNormalized("summedhf44", summedhf44); + + static const float kDcQuantMul = 0.88170190420916206; + static const float kAcQuantMul = 2.5165738934721524; + + float dc_quant = kDcQuantMul * InitialQuantDC(cparams.butteraugli_distance); + float ac_quant_base = kAcQuantMul / cparams.butteraugli_distance; + ImageF quant_field(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + + static_assert(kColorTileDim == 64, "Fix the code below"); + auto mmacs = [&](size_t bx, size_t by, AcStrategy acs, float& min, + float& max) { + min = 1e10; + max = 0; + for (size_t y = 2 * by; y < 2 * (by + acs.covered_blocks_y()); y++) { + const float* row = summedhf44.Row(y); + for (size_t x = 2 * bx; x < 2 * (bx + acs.covered_blocks_x()); x++) { + min = std::min(min, row[x]); + max = std::max(max, row[x]); + } + } + }; + // Multipliers for allowed range of summedhf44. + std::pair candidates[] = { + // The order is such that, in case of ties, 8x8 is favoured over 4x4 which + // is favoured over 2x2. Similarly, we prefer square transforms over + // same-area rectangular ones. 
+ {AcStrategy::Type::DCT2X2, 1.5f}, + {AcStrategy::Type::DCT4X4, 1.4f}, + {AcStrategy::Type::DCT4X8, 1.2f}, + {AcStrategy::Type::DCT8X4, 1.2f}, + {AcStrategy::Type::AFV0, + 1.15f}, // doesn't really work with these heuristics + {AcStrategy::Type::AFV1, 1.15f}, + {AcStrategy::Type::AFV2, 1.15f}, + {AcStrategy::Type::AFV3, 1.15f}, + {AcStrategy::Type::DCT, 1.0f}, + {AcStrategy::Type::DCT16X8, 0.8f}, + {AcStrategy::Type::DCT8X16, 0.8f}, + {AcStrategy::Type::DCT16X16, 0.2f}, + {AcStrategy::Type::DCT16X32, 0.2f}, + {AcStrategy::Type::DCT32X16, 0.2f}, + {AcStrategy::Type::DCT32X32, 0.2f}, + {AcStrategy::Type::DCT32X64, 0.1f}, + {AcStrategy::Type::DCT64X32, 0.1f}, + {AcStrategy::Type::DCT64X64, 0.04f}, + +#if 0 + {AcStrategy::Type::DCT2X2, 1e+10}, {AcStrategy::Type::DCT4X4, 2.0f}, + {AcStrategy::Type::DCT, 1.0f}, {AcStrategy::Type::DCT16X8, 1.0f}, + {AcStrategy::Type::DCT8X16, 1.0f}, {AcStrategy::Type::DCT32X8, 1.0f}, + {AcStrategy::Type::DCT8X32, 1.0f}, {AcStrategy::Type::DCT32X16, 1.0f}, + {AcStrategy::Type::DCT16X32, 1.0f}, {AcStrategy::Type::DCT64X32, 1.0f}, + {AcStrategy::Type::DCT32X64, 1.0f}, {AcStrategy::Type::DCT16X16, 1.0f}, + {AcStrategy::Type::DCT32X32, 1.0f}, {AcStrategy::Type::DCT64X64, 1.0f}, +#endif + // TODO(veluca): figure out if we want 4x8 and/or AVF. + }; + float max_range = 1e-8f + 0.5f * std::pow(cparams.butteraugli_distance, 0.5f); + // Change quant field and sharpness amounts based on (pooled|summed)hf44, and + // compute block sizes. + // TODO(veluca): maybe this could be done per group: it would allow choosing + // floating blocks better. 
+ RunOnPool( + pool, 0, xcolortiles * ycolortiles, ThreadPool::SkipInit(), + [&](size_t tile_id, size_t _) { + size_t tx = tile_id % xcolortiles; + size_t ty = tile_id / xcolortiles; + size_t x0 = tx * kColorTileDim / kBlockDim; + size_t x1 = std::min(x0 + kColorTileDimInBlocks, quant_field.xsize()); + size_t y0 = ty * kColorTileDim / kBlockDim; + size_t y1 = std::min(y0 + kColorTileDimInBlocks, quant_field.ysize()); + size_t qf_stride = quant_field.PixelsPerRow(); + size_t epf_stride = shared.epf_sharpness.PixelsPerRow(); + bool chosen_mask[64] = {}; + for (size_t y = y0; y < y1; y++) { + uint8_t* epf_row = shared.epf_sharpness.Row(y); + float* qf_row = quant_field.Row(y); + for (size_t x = x0; x < x1; x++) { + if (chosen_mask[(y - y0) * 8 + (x - x0)]) continue; + // Default to DCT8 just in case something funny happens in the loop + // below. + AcStrategy::Type best = AcStrategy::DCT; + size_t best_covered = 1; + float qf = ac_quant_base; + for (size_t i = 0; i < sizeof(candidates) / sizeof(*candidates); + i++) { + AcStrategy acs = AcStrategy::FromRawStrategy(candidates[i].first); + if (y + acs.covered_blocks_y() > y1) continue; + if (x + acs.covered_blocks_x() > x1) continue; + bool fits = true; + for (size_t iy = y; iy < y + acs.covered_blocks_y(); iy++) { + for (size_t ix = x; ix < x + acs.covered_blocks_x(); ix++) { + if (chosen_mask[(iy - y0) * 8 + (ix - x0)]) { + fits = false; + break; + } + } + } + if (!fits) continue; + float min, max; + mmacs(x, y, acs, min, max); + if (max - min > max_range * candidates[i].second) continue; + size_t cb = acs.covered_blocks_x() * acs.covered_blocks_y(); + if (cb >= best_covered) { + best_covered = cb; + best = candidates[i].first; + // TODO(veluca): make this better. 
+ qf = ac_quant_base / + (3.9312946339134007f + 2.6011435675118082f * min); + } + } + shared.ac_strategy.Set(x, y, best); + AcStrategy acs = AcStrategy::FromRawStrategy(best); + for (size_t iy = y; iy < y + acs.covered_blocks_y(); iy++) { + for (size_t ix = x; ix < x + acs.covered_blocks_x(); ix++) { + chosen_mask[(iy - y0) * 8 + (ix - x0)] = 1; + qf_row[ix + (iy - y) * qf_stride] = qf; + } + } + // TODO + for (size_t iy = y; iy < y + acs.covered_blocks_y(); iy++) { + for (size_t ix = x; ix < x + acs.covered_blocks_x(); ix++) { + epf_row[ix + (iy - y) * epf_stride] = 4; + } + } + } + } + }, + "QF+ACS+EPF"); + aux_out->DumpPlaneNormalized("qf", quant_field); + aux_out->DumpPlaneNormalized("epf", shared.epf_sharpness); + DumpAcStrategy(shared.ac_strategy, frame_dim.xsize_padded, + frame_dim.ysize_padded, "acs", aux_out); + + shared.quantizer.SetQuantField(dc_quant, quant_field, + &shared.raw_quant_field); + + return true; +} +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(Heuristics); +Status FastEncoderHeuristics::LossyFrameHeuristics( + PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* linear, Image3F* opsin, ThreadPool* pool, + AuxOut* aux_out) { + return HWY_DYNAMIC_DISPATCH(Heuristics)(enc_state, modular_frame_encoder, + linear, opsin, pool, aux_out); +} + +} // namespace jxl +#endif diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc new file mode 100644 index 0000000000..d4f94c74d7 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.cc @@ -0,0 +1,279 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+#include "lib/jxl/enc_file.h"
+
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "lib/jxl/aux_out.h"
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+namespace {
+
+// DC + 'Very Low Frequency'
+PassDefinition progressive_passes_dc_vlf[] = {
+    {/*num_coefficients=*/2, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/4}};
+
+PassDefinition progressive_passes_dc_lf[] = {
+    {/*num_coefficients=*/2, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/4},
+    {/*num_coefficients=*/3, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/2}};
+
+PassDefinition progressive_passes_dc_lf_salient_ac[] = {
+    {/*num_coefficients=*/2, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/4},
+    {/*num_coefficients=*/3, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/true,
+     /*suitable_for_downsampling_of_at_least=*/0}};
+
+PassDefinition progressive_passes_dc_lf_salient_ac_other_ac[] = {
+    {/*num_coefficients=*/2, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/4},
+    {/*num_coefficients=*/3, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/true,
+     /*suitable_for_downsampling_of_at_least=*/0},
+    {/*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/0}};
+
+PassDefinition progressive_passes_dc_quant_ac_full_ac[] = {
+    {/*num_coefficients=*/8, /*shift=*/1, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/false,
+     /*suitable_for_downsampling_of_at_least=*/0},
+};
+
+constexpr uint16_t kExifOrientationTag = 274;
+
+// Parses the Exif data just enough to extract any render-impacting info.
+// If the Exif data is invalid or could not be parsed, then it is treated
+// as a no-op.
+// TODO (jon): tag 1 can be used to represent Adobe RGB 1998 if it has value
+// "R03"
+// TODO (jon): set intrinsic dimensions according to
+// https://discourse.wicg.io/t/proposal-exif-image-resolution-auto-and-from-image/4326/24
+void InterpretExif(const PaddedBytes& exif, CodecMetadata* metadata) {
+  if (exif.size() < 12) return;  // not enough bytes for a valid exif blob
+  const uint8_t* t = exif.data();
+  bool bigendian = false;
+  if (LoadLE32(t) == 0x2A004D4D) {  // bytes "MM\0*"
+    bigendian = true;
+  } else if (LoadLE32(t) != 0x002A4949) {  // bytes "II*\0"
+    return;  // not a valid tiff header
+  }
+  t += 4;
+  uint32_t offset = (bigendian ? LoadBE32(t) : LoadLE32(t));  // untrusted
+  if (exif.size() < 12 + uint64_t{offset} + 2 || offset < 8) return;  // no wrap
+  t += offset - 4;  // t now points at blob start + offset
+  uint16_t nb_tags = (bigendian ? LoadBE16(t) : LoadLE16(t));
+  t += 2;
+  while (nb_tags > 0) {
+    if (t + 12 >= exif.data() + exif.size()) return;
+    uint16_t tag = (bigendian ? LoadBE16(t) : LoadLE16(t));
+    t += 2;
+    uint16_t type = (bigendian ? LoadBE16(t) : LoadLE16(t));
+    t += 2;
+    uint32_t count = (bigendian ? LoadBE32(t) : LoadLE32(t));
+    t += 4;
+    uint16_t value = (bigendian ? LoadBE16(t) : LoadLE16(t));
+    t += 4;  // the value field is 4 bytes wide; only 16 bits were read
+    if (tag == kExifOrientationTag) {
+      if (type == 3 && count == 1) {
+        if (value >= 1 && value <= 8) {
+          metadata->m.orientation = value;
+        }
+      }
+    }
+    nb_tags--;
+  }
+}
+
+Status PrepareCodecMetadataFromIO(const CompressParams& cparams,
+                                  const CodecInOut* io,
+                                  CodecMetadata* metadata) {
+  *metadata = io->metadata;  // start from a copy of the input metadata
+  size_t ups = 1;
+  if (cparams.already_downsampled) ups = cparams.resampling;
+
+  JXL_RETURN_IF_ERROR(metadata->size.Set(io->xsize() * ups, io->ysize() * ups));
+
+  // Keep ICC profile in lossless modes because a reconstructed profile may be
+  // slightly different (quantization).
+  // Also keep ICC in JPEG reconstruction mode as we need byte-exact profiles.
+  const bool lossless_modular =
+      cparams.modular_mode && cparams.quality_pair.first == 100.0f;
+  if (!lossless_modular && !io->Main().IsJPEG()) {
+    metadata->m.color_encoding.DecideIfWantICC();
+  }
+
+  metadata->m.xyb_encoded =
+      cparams.color_transform == ColorTransform::kXYB ? true : false;
+
+  InterpretExif(io->blobs.exif, metadata);
+
+  return true;
+}
+
+}  // namespace
+
+Status EncodePreview(const CompressParams& cparams, const ImageBundle& ib,
+                     const CodecMetadata* metadata, ThreadPool* pool,
+                     BitWriter* JXL_RESTRICT writer) {
+  BitWriter preview_writer;
+  // TODO(janwas): also support generating preview by downsampling
+  if (ib.HasColor()) {
+    AuxOut aux_out;
+    PassesEncoderState passes_enc_state;
+    // TODO(lode): check if we want all extra channels and matching xyb_encoded
+    // for the preview, such that using the main ImageMetadata object for
+    // encoding this frame is warranted.
+    FrameInfo frame_info;
+    frame_info.is_preview = true;
+    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, frame_info, metadata, ib,
+                                    &passes_enc_state, pool, &preview_writer,
+                                    &aux_out));
+    preview_writer.ZeroPadToByte();
+  }
+
+  if (preview_writer.BitsWritten() != 0) {  // nothing is emitted otherwise
+    writer->ZeroPadToByte();
+    writer->AppendByteAligned(preview_writer);
+  }
+
+  return true;
+}
+
+Status WriteHeaders(CodecMetadata* metadata, BitWriter* writer,
+                    AuxOut* aux_out) {
+  // Marker/signature
+  BitWriter::Allotment allotment(writer, 16);
+  writer->Write(8, 0xFF);
+  writer->Write(8, kCodestreamMarker);
+  ReclaimAndCharge(writer, &allotment, kLayerHeader, aux_out);
+
+  JXL_RETURN_IF_ERROR(
+      WriteSizeHeader(metadata->size, writer, kLayerHeader, aux_out));
+
+  JXL_RETURN_IF_ERROR(
+      WriteImageMetadata(metadata->m, writer, kLayerHeader, aux_out));
+
+  metadata->transform_data.nonserialized_xyb_encoded = metadata->m.xyb_encoded;
+  JXL_RETURN_IF_ERROR(
+      Bundle::Write(metadata->transform_data, writer, kLayerHeader, aux_out));
+
+  return true;
+}
+
+Status EncodeFile(const CompressParams& cparams_orig, const CodecInOut* io,
+                  PassesEncoderState* passes_enc_state, PaddedBytes* compressed,
+                  AuxOut* aux_out, ThreadPool* pool, std::string xclbinPath) {
+  io->CheckMetadata();
+  BitWriter writer;
+
+  CompressParams cparams = cparams_orig;  // local copy; may be adjusted below
+  if (io->Main().color_transform != ColorTransform::kNone) {
+    // Set the color transform to YCbCr or XYB if the original image is such.
+    cparams.color_transform = io->Main().color_transform;
+  }
+
+  std::unique_ptr<CodecMetadata> metadata = jxl::make_unique<CodecMetadata>();
+  JXL_RETURN_IF_ERROR(PrepareCodecMetadataFromIO(cparams, io, metadata.get()));
+  JXL_RETURN_IF_ERROR(WriteHeaders(metadata.get(), &writer, aux_out));
+
+  // Only send ICC (at least several hundred bytes) if fields aren't enough.
+  if (metadata->m.color_encoding.WantICC()) {
+    JXL_RETURN_IF_ERROR(WriteICC(metadata->m.color_encoding.ICC(), &writer,
+                                 kLayerHeader, aux_out));
+  }
+
+  if (metadata->m.have_preview) {
+    JXL_RETURN_IF_ERROR(EncodePreview(cparams, io->preview_frame,
+                                      metadata.get(), pool, &writer));
+  }
+
+  // Each frame should start on byte boundaries.
+  writer.ZeroPadToByte();
+
+  if (cparams.progressive_mode || cparams.qprogressive_mode) {
+    if (cparams.saliency_map != nullptr) {
+      passes_enc_state->progressive_splitter.SetSaliencyMap(
+          cparams.saliency_map);
+    }
+    passes_enc_state->progressive_splitter.SetSaliencyThreshold(
+        cparams.saliency_threshold);
+    if (cparams.qprogressive_mode) {
+      passes_enc_state->progressive_splitter.SetProgressiveMode(
+          ProgressiveMode{progressive_passes_dc_quant_ac_full_ac});
+    } else {
+      switch (cparams.saliency_num_progressive_steps) {
+        case 1:
+          passes_enc_state->progressive_splitter.SetProgressiveMode(
+              ProgressiveMode{progressive_passes_dc_vlf});
+          break;
+        case 2:
+          passes_enc_state->progressive_splitter.SetProgressiveMode(
+              ProgressiveMode{progressive_passes_dc_lf});
+          break;
+        case 3:
+          passes_enc_state->progressive_splitter.SetProgressiveMode(
+              ProgressiveMode{progressive_passes_dc_lf_salient_ac});
+          break;
+        case 4:
+          if (cparams.saliency_threshold == 0.0f) {
+            // No need for a 4th pass if saliency-threshold regards everything
+            // as salient.
+            passes_enc_state->progressive_splitter.SetProgressiveMode(
+                ProgressiveMode{progressive_passes_dc_lf_salient_ac});
+          } else {
+            passes_enc_state->progressive_splitter.SetProgressiveMode(
+                ProgressiveMode{progressive_passes_dc_lf_salient_ac_other_ac});
+          }
+          break;
+        default:
+          return JXL_FAILURE("Invalid saliency_num_progressive_steps.");
+      }
+    }
+  }
+  for (size_t i = 0; i < io->frames.size(); i++) {
+    FrameInfo info;
+    info.is_last = i == io->frames.size() - 1;
+    if (io->frames[i].use_for_next_frame) {
+      info.save_as_reference = 1;
+    }
+    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, info, metadata.get(),
+                                    io->frames[i], passes_enc_state, pool,
+                                    &writer, aux_out, xclbinPath));
+  }
+
+  // Clean up passes_enc_state in case it gets reused.
+  for (size_t i = 0; i < 4; i++) {
+    passes_enc_state->shared.dc_frames[i] = Image3F();
+    passes_enc_state->shared.reference_frames[i].storage = ImageBundle();
+  }
+
+  *compressed = std::move(writer).TakeBytes();
+  return true;
+}
+
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.h
new file mode 100644
index 0000000000..12b5c37b4b
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_file.h
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+Status EncodePreview(const CompressParams& cparams, const ImageBundle& ib, + const CodecMetadata* metadata, ThreadPool* pool, + BitWriter* JXL_RESTRICT writer); + +// Write headers from the CodecMetadata. Also may modify nonserialized_... +// fields of the metadata. +Status WriteHeaders(CodecMetadata* metadata, BitWriter* writer, + AuxOut* aux_out); + +// Compresses pixels from `io` (given in any ColorEncoding). +// `io->metadata.m.original` must be set. +Status EncodeFile(const CompressParams& params, const CodecInOut* io, + PassesEncoderState* passes_enc_state, PaddedBytes* compressed, + AuxOut* aux_out = nullptr, ThreadPool* pool = nullptr, + std::string xclbinPath = ""); + +// Backwards-compatible interface. Don't use in new code. +// TODO(deymo): Remove this function once we migrate users to C encoder API. +struct FrameEncCache {}; +JXL_INLINE Status EncodeFile(const CompressParams& params, const CodecInOut* io, + FrameEncCache* /* unused */, + PaddedBytes* compressed, AuxOut* aux_out = nullptr, + ThreadPool* pool = nullptr, std::string xclbinPath = "") { + PassesEncoderState passes_enc_state; + return EncodeFile(params, io, &passes_enc_state, compressed, aux_out, pool); +} + +} // namespace jxl + +#endif // LIB_JXL_ENC_FILE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_frame.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_frame.cc new file mode 100644 index 0000000000..40e40d8d59 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_frame.cc @@ -0,0 +1,1418 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_frame.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/enc_adaptive_quantization.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_chroma_from_luma.h" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/enc_context_map.h" +#include "lib/jxl/enc_entropy_coder.h" +#include "lib/jxl/enc_group.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_noise.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/enc_toc.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" +#include "lib/jxl/toc.h" + +namespace jxl { +namespace { + +void ClusterGroups(PassesEncoderState* enc_state) { + if (enc_state->shared.frame_header.passes.num_passes > 1) { + // TODO(veluca): implement this for 
progressive modes. + return; + } + // This only considers pass 0 for now. + std::vector context_map; + EntropyEncodingData codes; + auto& ac = enc_state->passes[0].ac_tokens; + size_t limit = std::ceil(std::sqrt(ac.size())); + if (limit == 1) return; + size_t num_contexts = enc_state->shared.block_ctx_map.NumACContexts(); + std::vector costs(ac.size()); + HistogramParams params; + params.uint_method = HistogramParams::HybridUintMethod::kNone; + params.lz77_method = HistogramParams::LZ77Method::kNone; + params.ans_histogram_strategy = + HistogramParams::ANSHistogramStrategy::kApproximate; + size_t max = 0; + auto token_cost = [&](std::vector>& tokens, size_t num_ctx, + bool estimate = true) { + // TODO(veluca): not estimating is very expensive. + BitWriter writer; + size_t c = BuildAndEncodeHistograms( + params, num_ctx, tokens, &codes, &context_map, + estimate ? nullptr : &writer, 0, /*aux_out=*/0); + if (estimate) return c; + for (size_t i = 0; i < tokens.size(); i++) { + WriteTokens(tokens[i], codes, context_map, &writer, 0, nullptr); + } + return writer.BitsWritten(); + }; + for (size_t i = 0; i < ac.size(); i++) { + std::vector> tokens{ac[i]}; + costs[i] = + token_cost(tokens, enc_state->shared.block_ctx_map.NumACContexts()); + if (costs[i] > costs[max]) { + max = i; + } + } + auto dist = [&](int i, int j) { + std::vector> tokens{ac[i], ac[j]}; + return token_cost(tokens, num_contexts) - costs[i] - costs[j]; + }; + std::vector out{max}; + std::vector old_map(ac.size()); + std::vector dists(ac.size()); + size_t farthest = 0; + for (size_t i = 0; i < ac.size(); i++) { + if (i == max) continue; + dists[i] = dist(max, i); + if (dists[i] > dists[farthest]) { + farthest = i; + } + } + + while (dists[farthest] > 0 && out.size() < limit) { + out.push_back(farthest); + dists[farthest] = 0; + enc_state->histogram_idx[farthest] = out.size() - 1; + for (size_t i = 0; i < ac.size(); i++) { + float d = dist(out.back(), i); + if (d < dists[i]) { + dists[i] = d; + old_map[i] = 
enc_state->histogram_idx[i]; + enc_state->histogram_idx[i] = out.size() - 1; + } + if (dists[i] > dists[farthest]) { + farthest = i; + } + } + } + + std::vector remap(out.size()); + std::iota(remap.begin(), remap.end(), 0); + for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) { + enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]]; + } + auto remap_cost = [&](std::vector remap) { + std::vector re_remap(remap.size(), remap.size()); + size_t r = 0; + for (size_t i = 0; i < remap.size(); i++) { + if (re_remap[remap[i]] == remap.size()) { + re_remap[remap[i]] = r++; + } + remap[i] = re_remap[remap[i]]; + } + auto tokens = ac; + size_t max_hist = 0; + for (size_t i = 0; i < tokens.size(); i++) { + for (size_t j = 0; j < tokens[i].size(); j++) { + size_t hist = remap[enc_state->histogram_idx[i]]; + tokens[i][j].context += hist * num_contexts; + max_hist = std::max(hist + 1, max_hist); + } + } + return token_cost(tokens, max_hist * num_contexts, /*estimate=*/false); + }; + + for (size_t src = 0; src < out.size(); src++) { + float cost = remap_cost(remap); + size_t best = src; + for (size_t j = src + 1; j < out.size(); j++) { + if (remap[src] == remap[j]) continue; + auto remap_c = remap; + std::replace(remap_c.begin(), remap_c.end(), remap[src], remap[j]); + float c = remap_cost(remap_c); + if (c < cost) { + best = j; + cost = c; + } + } + if (src != best) { + std::replace(remap.begin(), remap.end(), remap[src], remap[best]); + } + } + std::vector re_remap(remap.size(), remap.size()); + size_t r = 0; + for (size_t i = 0; i < remap.size(); i++) { + if (re_remap[remap[i]] == remap.size()) { + re_remap[remap[i]] = r++; + } + remap[i] = re_remap[remap[i]]; + } + + enc_state->shared.num_histograms = + *std::max_element(remap.begin(), remap.end()) + 1; + for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) { + enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]]; + } + for (size_t i = 0; i < ac.size(); i++) { + for (size_t j = 0; j < 
ac[i].size(); j++) { + ac[i][j].context += enc_state->histogram_idx[i] * num_contexts; + } + } +} + +uint64_t FrameFlagsFromParams(const CompressParams& cparams) { + uint64_t flags = 0; + + const float dist = cparams.butteraugli_distance; + + // We don't add noise at low butteraugli distances because the original + // noise is stored within the compressed image and adding noise makes things + // worse. + if (ApplyOverride(cparams.noise, dist >= kMinButteraugliForNoise) || + cparams.photon_noise_iso > 0) { + flags |= FrameHeader::kNoise; + } + + if (cparams.progressive_dc > 0 && cparams.modular_mode == false) { + flags |= FrameHeader::kUseDcFrame; + } + + return flags; +} + +Status LoopFilterFromParams(const CompressParams& cparams, + FrameHeader* JXL_RESTRICT frame_header) { + LoopFilter* loop_filter = &frame_header->loop_filter; + + // Gaborish defaults to enabled in Hare or slower. + loop_filter->gab = ApplyOverride( + cparams.gaborish, cparams.speed_tier <= SpeedTier::kHare && + frame_header->encoding == FrameEncoding::kVarDCT && + cparams.decoding_speed_tier < 4); + + if (cparams.epf != -1) { + loop_filter->epf_iters = cparams.epf; + } else { + if (frame_header->encoding == FrameEncoding::kModular) { + loop_filter->epf_iters = 0; + } else { + constexpr float kThresholds[3] = {0.7, 1.5, 4.0}; + loop_filter->epf_iters = 0; + if (cparams.decoding_speed_tier < 3) { + for (size_t i = cparams.decoding_speed_tier == 2 ? 1 : 0; i < 3; i++) { + if (cparams.butteraugli_distance >= kThresholds[i]) { + loop_filter->epf_iters++; + } + } + } + } + } + // Strength of EPF in modular mode. + if (frame_header->encoding == FrameEncoding::kModular && + cparams.quality_pair.first < 100) { + // TODO(veluca): this formula is nonsense. 
+ loop_filter->epf_sigma_for_modular = + 20.0f * (1.0f - cparams.quality_pair.first / 100); + } + if (frame_header->encoding == FrameEncoding::kModular && + cparams.lossy_palette) { + loop_filter->epf_sigma_for_modular = 1.0f; + } + + return true; +} + +Status MakeFrameHeader(const CompressParams& cparams, + const ProgressiveSplitter& progressive_splitter, + const FrameInfo& frame_info, const ImageBundle& ib, + FrameHeader* JXL_RESTRICT frame_header) { + frame_header->nonserialized_is_preview = frame_info.is_preview; + frame_header->is_last = frame_info.is_last; + frame_header->save_before_color_transform = + frame_info.save_before_color_transform; + frame_header->frame_type = frame_info.frame_type; + frame_header->name = ib.name; + + progressive_splitter.InitPasses(&frame_header->passes); + + if (cparams.modular_mode) { + frame_header->encoding = FrameEncoding::kModular; + frame_header->group_size_shift = cparams.modular_group_size_shift; + } + + frame_header->chroma_subsampling = ib.chroma_subsampling; + if (ib.IsJPEG()) { + // we are transcoding a JPEG, so we don't get to choose + frame_header->encoding = FrameEncoding::kVarDCT; + frame_header->color_transform = ib.color_transform; + } else { + frame_header->color_transform = cparams.color_transform; + if (!cparams.modular_mode && + (frame_header->chroma_subsampling.MaxHShift() != 0 || + frame_header->chroma_subsampling.MaxVShift() != 0)) { + return JXL_FAILURE( + "Chroma subsampling is not supported in VarDCT mode when not " + "recompressing JPEGs"); + } + } + + frame_header->flags = FrameFlagsFromParams(cparams); + // Noise is not supported in the Modular encoder for now. 
+ if (frame_header->encoding != FrameEncoding::kVarDCT) { + frame_header->UpdateFlag(false, FrameHeader::Flags::kNoise); + } + + JXL_RETURN_IF_ERROR(LoopFilterFromParams(cparams, frame_header)); + + frame_header->dc_level = frame_info.dc_level; + if (frame_header->dc_level > 2) { + // With 3 or more progressive_dc frames, the implementation does not yet + // work, see enc_cache.cc. + return JXL_FAILURE("progressive_dc > 2 is not yet supported"); + } + if (cparams.progressive_dc > 0 && + (cparams.ec_resampling != 1 || cparams.resampling != 1)) { + return JXL_FAILURE("Resampling not supported with DC frames"); + } + if (cparams.resampling != 1 && cparams.resampling != 2 && + cparams.resampling != 4 && cparams.resampling != 8) { + return JXL_FAILURE("Invalid resampling factor"); + } + if (cparams.ec_resampling != 1 && cparams.ec_resampling != 2 && + cparams.ec_resampling != 4 && cparams.ec_resampling != 8) { + return JXL_FAILURE("Invalid ec_resampling factor"); + } + // Resized frames. + if (frame_info.frame_type != FrameType::kDCFrame) { + frame_header->frame_origin = ib.origin; + size_t ups = 1; + if (cparams.already_downsampled) ups = cparams.resampling; + frame_header->frame_size.xsize = ib.xsize() * ups; + frame_header->frame_size.ysize = ib.ysize() * ups; + if (ib.origin.x0 != 0 || ib.origin.y0 != 0 || + frame_header->frame_size.xsize != frame_header->default_xsize() || + frame_header->frame_size.ysize != frame_header->default_ysize()) { + frame_header->custom_size_or_origin = true; + } + } + // Upsampling. + frame_header->upsampling = cparams.resampling; + const std::vector& extra_channels = + frame_header->nonserialized_metadata->m.extra_channel_info; + frame_header->extra_channel_upsampling.clear(); + frame_header->extra_channel_upsampling.resize(extra_channels.size(), + cparams.ec_resampling); + frame_header->save_as_reference = frame_info.save_as_reference; + + // Set blending-related information. 
+ if (ib.blend || frame_header->custom_size_or_origin) { + // Set blend_channel to the first alpha channel. These values are only + // encoded in case a blend mode involving alpha is used and there are more + // than one extra channels. + size_t index = 0; + if (extra_channels.size() > 1) { + for (size_t i = 0; i < extra_channels.size(); i++) { + if (extra_channels[i].type == ExtraChannel::kAlpha) { + index = i; + break; + } + } + } + frame_header->blending_info.alpha_channel = index; + frame_header->blending_info.mode = + ib.blend ? ib.blendmode : BlendMode::kReplace; + // previous frames are saved with ID 1. + frame_header->blending_info.source = 1; + for (size_t i = 0; i < extra_channels.size(); i++) { + frame_header->extra_channel_blending_info[i].alpha_channel = index; + BlendMode default_blend = ib.blendmode; + if (extra_channels[i].type != ExtraChannel::kBlack && i != index) { + // K needs to be blended, spot colors and other stuff gets added + default_blend = BlendMode::kAdd; + } + frame_header->extra_channel_blending_info[i].mode = + ib.blend ? default_blend : BlendMode::kReplace; + frame_header->extra_channel_blending_info[i].source = 1; + } + } + + frame_header->animation_frame.duration = ib.duration; + + // TODO(veluca): timecode. + + return true; +} + +// Invisible (alpha = 0) pixels tend to be a mess in optimized PNGs. +// Since they have no visual impact whatsoever, we can replace them with +// something that compresses better and reduces artifacts near the edges. This +// does some kind of smooth stuff that seems to work. +// Replace invisible pixels with a weighted average of the pixel to the left, +// the pixel to the topright, and non-invisible neighbours. +// Produces downward-blurry smears, with in the upwards direction only a 1px +// edge duplication but not more. It would probably be better to smear in all +// directions. That requires an alpha-weighed convolution with a large enough +// kernel though, which might be overkill... 
+void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + float* JXL_RESTRICT row = image->PlaneRow(c, y); + const float* JXL_RESTRICT prow = + (y > 0 ? image->PlaneRow(c, y - 1) : nullptr); + const float* JXL_RESTRICT nrow = + (y + 1 < image->ysize() ? image->PlaneRow(c, y + 1) : nullptr); + const float* JXL_RESTRICT a = alpha.Row(y); + const float* JXL_RESTRICT pa = (y > 0 ? alpha.Row(y - 1) : nullptr); + const float* JXL_RESTRICT na = + (y + 1 < image->ysize() ? alpha.Row(y + 1) : nullptr); + for (size_t x = 0; x < image->xsize(); ++x) { + if (a[x] == 0) { + if (lossless) { + row[x] = 0; + continue; + } + float d = 0.f; + row[x] = 0; + if (x > 0) { + row[x] += row[x - 1]; + d++; + if (a[x - 1] > 0.f) { + row[x] += row[x - 1]; + d++; + } + } + if (x + 1 < image->xsize()) { + if (y > 0) { + row[x] += prow[x + 1]; + d++; + } + if (a[x + 1] > 0.f) { + row[x] += 2.f * row[x + 1]; + d += 2.f; + } + if (y > 0 && pa[x + 1] > 0.f) { + row[x] += 2.f * prow[x + 1]; + d += 2.f; + } + if (y + 1 < image->ysize() && na[x + 1] > 0.f) { + row[x] += 2.f * nrow[x + 1]; + d += 2.f; + } + } + if (y > 0 && pa[x] > 0.f) { + row[x] += 2.f * prow[x]; + d += 2.f; + } + if (y + 1 < image->ysize() && na[x] > 0.f) { + row[x] += 2.f * nrow[x]; + d += 2.f; + } + if (d > 1.f) row[x] /= d; + } + } + } + } +} + +} // namespace + +class LossyFrameEncoder { + public: + LossyFrameEncoder(const CompressParams& cparams, + const FrameHeader& frame_header, + PassesEncoderState* JXL_RESTRICT enc_state, + ThreadPool* pool, AuxOut* aux_out) + : enc_state_(enc_state), pool_(pool), aux_out_(aux_out) { + JXL_CHECK(InitializePassesSharedState(frame_header, &enc_state_->shared, + /*encoder=*/true)); + enc_state_->cparams = cparams; + enc_state_->passes.clear(); + } + + Status ComputeEncodingData(const ImageBundle* linear, + Image3F* JXL_RESTRICT opsin, ThreadPool* pool, + ModularFrameEncoder* 
modular_frame_encoder, + BitWriter* JXL_RESTRICT writer, + FrameHeader* frame_header) { + PROFILER_ZONE("ComputeEncodingData uninstrumented"); + JXL_ASSERT((opsin->xsize() % kBlockDim) == 0 && + (opsin->ysize() % kBlockDim) == 0); + PassesSharedState& shared = enc_state_->shared; + + if (!enc_state_->cparams.max_error_mode) { + float x_qm_scale_steps[3] = {0.65f, 1.25f, 9.0f}; + shared.frame_header.x_qm_scale = 1; + for (float x_qm_scale_step : x_qm_scale_steps) { + if (enc_state_->cparams.butteraugli_distance > x_qm_scale_step) { + shared.frame_header.x_qm_scale++; + } + } + } + + JXL_RETURN_IF_ERROR(enc_state_->heuristics->LossyFrameHeuristics( + enc_state_, modular_frame_encoder, linear, opsin, pool_, aux_out_)); + + InitializePassesEncoder(*opsin, pool_, enc_state_, modular_frame_encoder, + aux_out_); + + enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses()); + for (PassesEncoderState::PassData& pass : enc_state_->passes) { + pass.ac_tokens.resize(shared.frame_dim.num_groups); + } + + ComputeAllCoeffOrders(shared.frame_dim); + shared.num_histograms = 1; + + const auto tokenize_group_init = [&](const size_t num_threads) { + group_caches_.resize(num_threads); + return true; + }; + const auto tokenize_group = [&](const int group_index, const int thread) { + // Tokenize coefficients. + const Rect rect = shared.BlockGroupRect(group_index); + for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); + idx_pass++) { + JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32); + const int32_t* JXL_RESTRICT ac_rows[3] = { + enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32, + }; + // Ensure group cache is initialized. 
+ group_caches_[thread].InitOnce(); + TokenizeCoefficients( + &shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, + ac_rows, shared.ac_strategy, frame_header->chroma_subsampling, + &group_caches_[thread].num_nzeroes, + &enc_state_->passes[idx_pass].ac_tokens[group_index], + enc_state_->shared.quant_dc, enc_state_->shared.raw_quant_field, + enc_state_->shared.block_ctx_map); + } + }; + RunOnPool(pool_, 0, shared.frame_dim.num_groups, tokenize_group_init, + tokenize_group, "TokenizeGroup"); + + *frame_header = shared.frame_header; + return true; + } + + Status ComputeJPEGTranscodingData(const jpeg::JPEGData& jpeg_data, + ModularFrameEncoder* modular_frame_encoder, + FrameHeader* frame_header) { + PROFILER_ZONE("ComputeJPEGTranscodingData uninstrumented"); + PassesSharedState& shared = enc_state_->shared; + + frame_header->x_qm_scale = 2; + frame_header->b_qm_scale = 2; + + FrameDimensions frame_dim = frame_header->ToFrameDimensions(); + + const size_t xsize = frame_dim.xsize_padded; + const size_t ysize = frame_dim.ysize_padded; + const size_t xsize_blocks = frame_dim.xsize_blocks; + const size_t ysize_blocks = frame_dim.ysize_blocks; + + // no-op chroma from luma + shared.cmap = ColorCorrelationMap(xsize, ysize, false); + shared.ac_strategy.FillDCT8(); + FillImage(uint8_t(0), &shared.epf_sharpness); + + enc_state_->coeffs.clear(); + enc_state_->coeffs.emplace_back(make_unique>( + kGroupDim * kGroupDim, frame_dim.num_groups)); + + // convert JPEG quantization table to a Quantizer object + float dcquantization[3]; + std::vector qe(DequantMatrices::kNum, + QuantEncoding::Library(0)); + + auto jpeg_c_map = JpegOrder(frame_header->color_transform, + jpeg_data.components.size() == 1); + + std::vector qt(192); + for (size_t c = 0; c < 3; c++) { + size_t jpeg_c = jpeg_c_map[c]; + const int* quant = + jpeg_data.quant[jpeg_data.components[jpeg_c].quant_idx].values.data(); + + dcquantization[c] = 255 * 8.0f / quant[0]; + for (size_t y = 0; y < 8; y++) { + for 
(size_t x = 0; x < 8; x++) { + // JPEG XL transposes the DCT, JPEG doesn't. + qt[c * 64 + 8 * x + y] = quant[8 * y + x]; + } + } + } + DequantMatricesSetCustomDC(&shared.matrices, dcquantization); + float dcquantization_r[3] = {1.0f / dcquantization[0], + 1.0f / dcquantization[1], + 1.0f / dcquantization[2]}; + + qe[AcStrategy::Type::DCT] = QuantEncoding::RAW(qt); + DequantMatricesSetCustom(&shared.matrices, qe, modular_frame_encoder); + + // Ensure that InvGlobalScale() is 1. + shared.quantizer = Quantizer(&shared.matrices, 1, kGlobalScaleDenom); + // Recompute MulDC() and InvMulDC(). + shared.quantizer.RecomputeFromGlobalScale(); + + // Per-block dequant scaling should be 1. + FillImage(static_cast(shared.quantizer.InvGlobalScale()), + &shared.raw_quant_field); + + std::vector scaled_qtable(192); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 64; i++) { + scaled_qtable[64 * c + i] = + (1 << kCFLFixedPointPrecision) * qt[64 + i] / qt[64 * c + i]; + } + } + + auto jpeg_row = [&](size_t c, size_t y) { + return jpeg_data.components[jpeg_c_map[c]].coeffs.data() + + jpeg_data.components[jpeg_c_map[c]].width_in_blocks * + kDCTBlockSize * y; + }; + + Image3F dc = Image3F(xsize_blocks, ysize_blocks); + bool DCzero = + (shared.frame_header.color_transform == ColorTransform::kYCbCr); + // Compute chroma-from-luma for AC (doesn't seem to be useful for DC) + if (frame_header->chroma_subsampling.Is444() && + enc_state_->cparams.force_cfl_jpeg_recompression && + jpeg_data.components.size() == 3) { + for (size_t c : {0, 2}) { + ImageSB* map = (c == 0 ? &shared.cmap.ytox_map : &shared.cmap.ytob_map); + const float kScale = kDefaultColorFactor; + const int kOffset = 127; + const float kBase = + c == 0 ? 
shared.cmap.YtoXRatio(0) : shared.cmap.YtoBRatio(0); + const float kZeroThresh = + kScale * kZeroBiasDefault[c] * + 0.9999f; // just epsilon less for better rounding + + auto process_row = [&](int task, int thread) { + size_t ty = task; + int8_t* JXL_RESTRICT row_out = map->Row(ty); + for (size_t tx = 0; tx < map->xsize(); ++tx) { + const size_t y0 = ty * kColorTileDimInBlocks; + const size_t x0 = tx * kColorTileDimInBlocks; + const size_t y1 = std::min(frame_dim.ysize_blocks, + (ty + 1) * kColorTileDimInBlocks); + const size_t x1 = std::min(frame_dim.xsize_blocks, + (tx + 1) * kColorTileDimInBlocks); + int32_t d_num_zeros[257] = {0}; + // TODO(veluca): this needs SIMD + fixed point adaptation, and/or + // conversion to the new CfL algorithm. + for (size_t y = y0; y < y1; ++y) { + const int16_t* JXL_RESTRICT row_m = jpeg_row(1, y); + const int16_t* JXL_RESTRICT row_s = jpeg_row(c, y); + for (size_t x = x0; x < x1; ++x) { + for (size_t coeffpos = 1; coeffpos < kDCTBlockSize; + coeffpos++) { + const float scaled_m = + row_m[x * kDCTBlockSize + coeffpos] * + scaled_qtable[64 * c + coeffpos] * + (1.0f / (1 << kCFLFixedPointPrecision)); + const float scaled_s = + kScale * row_s[x * kDCTBlockSize + coeffpos] + + (kOffset - kBase * kScale) * scaled_m; + if (std::abs(scaled_m) > 1e-8f) { + float from, to; + if (scaled_m > 0) { + from = (scaled_s - kZeroThresh) / scaled_m; + to = (scaled_s + kZeroThresh) / scaled_m; + } else { + from = (scaled_s + kZeroThresh) / scaled_m; + to = (scaled_s - kZeroThresh) / scaled_m; + } + if (from < 0.0f) { + from = 0.0f; + } + if (to > 255.0f) { + to = 255.0f; + } + // Instead of clamping the both values + // we just check that range is sane. 
+ if (from <= to) { + d_num_zeros[static_cast(std::ceil(from))]++; + d_num_zeros[static_cast(std::floor(to + 1))]--; + } + } + } + } + } + int best = 0; + int32_t best_sum = 0; + FindIndexOfSumMaximum(d_num_zeros, 256, &best, &best_sum); + int32_t offset_sum = 0; + for (int i = 0; i < 256; ++i) { + if (i <= kOffset) { + offset_sum += d_num_zeros[i]; + } + } + row_out[tx] = 0; + if (best_sum > offset_sum + 1) { + row_out[tx] = best - kOffset; + } + } + }; + + RunOnPool(pool_, 0, map->ysize(), ThreadPool::SkipInit(), process_row, + "FindCorrelation"); + } + } + if (!frame_header->chroma_subsampling.Is444()) { + ZeroFillImage(&dc); + enc_state_->coeffs[0]->ZeroFill(); + } + // JPEG DC is from -1024 to 1023. + std::vector dc_counts[3] = {}; + dc_counts[0].resize(2048); + dc_counts[1].resize(2048); + dc_counts[2].resize(2048); + size_t total_dc[3] = {}; + for (size_t c : {1, 0, 2}) { + if (jpeg_data.components.size() == 1 && c != 1) { + enc_state_->coeffs[0]->ZeroFillPlane(c); + ZeroFillImage(&dc.Plane(c)); + // Ensure no division by 0. + dc_counts[c][1024] = 1; + total_dc[c] = 1; + continue; + } + size_t hshift = frame_header->chroma_subsampling.HShift(c); + size_t vshift = frame_header->chroma_subsampling.VShift(c); + ImageSB& map = (c == 0 ? 
shared.cmap.ytox_map : shared.cmap.ytob_map); + for (size_t group_index = 0; group_index < frame_dim.num_groups; + group_index++) { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + size_t offset = 0; + int32_t* JXL_RESTRICT ac = + enc_state_->coeffs[0]->PlaneRow(c, group_index, 0).ptr32; + for (size_t by = gy * kGroupDimInBlocks; + by < ysize_blocks && by < (gy + 1) * kGroupDimInBlocks; ++by) { + if ((by >> vshift) << vshift != by) continue; + const int16_t* JXL_RESTRICT inputjpeg = jpeg_row(c, by >> vshift); + const int16_t* JXL_RESTRICT inputjpegY = jpeg_row(1, by); + float* JXL_RESTRICT fdc = dc.PlaneRow(c, by >> vshift); + const int8_t* JXL_RESTRICT cm = + map.ConstRow(by / kColorTileDimInBlocks); + for (size_t bx = gx * kGroupDimInBlocks; + bx < xsize_blocks && bx < (gx + 1) * kGroupDimInBlocks; ++bx) { + if ((bx >> hshift) << hshift != bx) continue; + size_t base = (bx >> hshift) * kDCTBlockSize; + int idc; + if (DCzero) { + idc = inputjpeg[base]; + } else { + idc = inputjpeg[base] + 1024 / qt[c * 64]; + } + dc_counts[c][std::min(static_cast(idc + 1024), + uint32_t(2047))]++; + total_dc[c]++; + fdc[bx >> hshift] = idc * dcquantization_r[c]; + if (c == 1 || !enc_state_->cparams.force_cfl_jpeg_recompression || + !frame_header->chroma_subsampling.Is444()) { + for (size_t y = 0; y < 8; y++) { + for (size_t x = 0; x < 8; x++) { + ac[offset + y * 8 + x] = inputjpeg[base + x * 8 + y]; + } + } + } else { + const int32_t scale = + shared.cmap.RatioJPEG(cm[bx / kColorTileDimInBlocks]); + + for (size_t y = 0; y < 8; y++) { + for (size_t x = 0; x < 8; x++) { + int Y = inputjpegY[kDCTBlockSize * bx + x * 8 + y]; + int QChroma = inputjpeg[kDCTBlockSize * bx + x * 8 + y]; + // Fixed-point multiply of CfL scale with quant table ratio + // first, and Y value second. 
+ int coeff_scale = (scale * scaled_qtable[64 * c + y * 8 + x] + + (1 << (kCFLFixedPointPrecision - 1))) >> + kCFLFixedPointPrecision; + int cfl_factor = (Y * coeff_scale + + (1 << (kCFLFixedPointPrecision - 1))) >> + kCFLFixedPointPrecision; + int QCR = QChroma - cfl_factor; + ac[offset + y * 8 + x] = QCR; + } + } + } + offset += 64; + } + } + } + } + + auto& dct = enc_state_->shared.block_ctx_map.dc_thresholds; + auto& num_dc_ctxs = enc_state_->shared.block_ctx_map.num_dc_ctxs; + enc_state_->shared.block_ctx_map.num_dc_ctxs = 1; + for (size_t i = 0; i < 3; i++) { + dct[i].clear(); + int num_thresholds = (CeilLog2Nonzero(total_dc[i]) - 10) / 2; + // up to 3 buckets per channel: + // dark/medium/bright, yellow/unsat/blue, green/unsat/red + num_thresholds = std::min(std::max(num_thresholds, 0), 2); + size_t cumsum = 0; + size_t cut = total_dc[i] / (num_thresholds + 1); + for (int j = 0; j < 2048; j++) { + cumsum += dc_counts[i][j]; + if (cumsum > cut) { + dct[i].push_back(j - 1025); + cut = total_dc[i] * (dct[i].size() + 1) / (num_thresholds + 1); + } + } + num_dc_ctxs *= dct[i].size() + 1; + } + + auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map; + ctx_map.clear(); + ctx_map.resize(3 * kNumOrders * num_dc_ctxs, 0); + + int lbuckets = (dct[1].size() + 1); + for (size_t i = 0; i < num_dc_ctxs; i++) { + // up to 9 contexts for luma + ctx_map[i] = i / lbuckets; + // up to 3 contexts for chroma + ctx_map[kNumOrders * num_dc_ctxs + i] = + num_dc_ctxs / lbuckets + (i % lbuckets); + ctx_map[2 * kNumOrders * num_dc_ctxs + i] = + num_dc_ctxs / lbuckets + (i % lbuckets); + } + enc_state_->shared.block_ctx_map.num_ctxs = + *std::max_element(ctx_map.begin(), ctx_map.end()) + 1; + + enc_state_->histogram_idx.resize(shared.frame_dim.num_groups); + + // disable DC frame for now + shared.frame_header.UpdateFlag(false, FrameHeader::kUseDcFrame); + auto compute_dc_coeffs = [&](int group_index, int /* thread */) { + modular_frame_encoder->AddVarDCTDC(dc, group_index, 
/*nl_dc=*/false, + enc_state_); + modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/true, + enc_state_); + }; + RunOnPool(pool_, 0, shared.frame_dim.num_dc_groups, ThreadPool::SkipInit(), + compute_dc_coeffs, "Compute DC coeffs"); + + // Must happen before WriteFrameHeader! + shared.frame_header.UpdateFlag(true, FrameHeader::kSkipAdaptiveDCSmoothing); + + enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses()); + for (PassesEncoderState::PassData& pass : enc_state_->passes) { + pass.ac_tokens.resize(shared.frame_dim.num_groups); + } + + JXL_CHECK(enc_state_->passes.size() == + 1); // skipping coeff splitting so need to have only one pass + + ComputeAllCoeffOrders(frame_dim); + shared.num_histograms = 1; + + const auto tokenize_group_init = [&](const size_t num_threads) { + group_caches_.resize(num_threads); + return true; + }; + const auto tokenize_group = [&](const int group_index, const int thread) { + // Tokenize coefficients. + const Rect rect = shared.BlockGroupRect(group_index); + for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size(); + idx_pass++) { + JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32); + const int32_t* JXL_RESTRICT ac_rows[3] = { + enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32, + enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32, + }; + // Ensure group cache is initialized. 
+ group_caches_[thread].InitOnce(); + TokenizeCoefficients( + &shared.coeff_orders[idx_pass * shared.coeff_order_size], rect, + ac_rows, shared.ac_strategy, frame_header->chroma_subsampling, + &group_caches_[thread].num_nzeroes, + &enc_state_->passes[idx_pass].ac_tokens[group_index], + enc_state_->shared.quant_dc, enc_state_->shared.raw_quant_field, + enc_state_->shared.block_ctx_map); + } + }; + RunOnPool(pool_, 0, shared.frame_dim.num_groups, tokenize_group_init, + tokenize_group, "TokenizeGroup"); + *frame_header = shared.frame_header; + return true; + } + + Status EncodeGlobalDCInfo(const FrameHeader& frame_header, + BitWriter* writer) const { + // Encode quantizer DC and global scale. + JXL_RETURN_IF_ERROR( + enc_state_->shared.quantizer.Encode(writer, kLayerQuant, aux_out_)); + EncodeBlockCtxMap(enc_state_->shared.block_ctx_map, writer, aux_out_); + ColorCorrelationMapEncodeDC(&enc_state_->shared.cmap, writer, kLayerDC, + aux_out_); + return true; + } + + Status EncodeGlobalACInfo(BitWriter* writer, + ModularFrameEncoder* modular_frame_encoder) { + JXL_RETURN_IF_ERROR(DequantMatricesEncode(&enc_state_->shared.matrices, + writer, kLayerDequantTables, + aux_out_, modular_frame_encoder)); + if (enc_state_->cparams.speed_tier <= SpeedTier::kTortoise) { + ClusterGroups(enc_state_); + } + size_t num_histo_bits = + CeilLog2Nonzero(enc_state_->shared.frame_dim.num_groups); + if (num_histo_bits != 0) { + BitWriter::Allotment allotment(writer, num_histo_bits); + writer->Write(num_histo_bits, enc_state_->shared.num_histograms - 1); + ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out_); + } + + for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses(); + i++) { + // Encode coefficient orders. 
+ size_t order_bits = 0; + JXL_RETURN_IF_ERROR(U32Coder::CanEncode( + kOrderEnc, enc_state_->used_orders[i], &order_bits)); + BitWriter::Allotment allotment(writer, order_bits); + JXL_CHECK(U32Coder::Write(kOrderEnc, enc_state_->used_orders[i], writer)); + ReclaimAndCharge(writer, &allotment, kLayerOrder, aux_out_); + EncodeCoeffOrders( + enc_state_->used_orders[i], + &enc_state_->shared + .coeff_orders[i * enc_state_->shared.coeff_order_size], + writer, kLayerOrder, aux_out_); + + // Encode histograms. + HistogramParams hist_params( + enc_state_->cparams.speed_tier, + enc_state_->shared.block_ctx_map.NumACContexts()); + if (enc_state_->cparams.speed_tier > SpeedTier::kTortoise) { + hist_params.lz77_method = HistogramParams::LZ77Method::kNone; + } + if (enc_state_->cparams.decoding_speed_tier >= 1) { + hist_params.max_histograms = 6; + } + BuildAndEncodeHistograms( + hist_params, + enc_state_->shared.num_histograms * + enc_state_->shared.block_ctx_map.NumACContexts(), + enc_state_->passes[i].ac_tokens, &enc_state_->passes[i].codes, + &enc_state_->passes[i].context_map, writer, kLayerAC, aux_out_); + } + + return true; + } + + Status EncodeACGroup(size_t pass, size_t group_index, BitWriter* group_code, + AuxOut* local_aux_out) { + return EncodeGroupTokenizedCoefficients( + group_index, pass, enc_state_->histogram_idx[group_index], *enc_state_, + group_code, local_aux_out); + } + + PassesEncoderState* State() { return enc_state_; } + + private: + void ComputeAllCoeffOrders(const FrameDimensions& frame_dim) { + PROFILER_FUNC; + enc_state_->used_orders.resize( + enc_state_->progressive_splitter.GetNumPasses()); + for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses(); + i++) { + // No coefficient reordering in Falcon or faster. 
+ if (enc_state_->cparams.speed_tier < SpeedTier::kFalcon) { + enc_state_->used_orders[i] = ComputeUsedOrders( + enc_state_->cparams.speed_tier, enc_state_->shared.ac_strategy, + Rect(enc_state_->shared.raw_quant_field)); + } + ComputeCoeffOrder( + enc_state_->cparams.speed_tier, *enc_state_->coeffs[i], + enc_state_->shared.ac_strategy, frame_dim, enc_state_->used_orders[i], + &enc_state_->shared + .coeff_orders[i * enc_state_->shared.coeff_order_size]); + } + } + + template + static inline void FindIndexOfSumMaximum(const V* array, const size_t len, + R* idx, V* sum) { + JXL_ASSERT(len > 0); + V maxval = 0; + V val = 0; + R maxidx = 0; + for (size_t i = 0; i < len; ++i) { + val += array[i]; + if (val > maxval) { + maxval = val; + maxidx = i; + } + } + *idx = maxidx; + *sum = maxval; + } + + PassesEncoderState* JXL_RESTRICT enc_state_; + ThreadPool* pool_; + AuxOut* aux_out_; + std::vector group_caches_; +}; + +Status EncodeFrame(const CompressParams& cparams_orig, + const FrameInfo& frame_info, const CodecMetadata* metadata, + const ImageBundle& ib, PassesEncoderState* passes_enc_state, + ThreadPool* pool, BitWriter* writer, AuxOut* aux_out) { + ib.VerifyMetadata(); + + passes_enc_state->special_frames.clear(); + + CompressParams cparams = cparams_orig; + + if (cparams.progressive_dc < 0) { + if (cparams.progressive_dc != -1) { + return JXL_FAILURE("Invalid progressive DC setting value (%d)", + cparams.progressive_dc); + } + cparams.progressive_dc = 0; + // Enable progressive_dc for lower qualities. 
+ if (cparams.butteraugli_distance >= + kMinButteraugliDistanceForProgressiveDc) { + cparams.progressive_dc = 1; + } + } + if (cparams.ec_resampling < cparams.resampling) { + cparams.ec_resampling = cparams.resampling; + } + if (cparams.resampling > 1) cparams.progressive_dc = 0; + + if (frame_info.dc_level + cparams.progressive_dc > 4) { + return JXL_FAILURE("Too many levels of progressive DC"); + } + + if (cparams.butteraugli_distance != 0 && + cparams.butteraugli_distance < kMinButteraugliDistance) { + return JXL_FAILURE("Butteraugli distance is too low (%f)", + cparams.butteraugli_distance); + } + if (cparams.butteraugli_distance > 0.9f && cparams.modular_mode == false && + cparams.quality_pair.first == 100) { + // in case the color image is lossy, make the alpha slightly lossy too + cparams.quality_pair.first = + std::max(90.f, 99.f - 0.3f * cparams.butteraugli_distance); + } + + if (ib.IsJPEG()) { + cparams.gaborish = Override::kOff; + cparams.epf = 0; + cparams.modular_mode = false; + } + + if (ib.xsize() == 0 || ib.ysize() == 0) return JXL_FAILURE("Empty image"); + + // Assert that this metadata is correctly set up for the compression params, + // this should have been done by enc_file.cc + JXL_ASSERT(metadata->m.xyb_encoded == + (cparams.color_transform == ColorTransform::kXYB)); + std::unique_ptr frame_header = + jxl::make_unique(metadata); + JXL_RETURN_IF_ERROR(MakeFrameHeader(cparams, + passes_enc_state->progressive_splitter, + frame_info, ib, frame_header.get())); + // Check that if the codestream header says xyb_encoded, the color_transform + // matches the requirement. This is checked from the cparams here, even though + // optimally we'd be able to check this against what has actually been written + // in the main codestream header, but since ib is a const object and the data + // written to the main codestream header is (in modified form) in ib, the + // encoder cannot indicate this fact in the ib's metadata. 
+ if (cparams_orig.color_transform == ColorTransform::kXYB) { + if (frame_header->color_transform != ColorTransform::kXYB) { + return JXL_FAILURE( + "The color transform of frames must be xyb if the codestream is xyb " + "encoded"); + } + } else { + if (frame_header->color_transform == ColorTransform::kXYB) { + return JXL_FAILURE( + "The color transform of frames cannot be xyb if the codestream is " + "not xyb encoded"); + } + } + + FrameDimensions frame_dim = frame_header->ToFrameDimensions(); + + const size_t num_groups = frame_dim.num_groups; + + Image3F opsin; + const ColorEncoding& c_linear = ColorEncoding::LinearSRGB(ib.IsGray()); + std::unique_ptr metadata_linear = + jxl::make_unique(); + metadata_linear->xyb_encoded = + (cparams.color_transform == ColorTransform::kXYB); + metadata_linear->color_encoding = c_linear; + ImageBundle linear_storage(metadata_linear.get()); + + std::vector aux_outs; + // LossyFrameEncoder stores a reference to a std::function + // so we need to keep the std::function being referenced + // alive while lossy_frame_encoder is used. We could make resize_aux_outs a + // lambda type by making LossyFrameEncoder a template instead, but this is + // simpler. + const std::function resize_aux_outs = + [&aux_outs, aux_out](size_t num_threads) -> Status { + if (aux_out != nullptr) { + size_t old_size = aux_outs.size(); + for (size_t i = num_threads; i < old_size; i++) { + aux_out->Assimilate(aux_outs[i]); + } + aux_outs.resize(num_threads); + // Each thread needs these INPUTS. Don't copy the entire AuxOut + // because it may contain stats which would be Assimilated multiple + // times below. 
+ for (size_t i = old_size; i < aux_outs.size(); i++) { + aux_outs[i].dump_image = aux_out->dump_image; + aux_outs[i].debug_prefix = aux_out->debug_prefix; + } + } + return true; + }; + + LossyFrameEncoder lossy_frame_encoder(cparams, *frame_header, + passes_enc_state, pool, aux_out); + std::unique_ptr modular_frame_encoder = + jxl::make_unique(*frame_header, cparams); + + const std::vector* extra_channels = &ib.extra_channels(); + std::vector extra_channels_storage; + + if (ib.IsJPEG()) { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeJPEGTranscodingData( + *ib.jpeg_data, modular_frame_encoder.get(), frame_header.get())); + } else if (!lossy_frame_encoder.State()->heuristics->HandlesColorConversion( + cparams, ib) || + frame_header->encoding != FrameEncoding::kVarDCT) { + // Allocating a large enough image avoids a copy when padding. + opsin = + Image3F(RoundUpToBlockDim(ib.xsize()), RoundUpToBlockDim(ib.ysize())); + opsin.ShrinkTo(ib.xsize(), ib.ysize()); + + const bool want_linear = frame_header->encoding == FrameEncoding::kVarDCT && + cparams.speed_tier <= SpeedTier::kKitten; + const ImageBundle* JXL_RESTRICT ib_or_linear = &ib; + + if (frame_header->color_transform == ColorTransform::kXYB && + frame_info.ib_needs_color_transform) { + // linear_storage would only be used by the Butteraugli loop (passing + // linear sRGB avoids a color conversion there). Otherwise, don't + // fill it to reduce memory usage. + ib_or_linear = + ToXYB(ib, pool, &opsin, want_linear ? &linear_storage : nullptr); + } else { // RGB or YCbCr: don't do anything (forward YCbCr is not + // implemented, this is only used when the input is already in + // YCbCr) + // If encoding a special DC or reference frame, don't do anything: + // input is already in XYB. 
+ CopyImageTo(ib.color(), &opsin); + } + bool lossless = (frame_header->encoding == FrameEncoding::kModular && + cparams.quality_pair.first == 100); + if (ib.HasAlpha() && !ib.AlphaIsPremultiplied() && + !ApplyOverride(cparams.keep_invisible, lossless) && + cparams.ec_resampling == cparams.resampling) { + // simplify invisible pixels + SimplifyInvisible(&opsin, ib.alpha(), lossless); + if (want_linear) { + SimplifyInvisible(const_cast(&ib_or_linear->color()), + ib.alpha(), lossless); + } + } + if (aux_out != nullptr) { + JXL_RETURN_IF_ERROR( + aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin)); + } + if (frame_header->encoding == FrameEncoding::kVarDCT) { + PadImageToBlockMultipleInPlace(&opsin); + JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData( + ib_or_linear, &opsin, pool, modular_frame_encoder.get(), writer, + frame_header.get())); + } else if (frame_header->upsampling != 1 && !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. + DownsampleImage(&opsin, frame_header->upsampling); + } + } else { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData( + &ib, &opsin, pool, modular_frame_encoder.get(), writer, + frame_header.get())); + } + if (cparams.ec_resampling != 1 && !cparams.already_downsampled) { + extra_channels = &extra_channels_storage; + for (size_t i = 0; i < ib.extra_channels().size(); i++) { + extra_channels_storage.emplace_back(CopyImage(ib.extra_channels()[i])); + DownsampleImage(&extra_channels_storage.back(), cparams.ec_resampling); + } + } + // needs to happen *AFTER* VarDCT-ComputeEncodingData. 
+ JXL_RETURN_IF_ERROR(modular_frame_encoder->ComputeEncodingData( + *frame_header, *ib.metadata(), &opsin, *extra_channels, + lossy_frame_encoder.State(), pool, aux_out, + /* do_color=*/frame_header->encoding == FrameEncoding::kModular)); + + writer->AppendByteAligned(lossy_frame_encoder.State()->special_frames); + frame_header->UpdateFlag( + lossy_frame_encoder.State()->shared.image_features.patches.HasAny(), + FrameHeader::kPatches); + frame_header->UpdateFlag( + lossy_frame_encoder.State()->shared.image_features.splines.HasAny(), + FrameHeader::kSplines); + JXL_RETURN_IF_ERROR(WriteFrameHeader(*frame_header, writer, aux_out)); + + const size_t num_passes = + passes_enc_state->progressive_splitter.GetNumPasses(); + + // DC global info + DC groups + AC global info + AC groups * + // num_passes. + const bool has_ac_global = true; + std::vector group_codes(NumTocEntries(frame_dim.num_groups, + frame_dim.num_dc_groups, + num_passes, has_ac_global)); + const size_t global_ac_index = frame_dim.num_dc_groups + 1; + const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1; + const auto get_output = [&](const size_t index) { + return &group_codes[is_small_image ? 
0 : index]; + }; + auto ac_group_code = [&](size_t pass, size_t group) { + return get_output(AcGroupIndex(pass, group, frame_dim.num_groups, + frame_dim.num_dc_groups, has_ac_global)); + }; + + if (frame_header->flags & FrameHeader::kPatches) { + PatchDictionaryEncoder::Encode( + lossy_frame_encoder.State()->shared.image_features.patches, + get_output(0), kLayerDictionary, aux_out); + } + + if (frame_header->flags & FrameHeader::kSplines) { + EncodeSplines(lossy_frame_encoder.State()->shared.image_features.splines, + get_output(0), kLayerSplines, HistogramParams(), aux_out); + } + + if (frame_header->flags & FrameHeader::kNoise) { + EncodeNoise(lossy_frame_encoder.State()->shared.image_features.noise_params, + get_output(0), kLayerNoise, aux_out); + } + + JXL_RETURN_IF_ERROR( + DequantMatricesEncodeDC(&lossy_frame_encoder.State()->shared.matrices, + get_output(0), kLayerDequantTables, aux_out)); + if (frame_header->encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR( + lossy_frame_encoder.EncodeGlobalDCInfo(*frame_header, get_output(0))); + } + JXL_RETURN_IF_ERROR( + modular_frame_encoder->EncodeGlobalInfo(get_output(0), aux_out)); + JXL_RETURN_IF_ERROR(modular_frame_encoder->EncodeStream( + get_output(0), aux_out, kLayerModularGlobal, ModularStreamId::Global())); + + const auto process_dc_group = [&](const int group_index, const int thread) { + AuxOut* my_aux_out = aux_out ? 
&aux_outs[thread] : nullptr; + BitWriter* output = get_output(group_index + 1); + if (frame_header->encoding == FrameEncoding::kVarDCT && + !(frame_header->flags & FrameHeader::kUseDcFrame)) { + BitWriter::Allotment allotment(output, 2); + output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]); + ReclaimAndCharge(output, &allotment, kLayerDC, my_aux_out); + JXL_CHECK(modular_frame_encoder->EncodeStream( + output, my_aux_out, kLayerDC, + ModularStreamId::VarDCTDC(group_index))); + } + JXL_CHECK(modular_frame_encoder->EncodeStream( + output, my_aux_out, kLayerModularDcGroup, + ModularStreamId::ModularDC(group_index))); + if (frame_header->encoding == FrameEncoding::kVarDCT) { + const Rect& rect = + lossy_frame_encoder.State()->shared.DCGroupRect(group_index); + size_t nb_bits = CeilLog2Nonzero(rect.xsize() * rect.ysize()); + if (nb_bits != 0) { + BitWriter::Allotment allotment(output, nb_bits); + output->Write(nb_bits, + modular_frame_encoder->ac_metadata_size[group_index] - 1); + ReclaimAndCharge(output, &allotment, kLayerControlFields, my_aux_out); + } + JXL_CHECK(modular_frame_encoder->EncodeStream( + output, my_aux_out, kLayerControlFields, + ModularStreamId::ACMetadata(group_index))); + } + }; + RunOnPool(pool, 0, frame_dim.num_dc_groups, resize_aux_outs, process_dc_group, + "EncodeDCGroup"); + + if (frame_header->encoding == FrameEncoding::kVarDCT) { + JXL_RETURN_IF_ERROR(lossy_frame_encoder.EncodeGlobalACInfo( + get_output(global_ac_index), modular_frame_encoder.get())); + } + + std::atomic num_errors{0}; + const auto process_group = [&](const int group_index, const int thread) { + AuxOut* my_aux_out = aux_out ? 
&aux_outs[thread] : nullptr; + + for (size_t i = 0; i < num_passes; i++) { + if (frame_header->encoding == FrameEncoding::kVarDCT) { + if (!lossy_frame_encoder.EncodeACGroup( + i, group_index, ac_group_code(i, group_index), my_aux_out)) { + num_errors.fetch_add(1, std::memory_order_relaxed); + return; + } + } + // Write all modular encoded data (color?, alpha, depth, extra channels) + if (!modular_frame_encoder->EncodeStream( + ac_group_code(i, group_index), my_aux_out, kLayerModularAcGroup, + ModularStreamId::ModularAC(group_index, i))) { + num_errors.fetch_add(1, std::memory_order_relaxed); + return; + } + } + }; + RunOnPool(pool, 0, num_groups, resize_aux_outs, process_group, + "EncodeGroupCoefficients"); + + // Resizing aux_outs to 0 also Assimilates the array. + static_cast(resize_aux_outs(0)); + JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0); + + for (BitWriter& bw : group_codes) { + bw.ZeroPadToByte(); // end of group. + } + + std::vector* permutation_ptr = nullptr; + std::vector permutation; + if (cparams.centerfirst && !(num_passes == 1 && num_groups == 1)) { + permutation_ptr = &permutation; + // Don't permute global DC/AC or DC. + permutation.resize(global_ac_index + 1); + std::iota(permutation.begin(), permutation.end(), 0); + std::vector ac_group_order(num_groups); + std::iota(ac_group_order.begin(), ac_group_order.end(), 0); + size_t group_dim = frame_dim.group_dim; + + // The center of the image is either given by parameters or chosen + // to be the middle of the image by default if center_x, center_y resp. + // are not provided. 
+ + int64_t imag_cx; + if (cparams.center_x != static_cast(-1)) { + JXL_RETURN_IF_ERROR(cparams.center_x < ib.xsize()); + imag_cx = cparams.center_x; + } else { + imag_cx = ib.xsize() / 2; + } + + int64_t imag_cy; + if (cparams.center_y != static_cast(-1)) { + JXL_RETURN_IF_ERROR(cparams.center_y < ib.ysize()); + imag_cy = cparams.center_y; + } else { + imag_cy = ib.ysize() / 2; + } + + // The center of the group containing the center of the image. + int64_t cx = (imag_cx / group_dim) * group_dim + group_dim / 2; + int64_t cy = (imag_cy / group_dim) * group_dim + group_dim / 2; + // This identifies in what area of the central group the center of the image + // lies in. + double direction = -std::atan2(imag_cy - cy, imag_cx - cx); + // This identifies the side of the central group the center of the image + // lies closest to. This can take values 0, 1, 2, 3 corresponding to left, + // bottom, right, top. + int64_t side = std::fmod((direction + 5 * kPi / 4), 2 * kPi) * 2 / kPi; + auto get_distance_from_center = [&](size_t gid) { + Rect r = passes_enc_state->shared.GroupRect(gid); + int64_t gcx = r.x0() + group_dim / 2; + int64_t gcy = r.y0() + group_dim / 2; + int64_t dx = gcx - cx; + int64_t dy = gcy - cy; + // The angle is determined by taking atan2 and adding an appropriate + // starting point depending on the side we want to start on. + double angle = std::remainder( + std::atan2(dy, dx) + kPi / 4 + side * (kPi / 2), 2 * kPi); + // Concentric squares in clockwise order. 
+ return std::make_pair(std::max(std::abs(dx), std::abs(dy)), angle); + }; + std::sort(ac_group_order.begin(), ac_group_order.end(), + [&](coeff_order_t a, coeff_order_t b) { + return get_distance_from_center(a) < + get_distance_from_center(b); + }); + std::vector inv_ac_group_order(ac_group_order.size(), 0); + for (size_t i = 0; i < ac_group_order.size(); i++) { + inv_ac_group_order[ac_group_order[i]] = i; + } + for (size_t i = 0; i < num_passes; i++) { + size_t pass_start = permutation.size(); + for (coeff_order_t v : inv_ac_group_order) { + permutation.push_back(pass_start + v); + } + } + std::vector new_group_codes(group_codes.size()); + for (size_t i = 0; i < permutation.size(); i++) { + new_group_codes[permutation[i]] = std::move(group_codes[i]); + } + group_codes = std::move(new_group_codes); + } + + JXL_RETURN_IF_ERROR( + WriteGroupOffsets(group_codes, permutation_ptr, writer, aux_out)); + writer->AppendByteAligned(group_codes); + writer->ZeroPadToByte(); // end of frame. + + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_frame.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_frame.h new file mode 100644 index 0000000000..60e1c0ff65 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_frame.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_FRAME_H_ +#define LIB_JXL_ENC_FRAME_H_ + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Information needed for encoding a frame that is not contained elsewhere and +// does not belong to `cparams`. 
+struct FrameInfo {
+  // TODO(veluca): consider adding more parameters, such as custom patches.
+  bool save_before_color_transform = false;
+  // True (the default) if the input image bundle still has to be converted to
+  // the codestream colorspace (as deduced by cparams); false if it already is.
+  // TODO(veluca): this is a hack - ImageBundle doesn't have a simple way to say
+  // "this is already in XYB".
+  bool ib_needs_color_transform = true;
+  FrameType frame_type = FrameType::kRegularFrame;
+  size_t dc_level = 0;
+  // Only used for kRegularFrame.
+  bool is_last = true;
+  bool is_preview = false;
+  // Information for storing this frame for future use (only for non-DC frames).
+  size_t save_as_reference = 0;
+};
+
+// Encodes a single frame (including its header) into a byte stream. Groups may
+// be processed in parallel by `pool`. `metadata` is the ImageMetadata encoded
+// in the codestream and must be used for the FrameHeaders; do not use
+// ib.metadata. NOTE(review): `xclbinPath` is not documented here - confirm use.
+Status EncodeFrame(const CompressParams& cparams_orig,
+                   const FrameInfo& frame_info, const CodecMetadata* metadata,
+                   const ImageBundle& ib, PassesEncoderState* passes_enc_state,
+                   ThreadPool* pool, BitWriter* writer, AuxOut* aux_out,
+                   std::string xclbinPath = "");
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_FRAME_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_gamma_correct.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_gamma_correct.h
new file mode 100644
index 0000000000..0db7012bbe
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_gamma_correct.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_GAMMA_CORRECT_H_
+#define LIB_JXL_ENC_GAMMA_CORRECT_H_
+
+// Deprecated: sRGB transfer function. Use color_management.h instead.
+
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+namespace jxl {
+
+// sRGB-encoded value -> linear value. Input and output are in [0, 1].
+static JXL_INLINE double Srgb8ToLinearDirect(double srgb) {
+  if (srgb <= 0.0) return 0.0;               // clamp inputs below the range
+  if (srgb <= 0.04045) return srgb / 12.92;  // linear segment near zero
+  if (srgb >= 1.0) return 1.0;               // clamp inputs above the range
+  return std::pow((srgb + 0.055) / 1.055, 2.4);
+}
+
+// Linear value -> sRGB-encoded value. Input and output are in [0, 1].
+static JXL_INLINE double LinearToSrgb8Direct(double linear) {
+  if (linear <= 0.0) return 0.0;  // clamp inputs below the range
+  if (linear >= 1.0) return 1.0;  // clamp inputs above the range
+  if (linear <= 0.0031308) return linear * 12.92;  // linear segment near zero
+  return std::pow(linear, 1.0 / 2.4) * 1.055 - 0.055;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_GAMMA_CORRECT_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.cc
new file mode 100644
index 0000000000..91357dc9b7
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.cc
@@ -0,0 +1,342 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_group.h"
+
+#include <utility>
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/aux_out.h"
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quantizer-inl.h"
+#include "lib/jxl/quantizer.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// NOTE: caller takes care of extracting quant from rect of RawQuantField.
+void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion,
+                     size_t c, int32_t quant, float qm_multiplier,
+                     size_t quant_kind, size_t xsize, size_t ysize,
+                     const float* JXL_RESTRICT block_in,
+                     int32_t* JXL_RESTRICT block_out) {
+  PROFILER_FUNC;
+  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
+  const float qac = quantizer.Scale() * quant;
+  // Zeroing thresholds, one per quadrant of the block. Not SIMD-fied for now.
+  float thres[4] = {0.5f, 0.6f, 0.6f, 0.65f};
+  if (c != 1) {
+    for (int i = 1; i < 4; ++i) {
+      thres[i] = 0.75f;
+    }
+  }
+
+  if (!error_diffusion) {
+    HWY_CAPPED(float, kBlockDim) df;
+    HWY_CAPPED(int32_t, kBlockDim) di;
+    HWY_CAPPED(uint32_t, kBlockDim) du;
+    const auto quant = Set(df, qac * qm_multiplier);
+
+    for (size_t y = 0; y < ysize * kBlockDim; y++) {
+      size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
+      const size_t off = y * kBlockDim * xsize;
+      for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
+        auto thr = Zero(df);
+        if (xsize == 1) {
+          HWY_ALIGN uint32_t kMask[kBlockDim] = {0,   0,   0,   0,
+                                                 ~0u, ~0u, ~0u, ~0u};
+          const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
+          thr =
+              IfThenElse(mask, Set(df, thres[yfix + 1]), Set(df, thres[yfix]));
+        } else {
+          // Same for all lanes in the vector.
+          thr = Set(
+              df,
+              thres[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
+        }
+
+        const auto q = Load(df, qm + off + x) * quant;
+        const auto in = Load(df, block_in + off + x);
+        const auto val = q * in;
+        const auto nzero_mask = Abs(val) >= thr;
+        const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
+        Store(v, di, block_out + off + x);
+      }
+    }
+    return;  // the scalar code below is the error_diffusion path only
+  }
+
+retry:
+  int hfNonZeros[4] = {};
+  float hfError[4] = {};
+  float hfMaxError[4] = {};
+  size_t hfMaxErrorIx[4] = {};
+  for (size_t y = 0; y < ysize * kBlockDim; y++) {
+    for (size_t x = 0; x < xsize * kBlockDim; x++) {
+      const size_t pos = y * kBlockDim * xsize + x;
+      if (x < xsize && y < ysize) {
+        // Ensure block is initialized
+        block_out[pos] = 0;
+        continue;
+      }
+      const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 +
+                           static_cast<size_t>(x >= xsize * kBlockDim / 2));
+      const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier);
+      float v = (std::abs(val) < thres[hfix]) ? 0 : rintf(val);
+      const float error = std::abs(val) - std::abs(v);
+      hfError[hfix] += error;
+      if (hfMaxError[hfix] < error) {
+        hfMaxError[hfix] = error;
+        hfMaxErrorIx[hfix] = pos;
+      }
+      if (v != 0.0f) {
+        hfNonZeros[hfix] += std::abs(v);
+      }
+      block_out[pos] = static_cast<int32_t>(rintf(v));
+    }
+  }
+  if (c != 1) return;  // the threshold retry below only runs for channel 1
+  // TODO(veluca): include AFV?
+  const size_t kPartialBlockKinds =
+      (1 << AcStrategy::Type::IDENTITY) | (1 << AcStrategy::Type::DCT2X2) |
+      (1 << AcStrategy::Type::DCT4X4) | (1 << AcStrategy::Type::DCT4X8) |
+      (1 << AcStrategy::Type::DCT8X4);
+  if ((1 << quant_kind) & kPartialBlockKinds) return;
+  float hfErrorLimit = 0.1f * (xsize * ysize) * kDCTBlockSize * 0.25f;
+  bool goretry = false;
+  for (int i = 1; i < 4; ++i) {
+    if (hfError[i] >= hfErrorLimit &&
+        hfNonZeros[i] <= (xsize + ysize) * 0.25f) {
+      if (thres[i] >= 0.4f) {
+        thres[i] -= 0.01f;
+        goretry = true;
+      }
+    }
+  }
+  if (goretry) goto retry;  // re-quantize with the lowered thresholds
+  for (int i = 1; i < 4; ++i) {
+    if (hfError[i] >= hfErrorLimit && hfNonZeros[i] == 0) {
+      const size_t pos = hfMaxErrorIx[i];
+      if (hfMaxError[i] >= 0.4f) {
+        block_out[pos] = block_in[pos] > 0.0f ? 1.0f : -1.0f;
+      }
+    }
+  }
+}
+
+// NOTE: caller takes care of extracting quant from rect of RawQuantField.
+void QuantizeRoundtripYBlockAC(const Quantizer& quantizer,
+                               const bool error_diffusion, int32_t quant,
+                               size_t quant_kind, size_t xsize, size_t ysize,
+                               const float* JXL_RESTRICT biases,
+                               float* JXL_RESTRICT inout,
+                               int32_t* JXL_RESTRICT quantized) {
+  QuantizeBlockAC(quantizer, error_diffusion, 1, quant, 1.0f, quant_kind, xsize,
+                  ysize, inout, quantized);
+
+  PROFILER_ZONE("enc quant adjust bias");
+  const float* JXL_RESTRICT dequant_matrix =
+      quantizer.DequantMatrix(quant_kind, 1);
+
+  HWY_CAPPED(float, kDCTBlockSize) df;
+  HWY_CAPPED(int32_t, kDCTBlockSize) di;
+  const auto inv_qac = Set(df, quantizer.inv_quant_ac(quant));
+  for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) {
+    const auto quant = Load(di, quantized + k);
+    const auto adj_quant = AdjustQuantBias(di, 1, quant, biases);
+    const auto dequantm = Load(df, dequant_matrix + k);
+    Store(adj_quant * dequantm * inv_qac, df, inout + k);  // dequantized Y
+  }
+}
+
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc) {
+  PROFILER_FUNC;
+  const Rect block_group_rect = enc_state->shared.BlockGroupRect(group_idx);
+  const Rect group_rect = enc_state->shared.GroupRect(group_idx);
+  const Rect cmap_rect(
+      block_group_rect.x0() / kColorTileDimInBlocks,
+      block_group_rect.y0() / kColorTileDimInBlocks,
+      DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks),
+      DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks));
+
+  const size_t xsize_blocks = block_group_rect.xsize();
+  const size_t ysize_blocks = block_group_rect.ysize();
+
+  const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow());
+  const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow());
+
+  const ImageI& full_quant_field = enc_state->shared.raw_quant_field;
+  const CompressParams& cparams = enc_state->cparams;
+
+  // TODO(veluca): consider strategies to reduce this memory.
+  auto mem = hwy::AllocateAligned<int32_t>(3 * AcStrategy::kMaxCoeffArea);
+  auto fmem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea);
+  float* JXL_RESTRICT scratch_space =
+      fmem.get() + 3 * AcStrategy::kMaxCoeffArea;
+  {
+    // Only use error diffusion in Squirrel mode or slower.
+    const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel;
+    constexpr HWY_CAPPED(float, kDCTBlockSize) d;
+
+    int32_t* JXL_RESTRICT coeffs[kMaxNumPasses][3] = {};
+    size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
+    JXL_DASSERT(num_passes > 0);
+    for (size_t i = 0; i < num_passes; i++) {
+      // TODO(veluca): 16-bit quantized coeffs are not implemented yet.
+      JXL_ASSERT(enc_state->coeffs[i]->Type() == ACType::k32);
+      for (size_t c = 0; c < 3; c++) {
+        coeffs[i][c] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32;
+      }
+    }
+
+    HWY_ALIGN float* coeffs_in = fmem.get();
+    HWY_ALIGN int32_t* quantized = mem.get();
+
+    size_t offset = 0;
+
+    for (size_t by = 0; by < ysize_blocks; ++by) {
+      const int32_t* JXL_RESTRICT row_quant_ac =
+          block_group_rect.ConstRow(full_quant_field, by);
+      size_t ty = by / kColorTileDimInBlocks;
+      const int8_t* JXL_RESTRICT row_cmap[3] = {
+          cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty),
+          nullptr,
+          cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty),
+      };
+      const float* JXL_RESTRICT opsin_rows[3] = {
+          group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim),
+          group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim),
+          group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim),
+      };
+      float* JXL_RESTRICT dc_rows[3] = {
+          block_group_rect.PlaneRow(dc, 0, by),
+          block_group_rect.PlaneRow(dc, 1, by),
+          block_group_rect.PlaneRow(dc, 2, by),
+      };
+      AcStrategyRow ac_strategy_row =
+          enc_state->shared.ac_strategy.ConstRow(block_group_rect, by);
+      for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
+           tx++) {
+        const auto x_factor =
+            Set(d, enc_state->shared.cmap.YtoXRatio(row_cmap[0][tx]));
+        const auto b_factor =
+            Set(d, enc_state->shared.cmap.YtoBRatio(row_cmap[2][tx]));
+        for (size_t bx = tx * kColorTileDimInBlocks;
+             bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) {
+          const AcStrategy acs = ac_strategy_row[bx];
+          if (!acs.IsFirstBlock()) continue;
+
+          size_t xblocks = acs.covered_blocks_x();
+          size_t yblocks = acs.covered_blocks_y();
+
+          CoefficientLayout(&yblocks, &xblocks);
+
+          size_t size = kDCTBlockSize * xblocks * yblocks;
+
+          // DCT Y channel, roundtrip-quantize it and set DC.
+          const int32_t quant_ac = row_quant_ac[bx];
+          TransformFromPixels(acs.Strategy(), opsin_rows[1] + bx * kBlockDim,
+                              opsin_stride, coeffs_in + size, scratch_space);
+          DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size,
+                                  dc_rows[1] + bx, dc_stride);
+          QuantizeRoundtripYBlockAC(
+              enc_state->shared.quantizer, error_diffusion, quant_ac,
+              acs.RawStrategy(), xblocks, yblocks, kDefaultQuantBias,
+              coeffs_in + size, quantized + size);
+
+          // DCT X and B channels
+          for (size_t c : {0, 2}) {
+            TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim,
+                                opsin_stride, coeffs_in + c * size,
+                                scratch_space);
+          }
+
+          // Unapply color correlation
+          for (size_t k = 0; k < size; k += Lanes(d)) {
+            const auto in_x = Load(d, coeffs_in + k);
+            const auto in_y = Load(d, coeffs_in + size + k);
+            const auto in_b = Load(d, coeffs_in + 2 * size + k);
+            const auto out_x = in_x - x_factor * in_y;
+            const auto out_b = in_b - b_factor * in_y;
+            Store(out_x, d, coeffs_in + k);
+            Store(out_b, d, coeffs_in + 2 * size + k);
+          }
+
+          // Quantize X and B channels and set DC.
+          for (size_t c : {0, 2}) {
+            QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c,
+                            quant_ac,
+                            c == 0 ? enc_state->x_qm_multiplier
+                                   : enc_state->b_qm_multiplier,
+                            acs.RawStrategy(), xblocks, yblocks,
+                            coeffs_in + c * size, quantized + c * size);
+            DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size,
+                                    dc_rows[c] + bx, dc_stride);
+          }
+          enc_state->progressive_splitter.SplitACCoefficients(
+              quantized, size, acs, bx, by, offset, coeffs);
+          offset += size;
+        }
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ComputeCoefficients);
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc) {
+  return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin,
+                                                   dc);
+}
+
+Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
+                                        size_t histogram_idx,
+                                        const PassesEncoderState& enc_state,
+                                        BitWriter* writer, AuxOut* aux_out) {
+  // Select which histogram to use among those of the current pass.
+  const size_t num_histograms = enc_state.shared.num_histograms;
+  // num_histograms is 0 only for lossless.
+  JXL_ASSERT(num_histograms == 0 || histogram_idx < num_histograms);
+  // NOTE(review): CeilLog2Nonzero is also reached with 0 here - confirm safe.
+  size_t histo_selector_bits = CeilLog2Nonzero(num_histograms);
+  if (histo_selector_bits != 0) {
+    BitWriter::Allotment allotment(writer, histo_selector_bits);
+    writer->Write(histo_selector_bits, histogram_idx);
+    ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out);
+  }
+  WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx],
+              enc_state.passes[pass_idx].codes,
+              enc_state.passes[pass_idx].context_map, writer, kLayerACTokens,
+              aux_out);
+
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.h
new file mode 100644
index 0000000000..62468ddf95
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_group.h
@@ -0,0 +1,30 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_GROUP_H_
+#define LIB_JXL_ENC_GROUP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+
+namespace jxl {
+
+// Quantizes the coefficients of group `group_idx` and fills `dc`.
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc);
+
+Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
+                                        size_t histogram_idx,
+                                        const PassesEncoderState& enc_state,
+                                        BitWriter* writer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_GROUP_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_heuristics.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_heuristics.cc
new file mode 100644
index 0000000000..3324e50778
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_heuristics.cc
@@ -0,0 +1,435 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_heuristics.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <numeric>
+#include <string>
+
+#include "lib/jxl/enc_ac_strategy.h"
+#include "lib/jxl/enc_adaptive_quantization.h"
+#include "lib/jxl/enc_ar_control_field.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_chroma_from_luma.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_noise.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/enc_photon_noise.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/enc_splines.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/gaborish.h"
+
+namespace jxl {
+namespace {
+void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
+  if (enc_state.cparams.decoding_speed_tier >= 1) {
+    static constexpr uint8_t kSimpleCtxMap[] = {
+        // Cluster all blocks together
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  //
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  //
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  //
+    };
+    static_assert(
+        3 * kNumOrders == sizeof(kSimpleCtxMap) / sizeof *kSimpleCtxMap,
+        "Update simple context map");
+
+    auto& bcm = enc_state.shared.block_ctx_map;  // by reference: edit in place
+    bcm.ctx_map.assign(std::begin(kSimpleCtxMap), std::end(kSimpleCtxMap));
+    bcm.num_ctxs = 2;
+    bcm.num_dc_ctxs = 1;
+    return;
+  }
+  if (enc_state.cparams.speed_tier >= SpeedTier::kFalcon) {
+    return;
+  }
+  const ImageI& rqf = enc_state.shared.raw_quant_field;
+  // No need to change context modeling for small images.
+  size_t tot = rqf.xsize() * rqf.ysize();
+  size_t size_for_ctx_model =
+      (1 << 10) * enc_state.cparams.butteraugli_distance;
+  if (tot < size_for_ctx_model) return;
+
+  struct OccCounters {
+    // count the occurrences of each qf value and each strategy type.
+    OccCounters(const ImageI& rqf, const AcStrategyImage& ac_strategy) {
+      for (size_t y = 0; y < rqf.ysize(); y++) {
+        const int32_t* qf_row = rqf.Row(y);
+        AcStrategyRow acs_row = ac_strategy.ConstRow(y);
+        for (size_t x = 0; x < rqf.xsize(); x++) {
+          int ord = kStrategyOrder[acs_row[x].RawStrategy()];
+          int qf = qf_row[x] - 1;  // NOTE(review): assumes values in [1, 256]
+          qf_counts[qf]++;
+          qf_ord_counts[ord][qf]++;
+          ord_counts[ord]++;
+        }
+      }
+    }
+
+    size_t qf_counts[256] = {};
+    size_t qf_ord_counts[kNumOrders][256] = {};
+    size_t ord_counts[kNumOrders] = {};
+  };
+  // The OccCounters struct is too big to allocate on the stack.
+  std::unique_ptr<OccCounters> counters(
+      new OccCounters(rqf, enc_state.shared.ac_strategy));
+
+  // Splitting the context model according to the quantization field seems to
+  // mostly benefit only large images.
+  size_t size_for_qf_split = (1 << 13) * enc_state.cparams.butteraugli_distance;
+  size_t num_qf_segments = tot < size_for_qf_split ? 1 : 2;
+  std::vector<uint32_t>& qft = enc_state.shared.block_ctx_map.qf_thresholds;
+  qft.clear();
+  // Divide the quant field in up to num_qf_segments segments.
+  size_t cumsum = 0;
+  size_t next = 1;
+  size_t last_cut = 256;
+  size_t cut = tot * next / num_qf_segments;
+  for (uint32_t j = 0; j < 256; j++) {
+    cumsum += counters->qf_counts[j];
+    if (cumsum > cut) {
+      if (j != 0) {
+        qft.push_back(j);
+      }
+      last_cut = j;
+      while (cumsum > cut) {
+        next++;
+        cut = tot * next / num_qf_segments;
+      }
+    } else if (next > qft.size() + 1) {
+      if (j - 1 == last_cut && j != 0) {
+        qft.push_back(j);
+      }
+    }
+  }
+
+  // Count the occurrences of each segment.
+  std::vector<size_t> counts(kNumOrders * (qft.size() + 1));
+  size_t qft_pos = 0;
+  for (size_t j = 0; j < 256; j++) {
+    if (qft_pos < qft.size() && j == qft[qft_pos]) {
+      qft_pos++;
+    }
+    for (size_t i = 0; i < kNumOrders; i++) {
+      counts[qft_pos + i * (qft.size() + 1)] += counters->qf_ord_counts[i][j];
+    }
+  }
+
+  // Repeatedly merge the lowest-count pair.
+  std::vector<uint8_t> remap((qft.size() + 1) * kNumOrders);
+  std::iota(remap.begin(), remap.end(), 0);
+  std::vector<uint8_t> clusters(remap);
+  size_t nb_clusters = Clamp1((int)(tot / size_for_ctx_model / 2), 4, 8);
+  // This is O(n^2 log n), but n <= 14.
+  while (clusters.size() > nb_clusters) {
+    std::sort(clusters.begin(), clusters.end(),
+              [&](int a, int b) { return counts[a] > counts[b]; });
+    counts[clusters[clusters.size() - 2]] += counts[clusters.back()];
+    counts[clusters.back()] = 0;
+    remap[clusters.back()] = clusters[clusters.size() - 2];
+    clusters.pop_back();
+  }
+  for (size_t i = 0; i < remap.size(); i++) {
+    while (remap[remap[i]] != remap[i]) {
+      remap[i] = remap[remap[i]];
+    }
+  }
+  // Relabel starting from 0.
+  std::vector<uint8_t> remap_remap(remap.size(), remap.size());
+  size_t num = 0;
+  for (size_t i = 0; i < remap.size(); i++) {
+    if (remap_remap[remap[i]] == remap.size()) {
+      remap_remap[remap[i]] = num++;
+    }
+    remap[i] = remap_remap[remap[i]];
+  }
+  // Write the block context map.
+  auto& ctx_map = enc_state.shared.block_ctx_map.ctx_map;
+  ctx_map = remap;
+  ctx_map.resize(remap.size() * 3);
+  for (size_t i = remap.size(); i < remap.size() * 3; i++) {
+    ctx_map[i] = remap[i % remap.size()] + num;
+  }
+  enc_state.shared.block_ctx_map.num_ctxs =
+      *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
+}
+
+// Returns the target size based on whether bitrate or direct targetsize is
+// given.
+size_t TargetSize(const CompressParams& cparams,
+                  const FrameDimensions& frame_dim) {
+  if (cparams.target_size > 0) {
+    return cparams.target_size;
+  }
+  if (cparams.target_bitrate > 0.0) {
+    return 0.5 +
+           cparams.target_bitrate * frame_dim.xsize * frame_dim.ysize /
+               kBitsPerByte;
+  }
+  return 0;  // neither a target size nor a target bitrate was given
+}
+}  // namespace
+
+void FindBestDequantMatrices(const CompressParams& cparams,
+                             const Image3F& opsin,
+                             ModularFrameEncoder* modular_frame_encoder,
+                             DequantMatrices* dequant_matrices) {
+  // TODO(veluca): quant matrices for no-gaborish.
+ // TODO(veluca): heuristics for in-bitstream quant tables. + *dequant_matrices = DequantMatrices(); + if (cparams.max_error_mode) { + // Set numerators of all quantization matrices to constant values. + float weights[3][1] = {{1.0f / cparams.max_error[0]}, + {1.0f / cparams.max_error[1]}, + {1.0f / cparams.max_error[2]}}; + DctQuantWeightParams dct_params(weights); + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::DCT(dct_params)); + DequantMatricesSetCustom(dequant_matrices, encodings, + modular_frame_encoder); + float dc_weights[3] = {1.0f / cparams.max_error[0], + 1.0f / cparams.max_error[1], + 1.0f / cparams.max_error[2]}; + DequantMatricesSetCustomDC(dequant_matrices, dc_weights); + } +} + +bool DefaultEncoderHeuristics::HandlesColorConversion( + const CompressParams& cparams, const ImageBundle& ib) { + return cparams.noise != Override::kOn && cparams.patches != Override::kOn && + cparams.speed_tier >= SpeedTier::kWombat && cparams.resampling == 1 && + cparams.color_transform == ColorTransform::kXYB && + !cparams.modular_mode && !ib.HasAlpha(); +} + +Status DefaultEncoderHeuristics::LossyFrameHeuristics( + PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, Image3F* opsin, ThreadPool* pool, + AuxOut* aux_out) { + PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented"); + + CompressParams& cparams = enc_state->cparams; + PassesSharedState& shared = enc_state->shared; + + // Compute parameters for noise synthesis. + if (shared.frame_header.flags & FrameHeader::kNoise) { + PROFILER_ZONE("enc GetNoiseParam"); + if (cparams.photon_noise_iso > 0) { + shared.image_features.noise_params = SimulatePhotonNoise( + opsin->xsize(), opsin->ysize(), cparams.photon_noise_iso); + } else { + // Don't start at zero amplitude since adding noise is expensive -- it + // significantly slows down decoding, and this is unlikely to + // completely go away even with advanced optimizations. 
After the + // kNoiseModelingRampUpDistanceRange we have reached the full level, + // i.e. noise is no longer represented by the compressed image, so we + // can add full noise by the noise modeling itself. + static const float kNoiseModelingRampUpDistanceRange = 0.6; + static const float kNoiseLevelAtStartOfRampUp = 0.25; + static const float kNoiseRampupStart = 1.0; + // TODO(user) test and properly select quality_coef with smooth + // filter + float quality_coef = 1.0f; + const float rampup = (cparams.butteraugli_distance - kNoiseRampupStart) / + kNoiseModelingRampUpDistanceRange; + if (rampup < 1.0f) { + quality_coef = kNoiseLevelAtStartOfRampUp + + (1.0f - kNoiseLevelAtStartOfRampUp) * rampup; + } + if (rampup < 0.0f) { + quality_coef = kNoiseRampupStart; + } + if (!GetNoiseParameter(*opsin, &shared.image_features.noise_params, + quality_coef)) { + shared.frame_header.flags &= ~FrameHeader::kNoise; + } + } + } + if (enc_state->shared.frame_header.upsampling != 1 && + !cparams.already_downsampled) { + // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling + // after noise, if necessary. + DownsampleImage(opsin, cparams.resampling); + PadImageToBlockMultipleInPlace(opsin); + } + + const FrameDimensions& frame_dim = enc_state->shared.frame_dim; + size_t target_size = TargetSize(cparams, frame_dim); + size_t opsin_target_size = target_size; + if (cparams.target_size > 0 || cparams.target_bitrate > 0.0) { + cparams.target_size = opsin_target_size; + } else if (cparams.butteraugli_distance < 0) { + return JXL_FAILURE("Expected non-negative distance"); + } + + // Find and subtract splines. + // if (cparams.speed_tier <= SpeedTier::kSquirrel) { + // shared.image_features.splines = FindSplines(*opsin); + // JXL_RETURN_IF_ERROR( + // shared.image_features.splines.SubtractFrom(opsin, shared.cmap)); + // } + + // Find and subtract patches/dots. 
+ // if (ApplyOverride(cparams.patches, + // cparams.speed_tier <= SpeedTier::kSquirrel)) { + // FindBestPatchDictionary(*opsin, enc_state, pool, aux_out); + // PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, + // opsin); + // } + + static const float kAcQuant = 0.79f; + const float quant_dc = InitialQuantDC(cparams.butteraugli_distance); + Quantizer& quantizer = enc_state->shared.quantizer; + // We don't know the quant field yet, but for computing the global scale + // assuming that it will be the same as for Falcon mode is good enough. + quantizer.ComputeGlobalScaleAndQuant( + quant_dc, kAcQuant / cparams.butteraugli_distance, 0); + + // TODO(veluca): we can now run all the code from here to FindBestQuantizer + // (excluded) one rect at a time. Do that. + + // Dependency graph: + // + // input: either XYB or input image + // + // input image -> XYB [optional] + // XYB -> initial quant field + // XYB -> Gaborished XYB + // Gaborished XYB -> CfL1 + // initial quant field, Gaborished XYB, CfL1 -> ACS + // initial quant field, ACS, Gaborished XYB -> EPF control field + // initial quant field -> adjusted initial quant field + // adjusted initial quant field, ACS -> raw quant field + // raw quant field, ACS, Gaborished XYB -> CfL2 + // + // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field. + + ArControlFieldHeuristics ar_heuristics; + AcStrategyHeuristics acs_heuristics; + CfLHeuristics cfl_heuristics; + + if (!opsin->xsize()) { + JXL_ASSERT(HandlesColorConversion(cparams, *original_pixels)); + *opsin = Image3F(RoundUpToBlockDim(original_pixels->xsize()), + RoundUpToBlockDim(original_pixels->ysize())); + opsin->ShrinkTo(original_pixels->xsize(), original_pixels->ysize()); + ToXYB(*original_pixels, pool, opsin, /*linear=*/nullptr); + PadImageToBlockMultipleInPlace(opsin); + } + + // Compute an initial estimate of the quantization field. + // Call InitialQuantField only in Hare mode or slower. 
Otherwise, rely + // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon + // mode. + if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) { + enc_state->initial_quant_field = + ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks); + float q = cparams.uniform_quant > 0 + ? cparams.uniform_quant + : kAcQuant / cparams.butteraugli_distance; + FillImage(q, &enc_state->initial_quant_field); + } else { + // Call this here, as it relies on pre-gaborish values. + float butteraugli_distance_for_iqf = cparams.butteraugli_distance; + if (!shared.frame_header.loop_filter.gab) { + butteraugli_distance_for_iqf *= 0.73f; + } + enc_state->initial_quant_field = InitialQuantField( + butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, 1.0f, + &enc_state->initial_quant_masking); + } + + // TODO(veluca): do something about animations. + + // Apply inverse-gaborish. + if (shared.frame_header.loop_filter.gab) { + GaborishInverse(opsin, 0.9908511000000001f, pool); + } + + cfl_heuristics.Init(*opsin); + acs_heuristics.Init(*opsin, enc_state); + + auto process_tile = [&](size_t tid, size_t thread) { + size_t n_enc_tiles = + DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks); + size_t tx = tid % n_enc_tiles; + size_t ty = tid / n_enc_tiles; + size_t by0 = ty * kEncTileDimInBlocks; + size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks, + enc_state->shared.frame_dim.ysize_blocks); + size_t bx0 = tx * kEncTileDimInBlocks; + size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks, + enc_state->shared.frame_dim.xsize_blocks); + Rect r(bx0, by0, bx1 - bx0, by1 - by0); + + // For speeds up to Wombat, we only compute the color correlation map + // once we know the transform type and the quantization map. 
+ if (cparams.speed_tier <= SpeedTier::kSquirrel) { + cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices, + /*ac_strategy=*/nullptr, + /*quantizer=*/nullptr, /*fast=*/false, thread, + &enc_state->shared.cmap); + } + + // Choose block sizes. + acs_heuristics.ProcessRect(r); + + // Choose amount of post-processing smoothing. + // TODO(veluca): should this go *after* AdjustQuantField? + ar_heuristics.RunRect(r, *opsin, enc_state, thread); + + // Always set the initial quant field, so we can compute the CfL map with + // more accuracy. The initial quant field might change in slower modes, but + // adjusting the quant field with butteraugli when all the other encoding + // parameters are fixed is likely a more reliable choice anyway. + AdjustQuantField(enc_state->shared.ac_strategy, r, + &enc_state->initial_quant_field); + quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r, + &enc_state->shared.raw_quant_field); + + // Compute a non-default CfL map if we are at Hare speed, or slower. + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeTile( + r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy, + &enc_state->shared.quantizer, + /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread, + &enc_state->shared.cmap); + } + }; + RunOnPool(pool, 0, DivCeil(enc_state->shared.frame_dim.xsize_blocks, + kEncTileDimInBlocks) * + DivCeil(enc_state->shared.frame_dim.ysize_blocks, + kEncTileDimInBlocks), + [&](const size_t num_threads) { + ar_heuristics.PrepareForThreads(num_threads); + cfl_heuristics.PrepareForThreads(num_threads); + return true; + }, + process_tile, "Enc Heuristics"); + + acs_heuristics.Finalize(aux_out); + if (cparams.speed_tier <= SpeedTier::kHare) { + cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat, + &enc_state->shared.cmap); + } + + FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder, + &enc_state->shared.matrices); + + // Refine quantization levels. 
+ FindBestQuantizer(original_pixels, *opsin, enc_state, pool, aux_out); + + // Choose a context model that depends on the amount of quantization for AC. + if (cparams.speed_tier < SpeedTier::kFalcon) { + FindBestBlockEntropyModel(*enc_state); + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_heuristics.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_heuristics.h new file mode 100644 index 0000000000..559603a619 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_heuristics.h @@ -0,0 +1,87 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_HEURISTICS_H_ +#define LIB_JXL_ENC_HEURISTICS_H_ + +// Hook for custom encoder heuristics (VarDCT only for now). + +#include +#include + +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/encoding/enc_ma.h" + +namespace jxl { + +struct PassesEncoderState; +class ImageBundle; +class ModularFrameEncoder; + +class EncoderHeuristics { + public: + virtual ~EncoderHeuristics() = default; + // Initializes encoder structures in `enc_state` using the original image data + // in `original_pixels`, and the XYB image data in `opsin`. Also modifies the + // `opsin` image by applying Gaborish, and doing other modifications if + // necessary. `pool` is used for running the computations on multiple threads. + // `aux_out` collects statistics and can be used to print debug images. + virtual Status LossyFrameHeuristics( + PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, Image3F* opsin, ThreadPool* pool, + AuxOut* aux_out) = 0; + + // Custom fixed tree for lossless mode. Must set `tree` to a valid tree if + // the function returns true. 
+ virtual bool CustomFixedTreeLossless(const FrameDimensions& frame_dim, + Tree* tree) { + return false; + } + + // If this method returns `true`, the `opsin` parameter to + // LossyFrameHeuristics will not be initialized, and should be initialized + // during the call. Moreover, `original_pixels` may not be in a linear + // colorspace (but will be the same as the `ib` value passed to this + // function). + virtual bool HandlesColorConversion(const CompressParams& cparams, + const ImageBundle& ib) { + return false; + } +}; + +class DefaultEncoderHeuristics : public EncoderHeuristics { + public: + Status LossyFrameHeuristics(PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* original_pixels, + Image3F* opsin, ThreadPool* pool, + AuxOut* aux_out) override; + bool HandlesColorConversion(const CompressParams& cparams, + const ImageBundle& ib) override; +}; + +class FastEncoderHeuristics : public EncoderHeuristics { + public: + Status LossyFrameHeuristics(PassesEncoderState* enc_state, + ModularFrameEncoder* modular_frame_encoder, + const ImageBundle* linear, Image3F* opsin, + ThreadPool* pool, AuxOut* aux_out) override; +}; + +// Exposed here since it may be used by other EncoderHeuristics implementations +// outside this project. +void FindBestDequantMatrices(const CompressParams& cparams, + const Image3F& opsin, + ModularFrameEncoder* modular_frame_encoder, + DequantMatrices* dequant_matrices); + +} // namespace jxl + +#endif // LIB_JXL_ENC_HEURISTICS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc new file mode 100644 index 0000000000..04b5669982 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.cc @@ -0,0 +1,214 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_huffman.h" + +#include +#include + +#include "lib/jxl/huffman_tree.h" + +namespace jxl { + +namespace { + +constexpr int kCodeLengthCodes = 18; + +void StoreHuffmanTreeOfHuffmanTreeToBitMask(const int num_codes, + const uint8_t* code_length_bitdepth, + BitWriter* writer) { + static const uint8_t kStorageOrder[kCodeLengthCodes] = { + 1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + // The bit lengths of the Huffman code over the code length alphabet + // are compressed with the following static Huffman code: + // Symbol Code + // ------ ---- + // 0 00 + // 1 1110 + // 2 110 + // 3 01 + // 4 10 + // 5 1111 + static const uint8_t kHuffmanBitLengthHuffmanCodeSymbols[6] = {0, 7, 3, + 2, 1, 15}; + static const uint8_t kHuffmanBitLengthHuffmanCodeBitLengths[6] = {2, 4, 3, + 2, 2, 4}; + + // Throw away trailing zeros: + size_t codes_to_store = kCodeLengthCodes; + if (num_codes > 1) { + for (; codes_to_store > 0; --codes_to_store) { + if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) { + break; + } + } + } + size_t skip_some = 0; // skips none. + if (code_length_bitdepth[kStorageOrder[0]] == 0 && + code_length_bitdepth[kStorageOrder[1]] == 0) { + skip_some = 2; // skips two. + if (code_length_bitdepth[kStorageOrder[2]] == 0) { + skip_some = 3; // skips three. 
+ } + } + writer->Write(2, skip_some); + for (size_t i = skip_some; i < codes_to_store; ++i) { + size_t l = code_length_bitdepth[kStorageOrder[i]]; + writer->Write(kHuffmanBitLengthHuffmanCodeBitLengths[l], + kHuffmanBitLengthHuffmanCodeSymbols[l]); + } +} + +void StoreHuffmanTreeToBitMask(const size_t huffman_tree_size, + const uint8_t* huffman_tree, + const uint8_t* huffman_tree_extra_bits, + const uint8_t* code_length_bitdepth, + const uint16_t* code_length_bitdepth_symbols, + BitWriter* writer) { + for (size_t i = 0; i < huffman_tree_size; ++i) { + size_t ix = huffman_tree[i]; + writer->Write(code_length_bitdepth[ix], code_length_bitdepth_symbols[ix]); + // Extra bits + switch (ix) { + case 16: + writer->Write(2, huffman_tree_extra_bits[i]); + break; + case 17: + writer->Write(3, huffman_tree_extra_bits[i]); + break; + } + } +} + +void StoreSimpleHuffmanTree(const uint8_t* depths, size_t symbols[4], + size_t num_symbols, size_t max_bits, + BitWriter* writer) { + // value of 1 indicates a simple Huffman code + writer->Write(2, 1); + writer->Write(2, num_symbols - 1); // NSYM - 1 + + // Sort + for (size_t i = 0; i < num_symbols; i++) { + for (size_t j = i + 1; j < num_symbols; j++) { + if (depths[symbols[j]] < depths[symbols[i]]) { + std::swap(symbols[j], symbols[i]); + } + } + } + + if (num_symbols == 2) { + writer->Write(max_bits, symbols[0]); + writer->Write(max_bits, symbols[1]); + } else if (num_symbols == 3) { + writer->Write(max_bits, symbols[0]); + writer->Write(max_bits, symbols[1]); + writer->Write(max_bits, symbols[2]); + } else { + writer->Write(max_bits, symbols[0]); + writer->Write(max_bits, symbols[1]); + writer->Write(max_bits, symbols[2]); + writer->Write(max_bits, symbols[3]); + // tree-select + writer->Write(1, depths[symbols[0]] == 1 ? 1 : 0); + } +} + +// num = alphabet size +// depths = symbol depths +void StoreHuffmanTree(const uint8_t* depths, size_t num, BitWriter* writer) { + // Write the Huffman tree into the compact representation. 
+ std::unique_ptr arena(new uint8_t[2 * num]); + uint8_t* huffman_tree = arena.get(); + uint8_t* huffman_tree_extra_bits = arena.get() + num; + size_t huffman_tree_size = 0; + WriteHuffmanTree(depths, num, &huffman_tree_size, huffman_tree, + huffman_tree_extra_bits); + + // Calculate the statistics of the Huffman tree in the compact representation. + uint32_t huffman_tree_histogram[kCodeLengthCodes] = {0}; + for (size_t i = 0; i < huffman_tree_size; ++i) { + ++huffman_tree_histogram[huffman_tree[i]]; + } + + int num_codes = 0; + int code = 0; + for (int i = 0; i < kCodeLengthCodes; ++i) { + if (huffman_tree_histogram[i]) { + if (num_codes == 0) { + code = i; + num_codes = 1; + } else if (num_codes == 1) { + num_codes = 2; + break; + } + } + } + + // Calculate another Huffman tree to use for compressing both the + // earlier Huffman tree with. + uint8_t code_length_bitdepth[kCodeLengthCodes] = {0}; + uint16_t code_length_bitdepth_symbols[kCodeLengthCodes] = {0}; + CreateHuffmanTree(&huffman_tree_histogram[0], kCodeLengthCodes, 5, + &code_length_bitdepth[0]); + ConvertBitDepthsToSymbols(code_length_bitdepth, kCodeLengthCodes, + &code_length_bitdepth_symbols[0]); + + // Now, we have all the data, let's start storing it + StoreHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth, + writer); + + if (num_codes == 1) { + code_length_bitdepth[code] = 0; + } + + // Store the real huffman tree now. 
+ StoreHuffmanTreeToBitMask(huffman_tree_size, huffman_tree, + huffman_tree_extra_bits, &code_length_bitdepth[0], + code_length_bitdepth_symbols, writer); +} + +} // namespace + +void BuildAndStoreHuffmanTree(const uint32_t* histogram, const size_t length, + uint8_t* depth, uint16_t* bits, + BitWriter* writer) { + size_t count = 0; + size_t s4[4] = {0}; + for (size_t i = 0; i < length; i++) { + if (histogram[i]) { + if (count < 4) { + s4[count] = i; + } else if (count > 4) { + break; + } + count++; + } + } + + size_t max_bits_counter = length - 1; + size_t max_bits = 0; + while (max_bits_counter) { + max_bits_counter >>= 1; + ++max_bits; + } + + if (count <= 1) { + // Output symbol bits and depths are initialized with 0, nothing to do. + writer->Write(4, 1); + writer->Write(max_bits, s4[0]); + return; + } + + CreateHuffmanTree(histogram, length, 15, depth); + ConvertBitDepthsToSymbols(depth, length, bits); + + if (count <= 4) { + StoreSimpleHuffmanTree(depth, s4, count, max_bits, writer); + } else { + StoreHuffmanTree(depth, length, writer); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.h new file mode 100644 index 0000000000..d7a66584e8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_huffman.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_HUFFMAN_H_ +#define LIB_JXL_ENC_HUFFMAN_H_ + +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +// Builds a Huffman tree for the given histogram, and encodes it into writer +// in a format that can be read by HuffmanDecodingData::ReadFromBitstream. +// An allotment for `writer` must already have been created by the caller. 
+void BuildAndStoreHuffmanTree(const uint32_t* histogram, size_t length, + uint8_t* depth, uint16_t* bits, + BitWriter* writer); + +} // namespace jxl + +#endif // LIB_JXL_ENC_HUFFMAN_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc new file mode 100644 index 0000000000..4ec17c6b22 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.cc @@ -0,0 +1,430 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_icc_codec.h" + +#include + +#include +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/icc_codec_common.h" + +namespace jxl { +namespace { + +bool EncodeVarInt(uint64_t value, size_t output_size, size_t* output_pos, + uint8_t* output) { + // While more than 7 bits of data are left, + // store 7 bits and set the next byte flag + while (value > 127) { + if (*output_pos > output_size) return false; + // |128: Set the next byte flag + output[(*output_pos)++] = ((uint8_t)(value & 127)) | 128; + // Remove the seven bits we just wrote + value >>= 7; + } + if (*output_pos > output_size) return false; + output[(*output_pos)++] = ((uint8_t)value) & 127; + return true; +} + +void EncodeVarInt(uint64_t value, PaddedBytes* data) { + size_t pos = data->size(); + data->resize(data->size() + 9); + JXL_CHECK(EncodeVarInt(value, data->size(), &pos, data->data())); + data->resize(pos); +} + +// Unshuffles or de-interleaves bytes, for example with width 2, turns +// "AaBbCcDc" into "ABCDabcd", this for example de-interleaves UTF-16 bytes into +// first all the high order bytes, then all the low order bytes. 
+// Transposes a matrix of width columns and ceil(size / width) rows. There are +// size elements, size may be < width * height, if so the +// last elements of the bottom row are missing, the missing spots are +// transposed along with the filled spots, and the result has the missing +// elements at the bottom of the rightmost column. The input is the input matrix +// in scanline order, the output is the result matrix in scanline order, with +// missing elements skipped over (this may occur at multiple positions). +void Unshuffle(uint8_t* data, size_t size, size_t width) { + size_t height = (size + width - 1) / width; // amount of rows of input + PaddedBytes result(size); + // i = input index, j output index + size_t s = 0, j = 0; + for (size_t i = 0; i < size; i++) { + result[j] = data[i]; + j += height; + if (j >= size) j = ++s; + } + + for (size_t i = 0; i < size; i++) { + data[i] = result[i]; + } +} + +// This is performed by the encoder, the encoder must be able to encode any +// random byte stream (not just byte streams that are a valid ICC profile), so +// an error returned by this function is an implementation error. +Status PredictAndShuffle(size_t stride, size_t width, int order, size_t num, + const uint8_t* data, size_t size, size_t* pos, + PaddedBytes* result) { + JXL_RETURN_IF_ERROR(CheckOutOfBounds(*pos, num, size)); + // Required by the specification, see decoder. stride * 4 must be < *pos. + if (!*pos || ((*pos - 1u) >> 2u) < stride) { + return JXL_FAILURE("Invalid stride"); + } + if (*pos < stride * 4) return JXL_FAILURE("Too large stride"); + size_t start = result->size(); + for (size_t i = 0; i < num; i++) { + uint8_t predicted = + LinearPredictICCValue(data, *pos, i, stride, width, order); + result->push_back(data[*pos + i] - predicted); + } + *pos += num; + if (width > 1) Unshuffle(result->data() + start, num, width); + return true; +} +} // namespace + +// Outputs a transformed form of the given icc profile. 
The result itself is +// not particularly smaller than the input data in bytes, but it will be in a +// form that is easier to compress (more zeroes, ...) and will compress better +// with brotli. +Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) { + PaddedBytes commands; + PaddedBytes data; + + EncodeVarInt(size, result); + + // Header + PaddedBytes header = ICCInitialHeaderPrediction(); + EncodeUint32(0, size, &header); + for (size_t i = 0; i < kICCHeaderSize && i < size; i++) { + ICCPredictHeader(icc, size, header.data(), i); + data.push_back(icc[i] - header[i]); + } + if (size <= kICCHeaderSize) { + EncodeVarInt(0, result); // 0 commands + for (size_t i = 0; i < data.size(); i++) { + result->push_back(data[i]); + } + return true; + } + + std::vector tags; + std::vector tagstarts; + std::vector tagsizes; + std::map tagmap; + + // Tag list + size_t pos = kICCHeaderSize; + if (pos + 4 <= size) { + uint64_t numtags = DecodeUint32(icc, size, pos); + pos += 4; + EncodeVarInt(numtags + 1, &commands); + uint64_t prevtagstart = kICCHeaderSize + numtags * 12; + uint32_t prevtagsize = 0; + for (size_t i = 0; i < numtags; i++) { + if (pos + 12 > size) break; + + Tag tag = DecodeKeyword(icc, size, pos + 0); + uint32_t tagstart = DecodeUint32(icc, size, pos + 4); + uint32_t tagsize = DecodeUint32(icc, size, pos + 8); + pos += 12; + + tags.push_back(tag); + tagstarts.push_back(tagstart); + tagsizes.push_back(tagsize); + tagmap[tagstart] = tags.size() - 1; + + uint8_t tagcode = kCommandTagUnknown; + for (size_t j = 0; j < kNumTagStrings; j++) { + if (tag == *kTagStrings[j]) { + tagcode = j + kCommandTagStringFirst; + break; + } + } + + if (tag == kRtrcTag && pos + 24 < size) { + bool ok = true; + ok &= DecodeKeyword(icc, size, pos + 0) == kGtrcTag; + ok &= DecodeKeyword(icc, size, pos + 12) == kBtrcTag; + if (ok) { + for (size_t i = 0; i < 8; i++) { + if (icc[pos - 8 + i] != icc[pos + 4 + i]) ok = false; + if (icc[pos - 8 + i] != icc[pos + 16 + i]) ok = 
false; + } + } + if (ok) { + tagcode = kCommandTagTRC; + pos += 24; + i += 2; + } + } + + if (tag == kRxyzTag && pos + 24 < size) { + bool ok = true; + ok &= DecodeKeyword(icc, size, pos + 0) == kGxyzTag; + ok &= DecodeKeyword(icc, size, pos + 12) == kBxyzTag; + uint32_t offsetr = tagstart; + uint32_t offsetg = DecodeUint32(icc, size, pos + 4); + uint32_t offsetb = DecodeUint32(icc, size, pos + 16); + uint32_t sizer = tagsize; + uint32_t sizeg = DecodeUint32(icc, size, pos + 8); + uint32_t sizeb = DecodeUint32(icc, size, pos + 20); + ok &= sizer == 20; + ok &= sizeg == 20; + ok &= sizeb == 20; + ok &= (offsetg == offsetr + 20); + ok &= (offsetb == offsetr + 40); + if (ok) { + tagcode = kCommandTagXYZ; + pos += 24; + i += 2; + } + } + + uint8_t command = tagcode; + uint64_t predicted_tagstart = prevtagstart + prevtagsize; + if (predicted_tagstart != tagstart) command |= kFlagBitOffset; + size_t predicted_tagsize = prevtagsize; + if (tag == kRxyzTag || tag == kGxyzTag || tag == kBxyzTag || + tag == kKxyzTag || tag == kWtptTag || tag == kBkptTag || + tag == kLumiTag) { + predicted_tagsize = 20; + } + if (predicted_tagsize != tagsize) command |= kFlagBitSize; + commands.push_back(command); + if (tagcode == 1) { + AppendKeyword(tag, &data); + } + if (command & kFlagBitOffset) EncodeVarInt(tagstart, &commands); + if (command & kFlagBitSize) EncodeVarInt(tagsize, &commands); + + prevtagstart = tagstart; + prevtagsize = tagsize; + } + } + // Indicate end of tag list or varint indicating there's none + commands.push_back(0); + + // Main content + // The main content in a valid ICC profile contains tagged elements, with the + // tag types (4 letter names) given by the tag list above, and the tag list + // pointing to the start and indicating the size of each tagged element. It is + // allowed for tagged elements to overlap, e.g. the curve for R, G and B could + // all point to the same one. 
+ Tag tag; + size_t tagstart = 0, tagsize = 0, clutstart = 0; + + size_t last0 = pos; + // This loop appends commands to the output, processing some sub-section of a + // current tagged element each time. We need to keep track of the tagtype of + // the current element, and update it when we encounter the boundary of a + // next one. + // It is not required that the input data is a valid ICC profile, if the + // encoder does not recognize the data it will still be able to output bytes + // but will not predict as well. + while (pos <= size) { + size_t last1 = pos; + PaddedBytes commands_add; + PaddedBytes data_add; + + // This means the loop brought the position beyond the tag end. + if (pos > tagstart + tagsize) { + tag = {0, 0, 0, 0}; // nonsensical value + } + + if (commands_add.empty() && data_add.empty() && tagmap.count(pos) && + pos + 4 <= size) { + size_t index = tagmap[pos]; + tag = DecodeKeyword(icc, size, pos); + tagstart = tagstarts[index]; + tagsize = tagsizes[index]; + + if (tag == kMlucTag && pos + tagsize <= size && tagsize > 8 && + icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 && + icc[pos + 7] == 0) { + size_t num = tagsize - 8; + commands_add.push_back(kCommandTypeStartFirst + 3); + pos += 8; + commands_add.push_back(kCommandShuffle2); + EncodeVarInt(num, &commands_add); + size_t start = data_add.size(); + for (size_t i = 0; i < num; i++) { + data_add.push_back(icc[pos]); + pos++; + } + Unshuffle(data_add.data() + start, num, 2); + } + + if (tag == kCurvTag && pos + tagsize <= size && tagsize > 8 && + icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 && + icc[pos + 7] == 0) { + size_t num = tagsize - 8; + if (num > 16 && num < (1 << 28) && pos + num <= size && pos > 0) { + commands_add.push_back(kCommandTypeStartFirst + 5); + pos += 8; + commands_add.push_back(kCommandPredict); + int order = 1, width = 2, stride = width; + commands_add.push_back((order << 2) | (width - 1)); + EncodeVarInt(num, &commands_add); + 
JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc, + size, &pos, &data_add)); + } + } + } + + if (tag == kMab_Tag || tag == kMba_Tag) { + Tag subTag = DecodeKeyword(icc, size, pos); + if (pos + 12 < size && (subTag == kCurvTag || subTag == kVcgtTag) && + DecodeUint32(icc, size, pos + 4) == 0) { + uint32_t num = DecodeUint32(icc, size, pos + 8) * 2; + if (num > 16 && num < (1 << 28) && pos + 12 + num <= size) { + pos += 12; + last1 = pos; + commands_add.push_back(kCommandPredict); + int order = 1, width = 2, stride = width; + commands_add.push_back((order << 2) | (width - 1)); + EncodeVarInt(num, &commands_add); + JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc, + size, &pos, &data_add)); + } + } + + if (pos == tagstart + 24 && pos + 4 < size) { + // Note that this value can be remembered for next iterations of the + // loop, so the "pos == clutstart" if below can trigger during a later + // iteration. + clutstart = tagstart + DecodeUint32(icc, size, pos); + } + + if (pos == clutstart && clutstart + 16 < size) { + size_t numi = icc[tagstart + 8]; + size_t numo = icc[tagstart + 9]; + size_t width = icc[clutstart + 16]; + size_t stride = width * numo; + size_t num = width * numo; + for (size_t i = 0; i < numi && clutstart + i < size; i++) { + num *= icc[clutstart + i]; + } + if ((width == 1 || width == 2) && num > 64 && num < (1 << 28) && + pos + num <= size && pos > stride * 4) { + commands_add.push_back(kCommandPredict); + int order = 1; + uint8_t flags = + (order << 2) | (width - 1) | (stride == width ? 
0 : 16); + commands_add.push_back(flags); + if (flags & 16) EncodeVarInt(stride, &commands_add); + EncodeVarInt(num, &commands_add); + JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc, + size, &pos, &data_add)); + } + } + } + + if (commands_add.empty() && data_add.empty() && tag == kGbd_Tag && + pos == tagstart + 8 && pos + tagsize - 8 <= size && pos > 16 && + tagsize > 8) { + size_t width = 4, order = 0, stride = width; + size_t num = tagsize - 8; + uint8_t flags = (order << 2) | (width - 1) | (stride == width ? 0 : 16); + commands_add.push_back(kCommandPredict); + commands_add.push_back(flags); + if (flags & 16) EncodeVarInt(stride, &commands_add); + EncodeVarInt(num, &commands_add); + JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc, + size, &pos, &data_add)); + } + + if (commands_add.empty() && data_add.empty() && pos + 20 <= size) { + Tag subTag = DecodeKeyword(icc, size, pos); + if (subTag == kXyz_Tag && DecodeUint32(icc, size, pos + 4) == 0) { + commands_add.push_back(kCommandXYZ); + pos += 8; + for (size_t j = 0; j < 12; j++) data_add.push_back(icc[pos++]); + } + } + + if (commands_add.empty() && data_add.empty() && pos + 8 <= size) { + if (DecodeUint32(icc, size, pos + 4) == 0) { + Tag subTag = DecodeKeyword(icc, size, pos); + for (size_t i = 0; i < kNumTypeStrings; i++) { + if (subTag == *kTypeStrings[i]) { + commands_add.push_back(kCommandTypeStartFirst + i); + pos += 8; + break; + } + } + } + } + + if (!(commands_add.empty() && data_add.empty()) || pos == size) { + if (last0 < last1) { + commands.push_back(kCommandInsert); + EncodeVarInt(last1 - last0, &commands); + while (last0 < last1) { + data.push_back(icc[last0++]); + } + } + for (size_t i = 0; i < commands_add.size(); i++) { + commands.push_back(commands_add[i]); + } + for (size_t i = 0; i < data_add.size(); i++) { + data.push_back(data_add[i]); + } + last0 = pos; + } + if (commands_add.empty() && data_add.empty()) { + pos++; + } + } + + 
EncodeVarInt(commands.size(), result); + for (size_t i = 0; i < commands.size(); i++) { + result->push_back(commands[i]); + } + for (size_t i = 0; i < data.size(); i++) { + result->push_back(data[i]); + } + + return true; +} + +Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* JXL_RESTRICT aux_out) { + if (icc.empty()) return JXL_FAILURE("ICC must be non-empty"); + PaddedBytes enc; + JXL_RETURN_IF_ERROR(PredictICC(icc.data(), icc.size(), &enc)); + std::vector> tokens(1); + BitWriter::Allotment allotment(writer, 128); + JXL_RETURN_IF_ERROR(U64Coder::Write(enc.size(), writer)); + ReclaimAndCharge(writer, &allotment, layer, aux_out); + + for (size_t i = 0; i < enc.size(); i++) { + tokens[0].emplace_back( + ICCANSContext(i, i > 0 ? enc[i - 1] : 0, i > 1 ? enc[i - 2] : 0), + enc[i]); + } + HistogramParams params; + params.lz77_method = enc.size() < 4096 ? HistogramParams::LZ77Method::kOptimal + : HistogramParams::LZ77Method::kLZ77; + EntropyEncodingData code; + std::vector context_map; + params.force_huffman = true; + BuildAndEncodeHistograms(params, kNumICCContexts, tokens, &code, &context_map, + writer, layer, aux_out); + WriteTokens(tokens[0], code, context_map, writer, layer, aux_out); + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.h new file mode 100644 index 0000000000..2480e3ae9a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_icc_codec.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_ICC_CODEC_H_ +#define LIB_JXL_ENC_ICC_CODEC_H_ + +// Compressed representation of ICC profiles. 
+ +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +// Should still be called if `icc.empty()` - if so, writes only 1 bit. +Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* JXL_RESTRICT aux_out); + +// Exposed only for testing +Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result); + +} // namespace jxl + +#endif // LIB_JXL_ENC_ICC_CODEC_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc new file mode 100644 index 0000000000..5aac244f5a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.cc @@ -0,0 +1,170 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_image_bundle.h" + +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/enc_color_management.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/luminance.h" + +namespace jxl { + +namespace { + +// Copies ib:rect, converts, and copies into out. +template +Status CopyToT(const ImageMetadata* metadata, const ImageBundle* ib, + const Rect& rect, const ColorEncoding& c_desired, + ThreadPool* pool, Image3* out) { + PROFILER_FUNC; + static_assert( + std::is_same::value || std::numeric_limits::min() == 0, + "CopyToT implemented only for float and unsigned types"); + ColorSpaceTransform c_transform; + // Changing IsGray is probably a bug. 
+ JXL_CHECK(ib->IsGray() == c_desired.IsGray()); +#if JPEGXL_ENABLE_SKCMS + bool is_gray = false; +#else + bool is_gray = ib->IsGray(); +#endif + if (out->xsize() < rect.xsize() || out->ysize() < rect.ysize()) { + *out = Image3(rect.xsize(), rect.ysize()); + } else { + out->ShrinkTo(rect.xsize(), rect.ysize()); + } + RunOnPool( + pool, 0, rect.ysize(), + [&](size_t num_threads) { + return c_transform.Init(ib->c_current(), c_desired, + metadata->IntensityTarget(), rect.xsize(), + num_threads); + }, + [&](const int y, const int thread) { + float* mutable_src_buf = c_transform.BufSrc(thread); + const float* src_buf = mutable_src_buf; + // Interleave input. + if (is_gray) { + src_buf = rect.ConstPlaneRow(ib->color(), 0, y); + } else { + const float* JXL_RESTRICT row_in0 = + rect.ConstPlaneRow(ib->color(), 0, y); + const float* JXL_RESTRICT row_in1 = + rect.ConstPlaneRow(ib->color(), 1, y); + const float* JXL_RESTRICT row_in2 = + rect.ConstPlaneRow(ib->color(), 2, y); + for (size_t x = 0; x < rect.xsize(); x++) { + mutable_src_buf[3 * x + 0] = row_in0[x]; + mutable_src_buf[3 * x + 1] = row_in1[x]; + mutable_src_buf[3 * x + 2] = row_in2[x]; + } + } + float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread); + DoColorSpaceTransform(&c_transform, thread, src_buf, dst_buf); + T* JXL_RESTRICT row_out0 = out->PlaneRow(0, y); + T* JXL_RESTRICT row_out1 = out->PlaneRow(1, y); + T* JXL_RESTRICT row_out2 = out->PlaneRow(2, y); + // De-interleave output and convert type. + if (std::is_same::value) { // deinterleave to float. + if (is_gray) { + for (size_t x = 0; x < rect.xsize(); x++) { + row_out0[x] = dst_buf[x]; + row_out1[x] = dst_buf[x]; + row_out2[x] = dst_buf[x]; + } + } else { + for (size_t x = 0; x < rect.xsize(); x++) { + row_out0[x] = dst_buf[3 * x + 0]; + row_out1[x] = dst_buf[3 * x + 1]; + row_out2[x] = dst_buf[3 * x + 2]; + } + } + } else { + // Convert to T, doing clamping. 
+ float max = std::numeric_limits::max(); + auto cvt = [max](float in) { + float v = std::max(0.0f, std::min(max, in * max)); + return static_cast(v < 0 ? v - 0.5f : v + 0.5f); + }; + if (is_gray) { + for (size_t x = 0; x < rect.xsize(); x++) { + row_out0[x] = cvt(dst_buf[x]); + row_out1[x] = cvt(dst_buf[x]); + row_out2[x] = cvt(dst_buf[x]); + } + } else { + for (size_t x = 0; x < rect.xsize(); x++) { + row_out0[x] = cvt(dst_buf[3 * x + 0]); + row_out1[x] = cvt(dst_buf[3 * x + 1]); + row_out2[x] = cvt(dst_buf[3 * x + 2]); + } + } + } + }, + "Colorspace transform"); + return true; +} + +} // namespace + +Status ImageBundle::TransformTo(const ColorEncoding& c_desired, + ThreadPool* pool) { + PROFILER_FUNC; + JXL_RETURN_IF_ERROR(CopyTo(Rect(color_), c_desired, &color_, pool)); + c_current_ = c_desired; + return true; +} + +Status ImageBundle::CopyTo(const Rect& rect, const ColorEncoding& c_desired, + Image3B* out, ThreadPool* pool) const { + return CopyToT(metadata_, this, rect, c_desired, pool, out); +} +Status ImageBundle::CopyTo(const Rect& rect, const ColorEncoding& c_desired, + Image3F* out, ThreadPool* pool) const { + return CopyToT(metadata_, this, rect, c_desired, pool, out); +} + +Status ImageBundle::CopyToSRGB(const Rect& rect, Image3B* out, + ThreadPool* pool) const { + return CopyTo(rect, ColorEncoding::SRGB(IsGray()), out, pool); +} + +Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired, + ThreadPool* pool, ImageBundle* store, + const ImageBundle** out) { + if (in.c_current().SameColorEncoding(c_desired)) { + *out = ∈ + return true; + } + // TODO(janwas): avoid copying via createExternal+copyBackToIO + // instead of copy+createExternal+copyBackToIO + store->SetFromImage(CopyImage(in.color()), in.c_current()); + + // Must at least copy the alpha channel for use by external_image. 
+ if (in.HasExtraChannels()) { + std::vector extra_channels; + for (const ImageF& extra_channel : in.extra_channels()) { + extra_channels.emplace_back(CopyImage(extra_channel)); + } + store->SetExtraChannels(std::move(extra_channels)); + } + + if (!store->TransformTo(c_desired, pool)) { + return false; + } + *out = store; + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.h new file mode 100644 index 0000000000..f5cd007296 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_image_bundle.h @@ -0,0 +1,25 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_IMAGE_BUNDLE_H_ +#define LIB_JXL_ENC_IMAGE_BUNDLE_H_ + +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Does color transformation from in.c_current() to c_desired if the color +// encodings are different, or nothing if they are already the same. +// If color transformation is done, stores the transformed values into store and +// sets the out pointer to store, else leaves store untouched and sets the out +// pointer to &in. +// Returns false if color transform fails. +Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired, + ThreadPool* pool, ImageBundle* store, + const ImageBundle** out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_IMAGE_BUNDLE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc new file mode 100644 index 0000000000..4767017ad7 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.cc @@ -0,0 +1,1633 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_modular.h" + +#include +#include + +#include +#include +#include +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/compressed_dc.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cluster.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/enc_patch_dictionary.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/gaborish.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/encoding/enc_encoding.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/encoding/ma_common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/modular/transform/enc_transform.h" +#include "lib/jxl/toc.h" + +namespace jxl { + +namespace { +// Squeeze default quantization factors +// these quantization factors are for -Q 50 (other qualities simply scale the +// factors; things are rounded down and obviously cannot get below 1) +static const float squeeze_quality_factor = + 0.35; // for easy tweaking of the quality range (decrease this number for + // higher quality) +static const float squeeze_luma_factor = + 1.1; // for easy tweaking of the balance between luma (or anything + // non-chroma) and chroma (decrease this number for higher quality + // luma) +static const float squeeze_quality_factor_xyb = 2.4f; +static const float squeeze_xyb_qtable[3][16] = { + {163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28, 0.64, 0.32, 0.16, + 0.08, 0.04, 0.02, 0.01, 0.005}, // Y + {1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5, + 0.5}, // X + {2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, + 0.5}, // B-Y 
+}; + +static const float squeeze_luma_qtable[16] = { + 163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28, + 0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01, 0.005}; +// for 8-bit input, the range of YCoCg chroma is -255..255 so basically this +// does 4:2:0 subsampling (two most fine grained layers get quantized away) +static const float squeeze_chroma_qtable[16] = { + 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5, 0.5}; + +// `cutoffs` must be sorted. +Tree MakeFixedTree(int property, const std::vector& cutoffs, + Predictor pred, size_t num_pixels) { + size_t log_px = CeilLog2Nonzero(num_pixels); + size_t min_gap = 0; + // Reduce fixed tree height when encoding small images. + if (log_px < 14) { + min_gap = 8 * (14 - log_px); + } + Tree tree; + struct NodeInfo { + size_t begin, end, pos; + }; + std::queue q; + // Leaf IDs will be set by roundtrip decoding the tree. + tree.push_back(PropertyDecisionNode::Leaf(pred)); + q.push(NodeInfo{0, cutoffs.size(), 0}); + while (!q.empty()) { + NodeInfo info = q.front(); + q.pop(); + if (info.begin + min_gap >= info.end) continue; + uint32_t split = (info.begin + info.end) / 2; + tree[info.pos] = + PropertyDecisionNode::Split(property, cutoffs[split], tree.size()); + q.push(NodeInfo{split + 1, info.end, tree.size()}); + tree.push_back(PropertyDecisionNode::Leaf(pred)); + q.push(NodeInfo{info.begin, split, tree.size()}); + tree.push_back(PropertyDecisionNode::Leaf(pred)); + } + return tree; +} + +Tree PredefinedTree(ModularOptions::TreeKind tree_kind, size_t total_pixels) { + if (tree_kind == ModularOptions::TreeKind::kJpegTranscodeACMeta) { + // All the data is 0, so no need for a fancy tree. + return {PropertyDecisionNode::Leaf(Predictor::Zero)}; + } + if (tree_kind == ModularOptions::TreeKind::kFalconACMeta) { + // All the data is 0 except the quant field. TODO(veluca): make that 0 too. 
+ return {PropertyDecisionNode::Leaf(Predictor::Left)}; + } + if (tree_kind == ModularOptions::TreeKind::kACMeta) { + // Small image. + if (total_pixels < 1024) { + return {PropertyDecisionNode::Leaf(Predictor::Left)}; + } + Tree tree; + // 0: c > 1 + tree.push_back(PropertyDecisionNode::Split(0, 1, 1)); + // 1: c > 2 + tree.push_back(PropertyDecisionNode::Split(0, 2, 3)); + // 2: c > 0 + tree.push_back(PropertyDecisionNode::Split(0, 0, 5)); + // 3: EPF control field (all 0 or 4), top > 0 + tree.push_back(PropertyDecisionNode::Split(6, 0, 21)); + // 4: ACS+QF, y > 0 + tree.push_back(PropertyDecisionNode::Split(2, 0, 7)); + // 5: CfL x + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient)); + // 6: CfL b + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient)); + // 7: QF: split according to the left quant value. + tree.push_back(PropertyDecisionNode::Split(7, 5, 9)); + // 8: ACS: split in 4 segments (8x8 from 0 to 3, large square 4-5, large + // rectangular 6-11, 8x8 12+), according to previous ACS value. 
+ tree.push_back(PropertyDecisionNode::Split(7, 5, 15)); + // QF + tree.push_back(PropertyDecisionNode::Split(7, 11, 11)); + tree.push_back(PropertyDecisionNode::Split(7, 3, 13)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left)); + // ACS + tree.push_back(PropertyDecisionNode::Split(7, 11, 17)); + tree.push_back(PropertyDecisionNode::Split(7, 3, 19)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + // EPF, left > 0 + tree.push_back(PropertyDecisionNode::Split(7, 0, 23)); + tree.push_back(PropertyDecisionNode::Split(7, 0, 25)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero)); + return tree; + } + if (tree_kind == ModularOptions::TreeKind::kWPFixedDC) { + std::vector cutoffs = { + -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15, + -11, -7, -4, -3, -1, 0, 1, 3, 5, 7, 11, + 15, 23, 31, 47, 63, 95, 127, 191, 255, 392, 500}; + return MakeFixedTree(kNumNonrefProperties - weighted::kNumProperties, + cutoffs, Predictor::Weighted, total_pixels); + } + if (tree_kind == ModularOptions::TreeKind::kGradientFixedDC) { + std::vector cutoffs = { + -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15, + -11, -7, -4, -3, -1, 0, 1, 3, 5, 7, 11, + 15, 23, 31, 47, 63, 95, 127, 191, 255, 392, 500}; + return MakeFixedTree(kGradientProp, cutoffs, Predictor::Gradient, + total_pixels); + } + JXL_ABORT("Unreachable"); + return {}; +} + +// Merges the trees in `trees` using 
nodes that decide on stream_id, as defined +// by `tree_splits`. +void MergeTrees(const std::vector& trees, + const std::vector& tree_splits, size_t begin, + size_t end, Tree* tree) { + JXL_ASSERT(trees.size() + 1 == tree_splits.size()); + JXL_ASSERT(end > begin); + JXL_ASSERT(end <= trees.size()); + if (end == begin + 1) { + // Insert the tree, adding the opportune offset to all child nodes. + // This will make the leaf IDs wrong, but subsequent roundtripping will fix + // them. + size_t sz = tree->size(); + tree->insert(tree->end(), trees[begin].begin(), trees[begin].end()); + for (size_t i = sz; i < tree->size(); i++) { + (*tree)[i].lchild += sz; + (*tree)[i].rchild += sz; + } + return; + } + size_t mid = (begin + end) / 2; + size_t splitval = tree_splits[mid] - 1; + size_t cur = tree->size(); + tree->emplace_back(1 /*stream_id*/, splitval, 0, 0, Predictor::Zero, 0, 1); + (*tree)[cur].lchild = tree->size(); + MergeTrees(trees, tree_splits, mid, end, tree); + (*tree)[cur].rchild = tree->size(); + MergeTrees(trees, tree_splits, begin, mid, tree); +} + +void QuantizeChannel(Channel& ch, const int q) { + if (q == 1) return; + for (size_t y = 0; y < ch.plane.ysize(); y++) { + pixel_type* row = ch.plane.Row(y); + for (size_t x = 0; x < ch.plane.xsize(); x++) { + if (row[x] < 0) { + row[x] = -((-row[x] + q / 2) / q) * q; + } else { + row[x] = ((row[x] + q / 2) / q) * q; + } + } + } +} + +// convert binary32 float that corresponds to custom [bits]-bit float (with +// [exp_bits] exponent bits) to a [bits]-bit integer representation that should +// fit in pixel_type +Status float_to_int(const float* const row_in, pixel_type* const row_out, + size_t xsize, unsigned int bits, unsigned int exp_bits, + bool fp, float factor) { + JXL_ASSERT(sizeof(pixel_type) * 8 >= bits); + if (!fp) { + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row_in[x] * factor + 0.5f; + } + return true; + } + if (bits == 32 && fp) { + JXL_ASSERT(exp_bits == 8); + memcpy((void*)row_out, (const 
void*)row_in, 4 * xsize); + return true; + } + + int exp_bias = (1 << (exp_bits - 1)) - 1; + int max_exp = (1 << exp_bits) - 1; + uint32_t sign = (1u << (bits - 1)); + int mant_bits = bits - exp_bits - 1; + int mant_shift = 23 - mant_bits; + for (size_t x = 0; x < xsize; ++x) { + uint32_t f; + memcpy(&f, &row_in[x], 4); + int signbit = (f >> 31); + f &= 0x7fffffff; + if (f == 0) { + row_out[x] = (signbit ? sign : 0); + continue; + } + int exp = (f >> 23) - 127; + if (exp == 128) return JXL_FAILURE("Inf/NaN not allowed"); + int mantissa = (f & 0x007fffff); + // broke up the binary32 into its parts, now reassemble into + // arbitrary float + exp += exp_bias; + if (exp < 0) { // will become a subnormal number + // add implicit leading 1 to mantissa + mantissa |= 0x00800000; + if (exp < -mant_bits) { + return JXL_FAILURE( + "Invalid float number: %g cannot be represented with %i " + "exp_bits and %i mant_bits (exp %i)", + row_in[x], exp_bits, mant_bits, exp); + } + mantissa >>= 1 - exp; + exp = 0; + } + // exp should be representable in exp_bits, otherwise input was + // invalid + if (exp > max_exp) return JXL_FAILURE("Invalid float exponent"); + if (mantissa & ((1 << mant_shift) - 1)) { + return JXL_FAILURE("%g is losing precision (mant: %x)", row_in[x], + mantissa); + } + mantissa >>= mant_shift; + f = (signbit ? 
sign : 0); + f |= (exp << mant_bits); + f |= mantissa; + row_out[x] = (pixel_type)f; + } + return true; +} +} // namespace + +ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header, + const CompressParams& cparams_orig) + : frame_dim(frame_header.ToFrameDimensions()), cparams(cparams_orig) { + size_t num_streams = + ModularStreamId::Num(frame_dim, frame_header.passes.num_passes); + if (cparams.modular_mode && + cparams.quality_pair == std::pair{100.0, 100.0}) { + switch (cparams.decoding_speed_tier) { + case 0: + break; + case 1: + cparams.options.wp_tree_mode = ModularOptions::TreeMode::kWPOnly; + break; + case 2: { + cparams.options.wp_tree_mode = ModularOptions::TreeMode::kGradientOnly; + cparams.options.predictor = Predictor::Gradient; + break; + } + case 3: { // LZ77, no Gradient. + cparams.options.nb_repeats = 0; + cparams.options.predictor = Predictor::Gradient; + break; + } + default: { // LZ77, no predictor. + cparams.options.nb_repeats = 0; + cparams.options.predictor = Predictor::Zero; + break; + } + } + } + stream_images.resize(num_streams); + if (cquality > 100) cquality = quality; + + // use a sensible default if nothing explicit is specified: + // Squeeze for lossy, no squeeze for lossless + if (cparams.responsive < 0) { + if (quality == 100) { + cparams.responsive = 0; + } else { + cparams.responsive = 1; + } + } + + if (cparams.speed_tier > SpeedTier::kWombat) { + cparams.options.splitting_heuristics_node_threshold = 192; + } else { + cparams.options.splitting_heuristics_node_threshold = 96; + } + { + // Set properties. + std::vector prop_order; + if (cparams.responsive) { + // Properties in order of their likelihood of being useful for Squeeze + // residuals. + prop_order = {0, 1, 4, 5, 6, 7, 8, 15, 9, 10, 11, 12, 13, 14, 2, 3}; + } else { + // Same, but for the non-Squeeze case. 
+ prop_order = {0, 1, 15, 9, 10, 11, 12, 13, 14, 2, 3, 4, 5, 6, 7, 8}; + } + switch (cparams.speed_tier) { + case SpeedTier::kSquirrel: + cparams.options.splitting_heuristics_properties.assign( + prop_order.begin(), prop_order.begin() + 8); + cparams.options.max_property_values = 32; + break; + case SpeedTier::kKitten: + cparams.options.splitting_heuristics_properties.assign( + prop_order.begin(), prop_order.begin() + 10); + cparams.options.max_property_values = 64; + break; + case SpeedTier::kTortoise: + cparams.options.splitting_heuristics_properties = prop_order; + cparams.options.max_property_values = 256; + break; + default: + cparams.options.splitting_heuristics_properties.assign( + prop_order.begin(), prop_order.begin() + 6); + cparams.options.max_property_values = 16; + break; + } + if (cparams.speed_tier > SpeedTier::kTortoise) { + // Gradient in previous channels. + for (int i = 0; i < cparams.options.max_properties; i++) { + cparams.options.splitting_heuristics_properties.push_back( + kNumNonrefProperties + i * 4 + 3); + } + } else { + // All the extra properties in Tortoise mode. + for (int i = 0; i < cparams.options.max_properties * 4; i++) { + cparams.options.splitting_heuristics_properties.push_back( + kNumNonrefProperties + i); + } + } + } + + if (cparams.options.predictor == static_cast(-1)) { + // no explicit predictor(s) given, set a good default + if ((cparams.speed_tier <= SpeedTier::kTortoise || + cparams.modular_mode == false) && + quality == 100 && cparams.responsive == false) { + // TODO(veluca): allow all predictors that don't break residual + // multipliers in lossy mode. + cparams.options.predictor = Predictor::Variable; + } else if (cparams.responsive) { + // zero predictor for Squeeze residues + cparams.options.predictor = Predictor::Zero; + } else if (quality < 100) { + // If not responsive and lossy. TODO(veluca): use near_lossless instead? 
+ cparams.options.predictor = Predictor::Gradient; + } else if (cparams.speed_tier < SpeedTier::kFalcon) { + // try median and weighted predictor for anything else + cparams.options.predictor = Predictor::Best; + } else if (cparams.speed_tier == SpeedTier::kFalcon) { + // just weighted predictor in falcon mode + cparams.options.predictor = Predictor::Weighted; + } else if (cparams.speed_tier > SpeedTier::kFalcon) { + // just gradient predictor in thunder mode + cparams.options.predictor = Predictor::Gradient; + } + } + tree_splits.push_back(0); + if (cparams.modular_mode == false) { + cparams.options.fast_decode_multiplier = 1.0f; + tree_splits.push_back(ModularStreamId::VarDCTDC(0).ID(frame_dim)); + tree_splits.push_back(ModularStreamId::ModularDC(0).ID(frame_dim)); + tree_splits.push_back(ModularStreamId::ACMetadata(0).ID(frame_dim)); + tree_splits.push_back(ModularStreamId::QuantTable(0).ID(frame_dim)); + tree_splits.push_back(ModularStreamId::ModularAC(0, 0).ID(frame_dim)); + ac_metadata_size.resize(frame_dim.num_dc_groups); + extra_dc_precision.resize(frame_dim.num_dc_groups); + } + tree_splits.push_back(num_streams); + cparams.options.max_chan_size = frame_dim.group_dim; + cparams.options.group_dim = frame_dim.group_dim; + + // TODO(veluca): figure out how to use different predictor sets per channel. 
+ stream_options.resize(num_streams, cparams.options); +} + +bool do_transform(Image& image, const Transform& tr, + const weighted::Header& wp_header, + jxl::ThreadPool* pool = nullptr) { + Transform t = tr; + bool did_it = TransformForward(t, image, wp_header, pool); + if (did_it) image.transform.push_back(t); + return did_it; +} + +Status ModularFrameEncoder::ComputeEncodingData( + const FrameHeader& frame_header, const ImageMetadata& metadata, + Image3F* JXL_RESTRICT color, const std::vector& extra_channels, + PassesEncoderState* JXL_RESTRICT enc_state, ThreadPool* pool, + AuxOut* aux_out, bool do_color) { + const FrameDimensions& frame_dim = enc_state->shared.frame_dim; + + if (do_color && frame_header.loop_filter.gab) { + GaborishInverse(color, 0.9908511000000001f, pool); + } + + if (do_color && metadata.bit_depth.bits_per_sample <= 16 && + cparams.speed_tier < SpeedTier::kCheetah) { + FindBestPatchDictionary(*color, enc_state, nullptr, aux_out, + cparams.color_transform == ColorTransform::kXYB); + PatchDictionaryEncoder::SubtractFrom( + enc_state->shared.image_features.patches, color); + } + + // Convert ImageBundle to modular Image object + const size_t xsize = frame_dim.xsize; + const size_t ysize = frame_dim.ysize; + + int nb_chans = 3; + if (metadata.color_encoding.IsGray() && + cparams.color_transform == ColorTransform::kNone) { + nb_chans = 1; + } + if (!do_color) nb_chans = 0; + + nb_chans += extra_channels.size(); + + bool fp = metadata.bit_depth.floating_point_sample; + + // bits_per_sample is just metadata for XYB images. 
+ if (metadata.bit_depth.bits_per_sample >= 32 && do_color && + cparams.color_transform != ColorTransform::kXYB) { + if (metadata.bit_depth.bits_per_sample == 32 && fp == false) { + return JXL_FAILURE("uint32_t not supported in enc_modular"); + } else if (metadata.bit_depth.bits_per_sample > 32) { + return JXL_FAILURE("bits_per_sample > 32 not supported"); + } + } + + Image& gi = stream_images[0]; + gi = Image(xsize, ysize, metadata.bit_depth.bits_per_sample, nb_chans); + int c = 0; + if (cparams.color_transform == ColorTransform::kXYB && + cparams.modular_mode == true) { + static const float enc_factors[3] = {32768.0f, 2048.0f, 2048.0f}; + DequantMatricesSetCustomDC(&enc_state->shared.matrices, enc_factors); + } + pixel_type maxval = gi.bitdepth < 32 ? (1u << gi.bitdepth) - 1 : 0; + if (do_color) { + for (; c < 3; c++) { + if (metadata.color_encoding.IsGray() && + cparams.color_transform == ColorTransform::kNone && + c != (cparams.color_transform == ColorTransform::kXYB ? 1 : 0)) + continue; + int c_out = c; + // XYB is encoded as YX(B-Y) + if (cparams.color_transform == ColorTransform::kXYB && c < 2) + c_out = 1 - c_out; + float factor = maxval; + if (cparams.color_transform == ColorTransform::kXYB) + factor = enc_state->shared.matrices.InvDCQuant(c); + if (c == 2 && cparams.color_transform == ColorTransform::kXYB) { + JXL_ASSERT(!fp); + for (size_t y = 0; y < ysize; ++y) { + const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y); + pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y); + pixel_type* const JXL_RESTRICT row_Y = gi.channel[0].Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row_in[x] * factor + 0.5f; + row_out[x] -= row_Y[x]; + } + } + } else { + int bits = metadata.bit_depth.bits_per_sample; + int exp_bits = metadata.bit_depth.exponent_bits_per_sample; + gi.channel[c_out].hshift = + enc_state->shared.frame_header.chroma_subsampling.HShift(c); + gi.channel[c_out].vshift = + 
enc_state->shared.frame_header.chroma_subsampling.VShift(c); + size_t xsize_shifted = DivCeil(xsize, 1 << gi.channel[c_out].hshift); + size_t ysize_shifted = DivCeil(ysize, 1 << gi.channel[c_out].vshift); + gi.channel[c_out].shrink(xsize_shifted, ysize_shifted); + for (size_t y = 0; y < ysize_shifted; ++y) { + const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y); + pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y); + JXL_RETURN_IF_ERROR(float_to_int(row_in, row_out, xsize_shifted, bits, + exp_bits, fp, factor)); + } + } + } + if (metadata.color_encoding.IsGray() && + cparams.color_transform == ColorTransform::kNone) + c = 1; + } + + for (size_t ec = 0; ec < extra_channels.size(); ec++, c++) { + const ExtraChannelInfo& eci = metadata.extra_channel_info[ec]; + size_t ecups = frame_header.extra_channel_upsampling[ec]; + gi.channel[c].shrink(DivCeil(frame_dim.xsize_upsampled, ecups), + DivCeil(frame_dim.ysize_upsampled, ecups)); + gi.channel[c].hshift = gi.channel[c].vshift = + CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling); + + int bits = eci.bit_depth.bits_per_sample; + int exp_bits = eci.bit_depth.exponent_bits_per_sample; + bool fp = eci.bit_depth.floating_point_sample; + float factor = (fp ? 
1 : ((1u << eci.bit_depth.bits_per_sample) - 1)); + for (size_t y = 0; y < gi.channel[c].plane.ysize(); ++y) { + const float* const JXL_RESTRICT row_in = extra_channels[ec].Row(y); + pixel_type* const JXL_RESTRICT row_out = gi.channel[c].Row(y); + JXL_RETURN_IF_ERROR(float_to_int(row_in, row_out, + gi.channel[c].plane.xsize(), bits, + exp_bits, fp, factor)); + } + } + JXL_ASSERT(c == nb_chans); + + // Set options and apply transformations + + if (quality < 100) { + if (cparams.palette_colors != 0) { + JXL_DEBUG_V(3, "Lossy encode, not doing palette transforms"); + } + if (cparams.color_transform == ColorTransform::kXYB) { + cparams.channel_colors_pre_transform_percent = 0; + } + cparams.channel_colors_percent = 0; + cparams.palette_colors = 0; + cparams.lossy_palette = false; + } + + // if few colors, do all-channel palette before trying channel palette + // Logic is as follows: + // - if you can make a palette with few colors (arbitrary threshold: 200), + // then you can also make channel palettes, but they will just be extra + // signaling cost for almost no benefit + // - if the palette needs more colors, then channel palette might help to + // reduce palette signaling cost + if (cparams.palette_colors != 0 && cparams.speed_tier < SpeedTier::kFalcon) { + // all-channel palette (e.g. 
RGBA) + if (gi.channel.size() > 1) { + Transform maybe_palette(TransformId::kPalette); + maybe_palette.begin_c = gi.nb_meta_channels; + maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels; + maybe_palette.nb_colors = + std::min(std::min(200, (int)(xsize * ysize / 8)), + std::abs(cparams.palette_colors) / 16); + maybe_palette.ordered_palette = cparams.palette_colors >= 0; + maybe_palette.lossy_palette = false; + do_transform(gi, maybe_palette, weighted::Header(), pool); + } + } + + // Global channel palette + if (cparams.channel_colors_pre_transform_percent > 0 && + !cparams.lossy_palette && + (cparams.speed_tier <= SpeedTier::kThunder || + (do_color && metadata.bit_depth.bits_per_sample > 8))) { + // single channel palette (like FLIF's ChannelCompact) + size_t nb_channels = gi.channel.size() - gi.nb_meta_channels; + for (size_t i = 0; i < nb_channels; i++) { + int min, max; + compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max); + int64_t colors = max - min + 1; + JXL_DEBUG_V(10, "Channel %zu: range=%i..%i", i, min, max); + Transform maybe_palette_1(TransformId::kPalette); + maybe_palette_1.begin_c = i + gi.nb_meta_channels; + maybe_palette_1.num_c = 1; + // simple heuristic: if less than X percent of the values in the range + // actually occur, it is probably worth it to do a compaction + // (but only if the channel palette is less than 6% the size of the + // image itself) + maybe_palette_1.nb_colors = std::min( + (int)(xsize * ysize / 16), + (int)(cparams.channel_colors_pre_transform_percent / 100. * colors)); + if (do_transform(gi, maybe_palette_1, weighted::Header(), pool)) { + // effective bit depth is lower, adjust quantization accordingly + compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max); + if (max < maxval) maxval = max; + } + } + } + + // Global palette + if ((cparams.palette_colors != 0 || cparams.lossy_palette) && + cparams.speed_tier < SpeedTier::kFalcon) { + // all-channel palette (e.g. 
RGBA) + if (gi.channel.size() - gi.nb_meta_channels > 1) { + Transform maybe_palette(TransformId::kPalette); + maybe_palette.begin_c = gi.nb_meta_channels; + maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels; + maybe_palette.nb_colors = + std::min((int)(xsize * ysize / 8), std::abs(cparams.palette_colors)); + maybe_palette.ordered_palette = cparams.palette_colors >= 0; + maybe_palette.lossy_palette = + (cparams.lossy_palette && maybe_palette.num_c == 3); + if (maybe_palette.lossy_palette) { + maybe_palette.predictor = Predictor::Average4; + } + // TODO(veluca): use a custom weighted header if using the weighted + // predictor. + do_transform(gi, maybe_palette, weighted::Header(), pool); + } + // all-minus-one-channel palette (RGB with separate alpha, or CMY with + // separate K) + if (gi.channel.size() - gi.nb_meta_channels > 3) { + Transform maybe_palette_3(TransformId::kPalette); + maybe_palette_3.begin_c = gi.nb_meta_channels; + maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1; + maybe_palette_3.nb_colors = + std::min((int)(xsize * ysize / 8), std::abs(cparams.palette_colors)); + maybe_palette_3.ordered_palette = cparams.palette_colors >= 0; + maybe_palette_3.lossy_palette = cparams.lossy_palette; + if (maybe_palette_3.lossy_palette) { + maybe_palette_3.predictor = Predictor::Average4; + } + do_transform(gi, maybe_palette_3, weighted::Header(), pool); + } + } + + if (cparams.color_transform == ColorTransform::kNone && do_color && !fp && + gi.channel.size() - gi.nb_meta_channels >= 3) { + if (cparams.colorspace == 1 || + (cparams.colorspace < 0 && + (quality < 100 || cparams.speed_tier > SpeedTier::kHare))) { + Transform ycocg{TransformId::kRCT}; + ycocg.rct_type = 6; + ycocg.begin_c = gi.nb_meta_channels; + do_transform(gi, ycocg, weighted::Header(), pool); + } else if (cparams.colorspace >= 2) { + Transform sg(TransformId::kRCT); + sg.begin_c = gi.nb_meta_channels; + sg.rct_type = cparams.colorspace - 2; + do_transform(gi, sg, 
weighted::Header(), pool); + } + } + + if (cparams.responsive && !gi.channel.empty()) { + do_transform(gi, Transform(TransformId::kSqueeze), weighted::Header(), + pool); // use default squeezing + } + + std::vector quants; + + if (quality < 100 || cquality < 100) { + quants.resize(gi.channel.size(), 1); + JXL_DEBUG_V( + 2, + "Adding quantization constants corresponding to luma quality %.2f " + "and chroma quality %.2f", + quality, cquality); + if (!cparams.responsive) { + JXL_DEBUG_V(1, + "Warning: lossy compression without Squeeze " + "transform is just color quantization."); + quality = (400 + quality) / 5; + cquality = (400 + cquality) / 5; + } + + // convert 'quality' to quantization scaling factor + if (quality > 50) { + quality = 200.0 - quality * 2.0; + } else { + quality = 900.0 - quality * 16.0; + } + if (cquality > 50) { + cquality = 200.0 - cquality * 2.0; + } else { + cquality = 900.0 - cquality * 16.0; + } + if (cparams.color_transform != ColorTransform::kXYB) { + quality *= 0.01f * maxval / 255.f; + cquality *= 0.01f * maxval / 255.f; + } else { + quality *= 0.01f; + cquality *= 0.01f; + } + + if (cparams.options.nb_repeats == 0) { + return JXL_FAILURE("nb_repeats = 0 not supported with modular lossy!"); + } + for (uint32_t i = gi.nb_meta_channels; i < gi.channel.size(); i++) { + Channel& ch = gi.channel[i]; + int shift = ch.hshift + ch.vshift; // number of pixel halvings + if (shift > 16) shift = 16; + if (shift > 0) shift--; + int q; + // assuming default Squeeze here + int component = ((i - gi.nb_meta_channels) % nb_chans); + // last 4 channels are final chroma residuals + if (nb_chans > 2 && i >= gi.channel.size() - 4) { + component = 1; + } + + if (cparams.color_transform == ColorTransform::kXYB && component < 3) { + q = (component == 0 ? 
quality : cquality) * squeeze_quality_factor_xyb * + squeeze_xyb_qtable[component][shift]; + } else { + if (cparams.colorspace != 0 && component > 0 && component < 3) { + q = cquality * squeeze_quality_factor * squeeze_chroma_qtable[shift]; + } else { + q = quality * squeeze_quality_factor * squeeze_luma_factor * + squeeze_luma_qtable[shift]; + } + } + if (q < 1) q = 1; + QuantizeChannel(gi.channel[i], q); + quants[i] = q; + } + } + + // Fill other groups. + struct GroupParams { + Rect rect; + int minShift; + int maxShift; + ModularStreamId id; + }; + std::vector stream_params; + + stream_options[0] = cparams.options; + + // DC + for (size_t group_id = 0; group_id < frame_dim.num_dc_groups; group_id++) { + const size_t gx = group_id % frame_dim.xsize_dc_groups; + const size_t gy = group_id / frame_dim.xsize_dc_groups; + const Rect rect(gx * frame_dim.dc_group_dim, gy * frame_dim.dc_group_dim, + frame_dim.dc_group_dim, frame_dim.dc_group_dim); + // minShift==3 because (frame_dim.dc_group_dim >> 3) == frame_dim.group_dim + // maxShift==1000 is infinity + stream_params.push_back( + GroupParams{rect, 3, 1000, ModularStreamId::ModularDC(group_id)}); + } + // AC global -> nothing. 
+ // AC + for (size_t group_id = 0; group_id < frame_dim.num_groups; group_id++) { + const size_t gx = group_id % frame_dim.xsize_groups; + const size_t gy = group_id / frame_dim.xsize_groups; + const Rect mrect(gx * frame_dim.group_dim, gy * frame_dim.group_dim, + frame_dim.group_dim, frame_dim.group_dim); + for (size_t i = 0; i < enc_state->progressive_splitter.GetNumPasses(); + i++) { + int maxShift, minShift; + frame_header.passes.GetDownsamplingBracket(i, minShift, maxShift); + stream_params.push_back(GroupParams{ + mrect, minShift, maxShift, ModularStreamId::ModularAC(group_id, i)}); + } + } + gi_channel.resize(stream_images.size()); + + RunOnPool( + pool, 0, stream_params.size(), ThreadPool::SkipInit(), + [&](size_t i, size_t _) { + stream_options[stream_params[i].id.ID(frame_dim)] = cparams.options; + JXL_CHECK(PrepareStreamParams( + stream_params[i].rect, cparams, stream_params[i].minShift, + stream_params[i].maxShift, stream_params[i].id, do_color)); + }, + "ChooseParams"); + { + // Clear out channels that have been copied to groups. 
+ Image& full_image = stream_images[0]; + size_t c = full_image.nb_meta_channels; + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + if (fc.w > frame_dim.group_dim || fc.h > frame_dim.group_dim) break; + } + for (; c < full_image.channel.size(); c++) { + full_image.channel[c].plane = ImageI(); + } + } + + if (!quants.empty()) { + for (uint32_t stream_id = 0; stream_id < stream_images.size(); + stream_id++) { + // skip non-modular stream_ids + if (stream_id > 0 && gi_channel[stream_id].empty()) continue; + Image& image = stream_images[stream_id]; + const ModularOptions& options = stream_options[stream_id]; + for (uint32_t i = image.nb_meta_channels; i < image.channel.size(); i++) { + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + continue; + } + if (stream_id > 0 && gi_channel[stream_id].empty()) continue; + size_t ch_id = stream_id == 0 + ? i + : gi_channel[stream_id][i - image.nb_meta_channels]; + uint32_t q = quants[ch_id]; + // Inform the tree splitting heuristics that each channel in each group + // used this quantization factor. This will produce a tree with the + // given multipliers. + if (multiplier_info.empty() || + multiplier_info.back().range[1][0] != stream_id || + multiplier_info.back().multiplier != q) { + StaticPropRange range; + range[0] = {i, i + 1}; + range[1] = {stream_id, stream_id + 1}; + multiplier_info.push_back({range, (uint32_t)q}); + } else { + // Previous channel in the same group had the same quantization + // factor. Don't provide two different ranges, as that creates + // unnecessary nodes. + multiplier_info.back().range[0][1] = i + 1; + } + } + } + // Merge group+channel settings that have the same channels and quantization + // factors, to avoid unnecessary nodes. 
+ std::sort(multiplier_info.begin(), multiplier_info.end(), + [](ModularMultiplierInfo a, ModularMultiplierInfo b) { + return std::make_tuple(a.range, a.multiplier) < + std::make_tuple(b.range, b.multiplier); + }); + size_t new_num = 1; + for (size_t i = 1; i < multiplier_info.size(); i++) { + ModularMultiplierInfo& prev = multiplier_info[new_num - 1]; + ModularMultiplierInfo& cur = multiplier_info[i]; + if (prev.range[0] == cur.range[0] && prev.multiplier == cur.multiplier && + prev.range[1][1] == cur.range[1][0]) { + prev.range[1][1] = cur.range[1][1]; + } else { + multiplier_info[new_num++] = multiplier_info[i]; + } + } + multiplier_info.resize(new_num); + } + + JXL_RETURN_IF_ERROR(ValidateChannelDimensions(gi, stream_options[0])); + + return PrepareEncoding(pool, enc_state->shared.frame_dim, + enc_state->heuristics.get(), aux_out); +} + +Status ModularFrameEncoder::PrepareEncoding(ThreadPool* pool, + const FrameDimensions& frame_dim, + EncoderHeuristics* heuristics, + AuxOut* aux_out) { + if (!tree.empty()) return true; + + // Compute tree. + size_t num_streams = stream_images.size(); + stream_headers.resize(num_streams); + tokens.resize(num_streams); + + if (heuristics->CustomFixedTreeLossless(frame_dim, &tree)) { + // Using a fixed tree. + } else if (cparams.speed_tier < SpeedTier::kFalcon || quality != 100 || + !cparams.modular_mode) { + // Avoid creating a tree with leaves that don't correspond to any pixels. 
+ std::vector useful_splits; + useful_splits.reserve(tree_splits.size()); + for (size_t chunk = 0; chunk < tree_splits.size() - 1; chunk++) { + bool has_pixels = false; + size_t start = tree_splits[chunk]; + size_t stop = tree_splits[chunk + 1]; + for (size_t i = start; i < stop; i++) { + for (const Channel& c : stream_images[i].channel) { + if (c.w && c.h) has_pixels = true; + } + } + if (has_pixels) { + useful_splits.push_back(tree_splits[chunk]); + } + } + // Don't do anything if modular mode does not have any pixels in this image + if (useful_splits.empty()) return true; + useful_splits.push_back(tree_splits.back()); + + std::atomic_flag invalid_force_wp = ATOMIC_FLAG_INIT; + + std::vector trees(useful_splits.size() - 1); + RunOnPool( + pool, 0, useful_splits.size() - 1, ThreadPool::SkipInit(), + [&](size_t chunk, size_t _) { + // TODO(veluca): parallelize more. + size_t total_pixels = 0; + uint32_t start = useful_splits[chunk]; + uint32_t stop = useful_splits[chunk + 1]; + uint32_t max_c = 0; + if (stream_options[start].tree_kind != + ModularOptions::TreeKind::kLearn) { + for (size_t i = start; i < stop; i++) { + for (const Channel& ch : stream_images[i].channel) { + total_pixels += ch.w * ch.h; + } + } + trees[chunk] = + PredefinedTree(stream_options[start].tree_kind, total_pixels); + return; + } + TreeSamples tree_samples; + if (!tree_samples.SetPredictor(stream_options[start].predictor, + stream_options[start].wp_tree_mode)) { + invalid_force_wp.test_and_set(std::memory_order_acq_rel); + return; + } + if (!tree_samples.SetProperties( + stream_options[start].splitting_heuristics_properties, + stream_options[start].wp_tree_mode)) { + invalid_force_wp.test_and_set(std::memory_order_acq_rel); + return; + } + std::vector pixel_samples; + std::vector diff_samples; + std::vector group_pixel_count; + std::vector channel_pixel_count; + for (size_t i = start; i < stop; i++) { + max_c = std::max(stream_images[i].channel.size(), max_c); + 
CollectPixelSamples(stream_images[i], stream_options[i], i, + group_pixel_count, channel_pixel_count, + pixel_samples, diff_samples); + } + StaticPropRange range; + range[0] = {0, max_c}; + range[1] = {start, stop}; + auto local_multiplier_info = multiplier_info; + + tree_samples.PreQuantizeProperties( + range, local_multiplier_info, group_pixel_count, + channel_pixel_count, pixel_samples, diff_samples, + stream_options[start].max_property_values); + for (size_t i = start; i < stop; i++) { + JXL_CHECK(ModularGenericCompress( + stream_images[i], stream_options[i], /*writer=*/nullptr, + /*aux_out=*/nullptr, 0, i, &tree_samples, &total_pixels)); + } + + // TODO(veluca): parallelize more. + trees[chunk] = + LearnTree(std::move(tree_samples), total_pixels, + stream_options[start], local_multiplier_info, range); + }, + "LearnTrees"); + if (invalid_force_wp.test_and_set(std::memory_order_acq_rel)) { + return JXL_FAILURE("PrepareEncoding: force_no_wp with {Weighted}"); + } + tree.clear(); + MergeTrees(trees, useful_splits, 0, useful_splits.size() - 1, &tree); + } else { + // Fixed tree. 
+ size_t total_pixels = 0; + for (const Image& img : stream_images) { + for (const Channel& ch : img.channel) { + total_pixels += ch.w * ch.h; + } + } + if (cparams.speed_tier <= SpeedTier::kFalcon) { + tree = PredefinedTree(ModularOptions::TreeKind::kWPFixedDC, total_pixels); + } else if (cparams.speed_tier <= SpeedTier::kThunder) { + tree = PredefinedTree(ModularOptions::TreeKind::kGradientFixedDC, + total_pixels); + } else { + tree = {PropertyDecisionNode::Leaf(Predictor::Gradient)}; + } + } + tree_tokens.resize(1); + tree_tokens[0].clear(); + Tree decoded_tree; + TokenizeTree(tree, &tree_tokens[0], &decoded_tree); + JXL_ASSERT(tree.size() == decoded_tree.size()); + tree = std::move(decoded_tree); + + if (WantDebugOutput(aux_out)) { + PrintTree(tree, aux_out->debug_prefix + "/global_tree"); + } + + image_widths.resize(num_streams); + RunOnPool( + pool, 0, num_streams, ThreadPool::SkipInit(), + [&](size_t stream_id, size_t _) { + AuxOut my_aux_out; + if (aux_out) { + my_aux_out.dump_image = aux_out->dump_image; + my_aux_out.debug_prefix = aux_out->debug_prefix; + } + tokens[stream_id].clear(); + JXL_CHECK(ModularGenericCompress( + stream_images[stream_id], stream_options[stream_id], + /*writer=*/nullptr, &my_aux_out, 0, stream_id, + /*tree_samples=*/nullptr, + /*total_pixels=*/nullptr, + /*tree=*/&tree, /*header=*/&stream_headers[stream_id], + /*tokens=*/&tokens[stream_id], + /*widths=*/&image_widths[stream_id])); + }, + "ComputeTokens"); + return true; +} + +Status ModularFrameEncoder::EncodeGlobalInfo(BitWriter* writer, + AuxOut* aux_out) { + BitWriter::Allotment allotment(writer, 1); + // If we are using brotli, or not using modular mode. 
+ if (tree_tokens.empty() || tree_tokens[0].empty()) { + writer->Write(1, 0); + ReclaimAndCharge(writer, &allotment, kLayerModularTree, aux_out); + return true; + } + writer->Write(1, 1); + ReclaimAndCharge(writer, &allotment, kLayerModularTree, aux_out); + + // Write tree + HistogramParams params; + if (cparams.speed_tier > SpeedTier::kKitten) { + params.clustering = HistogramParams::ClusteringType::kFast; + params.ans_histogram_strategy = + cparams.speed_tier > SpeedTier::kThunder + ? HistogramParams::ANSHistogramStrategy::kFast + : HistogramParams::ANSHistogramStrategy::kApproximate; + params.lz77_method = + cparams.decoding_speed_tier >= 3 && cparams.modular_mode + ? (cparams.speed_tier >= SpeedTier::kFalcon + ? HistogramParams::LZ77Method::kRLE + : HistogramParams::LZ77Method::kLZ77) + : HistogramParams::LZ77Method::kNone; + // Near-lossless DC, as well as modular mode, require choosing hybrid uint + // more carefully. + if ((!extra_dc_precision.empty() && extra_dc_precision[0] != 0) || + (cparams.modular_mode && cparams.speed_tier < SpeedTier::kCheetah)) { + params.uint_method = HistogramParams::HybridUintMethod::kFast; + } else { + params.uint_method = HistogramParams::HybridUintMethod::kNone; + } + } else if (cparams.speed_tier <= SpeedTier::kTortoise) { + params.lz77_method = HistogramParams::LZ77Method::kOptimal; + } else { + params.lz77_method = HistogramParams::LZ77Method::kLZ77; + } + if (cparams.decoding_speed_tier >= 1) { + params.max_histograms = 12; + } + BuildAndEncodeHistograms(params, kNumTreeContexts, tree_tokens, &code, + &context_map, writer, kLayerModularTree, aux_out); + WriteTokens(tree_tokens[0], code, context_map, writer, kLayerModularTree, + aux_out); + params.image_widths = image_widths; + // Write histograms. 
+ BuildAndEncodeHistograms(params, (tree.size() + 1) / 2, tokens, &code, + &context_map, writer, kLayerModularGlobal, aux_out); + return true; +} + +Status ModularFrameEncoder::EncodeStream(BitWriter* writer, AuxOut* aux_out, + size_t layer, + const ModularStreamId& stream) { + size_t stream_id = stream.ID(frame_dim); + if (stream_images[stream_id].channel.empty()) { + return true; // Image with no channels, header never gets decoded. + } + JXL_RETURN_IF_ERROR( + Bundle::Write(stream_headers[stream_id], writer, layer, aux_out)); + WriteTokens(tokens[stream_id], code, context_map, writer, layer, aux_out); + return true; +} + +namespace { +float EstimateWPCost(const Image& img, size_t i) { + size_t extra_bits = 0; + float histo_cost = 0; + HybridUintConfig config; + int32_t cutoffs[] = {-500, -392, -255, -191, -127, -95, -63, -47, -31, + -23, -15, -11, -7, -4, -3, -1, 0, 1, + 3, 5, 7, 11, 15, 23, 31, 47, 63, + 95, 127, 191, 255, 392, 500}; + constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1; + Histogram histo[nc] = {}; + weighted::Header wp_header; + PredictorMode(i, &wp_header); + for (const Channel& ch : img.channel) { + const intptr_t onerow = ch.plane.PixelsPerRow(); + weighted::State wp_state(wp_header, ch.w, ch.h); + Properties properties(1); + for (size_t y = 0; y < ch.h; y++) { + const pixel_type* JXL_RESTRICT r = ch.Row(y); + for (size_t x = 0; x < ch.w; x++) { + size_t offset = 0; + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + pixel_type_w topright = + (x + 1 < ch.w && y ? *(r + x + 1 - onerow) : top); + pixel_type_w toptop = (y > 1 ? 
*(r + x - onerow - onerow) : top); + pixel_type guess = wp_state.Predict( + x, y, ch.w, top, left, topright, topleft, toptop, &properties, + offset); + size_t ctx = 0; + for (int c : cutoffs) { + ctx += c >= properties[0]; + } + pixel_type res = r[x] - guess; + uint32_t token, nbits, bits; + config.Encode(PackSigned(res), &token, &nbits, &bits); + histo[ctx].Add(token); + extra_bits += nbits; + wp_state.UpdateErrors(r[x], x, y, ch.w); + } + } + for (size_t h = 0; h < nc; h++) { + histo_cost += histo[h].ShannonEntropy(); + histo[h].Clear(); + } + } + return histo_cost + extra_bits; +} + +float EstimateCost(const Image& img) { + // TODO(veluca): consider SIMDfication of this code. + size_t extra_bits = 0; + float histo_cost = 0; + HybridUintConfig config; + uint32_t cutoffs[] = {0, 1, 3, 5, 7, 11, 15, 23, 31, + 47, 63, 95, 127, 191, 255, 392, 500}; + constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1; + Histogram histo[nc] = {}; + for (const Channel& ch : img.channel) { + const intptr_t onerow = ch.plane.PixelsPerRow(); + for (size_t y = 0; y < ch.h; y++) { + const pixel_type* JXL_RESTRICT r = ch.Row(y); + for (size_t x = 0; x < ch.w; x++) { + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? 
*(r + x - 1 - onerow) : left); + size_t maxdiff = std::max(std::max(left, top), topleft) - + std::min(std::min(left, top), topleft); + size_t ctx = 0; + for (uint32_t c : cutoffs) { + ctx += c > maxdiff; + } + pixel_type res = r[x] - ClampedGradient(top, left, topleft); + uint32_t token, nbits, bits; + config.Encode(PackSigned(res), &token, &nbits, &bits); + histo[ctx].Add(token); + extra_bits += nbits; + } + } + for (size_t h = 0; h < nc; h++) { + histo_cost += histo[h].ShannonEntropy(); + histo[h].Clear(); + } + } + return histo_cost + extra_bits; +} + +} // namespace + +Status ModularFrameEncoder::PrepareStreamParams(const Rect& rect, + const CompressParams& cparams, + int minShift, int maxShift, + const ModularStreamId& stream, + bool do_color) { + size_t stream_id = stream.ID(frame_dim); + JXL_ASSERT(stream_id != 0); + Image& full_image = stream_images[0]; + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + Image& gi = stream_images[stream_id]; + gi = Image(xsize, ysize, full_image.bitdepth, 0); + // start at the first bigger-than-frame_dim.group_dim non-metachannel + size_t c = full_image.nb_meta_channels; + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + if (fc.w > frame_dim.group_dim || fc.h > frame_dim.group_dim) break; + } + for (; c < full_image.channel.size(); c++) { + Channel& fc = full_image.channel[c]; + int shift = std::min(fc.hshift, fc.vshift); + if (shift > maxShift) continue; + if (shift < minShift) continue; + Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift, + rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h); + if (r.xsize() == 0 || r.ysize() == 0) continue; + gi_channel[stream_id].push_back(c); + Channel gc(r.xsize(), r.ysize()); + gc.hshift = fc.hshift; + gc.vshift = fc.vshift; + for (size_t y = 0; y < r.ysize(); ++y) { + const pixel_type* const JXL_RESTRICT row_in = r.ConstRow(fc.plane, y); + pixel_type* const JXL_RESTRICT row_out = gc.Row(y); + for (size_t 
x = 0; x < r.xsize(); ++x) { + row_out[x] = row_in[x]; + } + } + gi.channel.emplace_back(std::move(gc)); + } + + // Do some per-group transforms + + float quality = cparams.quality_pair.first; + + // Local palette + // TODO(veluca): make this work with quantize-after-prediction in lossy mode. + if (quality == 100 && cparams.palette_colors != 0 && + cparams.speed_tier < SpeedTier::kCheetah) { + // all-channel palette (e.g. RGBA) + if (gi.channel.size() - gi.nb_meta_channels > 1) { + Transform maybe_palette(TransformId::kPalette); + maybe_palette.begin_c = gi.nb_meta_channels; + maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels; + maybe_palette.nb_colors = std::abs(cparams.palette_colors); + maybe_palette.ordered_palette = cparams.palette_colors >= 0; + do_transform(gi, maybe_palette, weighted::Header()); + } + // all-minus-one-channel palette (RGB with separate alpha, or CMY with + // separate K) + if (gi.channel.size() - gi.nb_meta_channels > 3) { + Transform maybe_palette_3(TransformId::kPalette); + maybe_palette_3.begin_c = gi.nb_meta_channels; + maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1; + maybe_palette_3.nb_colors = std::abs(cparams.palette_colors); + maybe_palette_3.ordered_palette = cparams.palette_colors >= 0; + maybe_palette_3.lossy_palette = cparams.lossy_palette; + if (maybe_palette_3.lossy_palette) { + maybe_palette_3.predictor = Predictor::Weighted; + } + do_transform(gi, maybe_palette_3, weighted::Header()); + } + } + + // Local channel palette + if (cparams.channel_colors_percent > 0 && quality == 100 && + !cparams.lossy_palette && cparams.speed_tier < SpeedTier::kCheetah) { + // single channel palette (like FLIF's ChannelCompact) + size_t nb_channels = gi.channel.size() - gi.nb_meta_channels; + for (size_t i = 0; i < nb_channels; i++) { + int min, max; + compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max); + int colors = max - min + 1; + JXL_DEBUG_V(10, "Channel %zu: range=%i..%i", i, min, max); + 
Transform maybe_palette_1(TransformId::kPalette); + maybe_palette_1.begin_c = i + gi.nb_meta_channels; + maybe_palette_1.num_c = 1; + // simple heuristic: if less than X percent of the values in the range + // actually occur, it is probably worth it to do a compaction + // (but only if the channel palette is less than 80% the size of the + // image itself) + maybe_palette_1.nb_colors = + std::min((int)(xsize * ysize * 0.8), + (int)(cparams.channel_colors_percent / 100. * colors)); + do_transform(gi, maybe_palette_1, weighted::Header()); + } + } + + // lossless and no specific color transform specified: try Nothing, YCoCg, + // and 17 RCTs + if (cparams.color_transform == ColorTransform::kNone && quality == 100 && + cparams.colorspace < 0 && gi.channel.size() - gi.nb_meta_channels >= 3 && + cparams.responsive == false && do_color && + cparams.speed_tier <= SpeedTier::kHare) { + Transform sg(TransformId::kRCT); + sg.begin_c = gi.nb_meta_channels; + size_t nb_rcts_to_try = 0; + switch (cparams.speed_tier) { + case SpeedTier::kLightning: + case SpeedTier::kThunder: + case SpeedTier::kFalcon: + case SpeedTier::kCheetah: + nb_rcts_to_try = 0; // Just do global YCoCg + break; + case SpeedTier::kHare: + nb_rcts_to_try = 4; + break; + case SpeedTier::kWombat: + nb_rcts_to_try = 5; + break; + case SpeedTier::kSquirrel: + nb_rcts_to_try = 7; + break; + case SpeedTier::kKitten: + nb_rcts_to_try = 9; + break; + case SpeedTier::kTortoise: + nb_rcts_to_try = 19; + break; + } + float best_cost = std::numeric_limits::max(); + size_t best_rct = 0; + // These should be 19 actually different transforms; the remaining ones + // are equivalent to one of these (note that the first two are do-nothing + // and YCoCg) modulo channel reordering (which only matters in the case of + // MA-with-prev-channels-properties) and/or sign (e.g. 
RmG vs GmR) + for (int i : {0 * 7 + 0, 0 * 7 + 6, 0 * 7 + 5, 1 * 7 + 3, 3 * 7 + 5, + 5 * 7 + 5, 1 * 7 + 5, 2 * 7 + 5, 1 * 7 + 1, 0 * 7 + 4, + 1 * 7 + 2, 2 * 7 + 1, 2 * 7 + 2, 2 * 7 + 3, 4 * 7 + 4, + 4 * 7 + 5, 0 * 7 + 2, 0 * 7 + 1, 0 * 7 + 3}) { + if (nb_rcts_to_try == 0) break; + int num_transforms_to_keep = gi.transform.size(); + sg.rct_type = i; + do_transform(gi, sg, weighted::Header()); + float cost = EstimateCost(gi); + if (cost < best_cost) { + best_rct = i; + best_cost = cost; + } + nb_rcts_to_try--; + // Ensure we do not clamp channels to their supposed range, as this + // otherwise breaks in the presence of patches. + gi.undo_transforms(weighted::Header(), num_transforms_to_keep == 0 + ? -1 + : num_transforms_to_keep); + } + // Apply the best RCT to the image for future encoding. + sg.rct_type = best_rct; + do_transform(gi, sg, weighted::Header()); + } else { + // No need to try anything, just use the default options. + } + size_t nb_wp_modes = 1; + if (cparams.speed_tier <= SpeedTier::kTortoise) { + nb_wp_modes = 5; + } else if (cparams.speed_tier <= SpeedTier::kKitten) { + nb_wp_modes = 2; + } + if (nb_wp_modes > 1 && + (stream_options[stream_id].predictor == Predictor::Weighted || + stream_options[stream_id].predictor == Predictor::Best || + stream_options[stream_id].predictor == Predictor::Variable)) { + float best_cost = std::numeric_limits::max(); + stream_options[stream_id].wp_mode = 0; + for (size_t i = 0; i < nb_wp_modes; i++) { + float cost = EstimateWPCost(gi, i); + if (cost < best_cost) { + best_cost = cost; + stream_options[stream_id].wp_mode = i; + } + } + } + return true; +} + +int QuantizeWP(const int32_t* qrow, size_t onerow, size_t c, size_t x, size_t y, + size_t w, weighted::State* wp_state, float value, + float inv_factor) { + float svalue = value * inv_factor; + PredictionResult pred = + PredictNoTreeWP(w, qrow + x, onerow, x, y, Predictor::Weighted, wp_state); + svalue -= pred.guess; + int residual = roundf(svalue); + if (residual > 
2 || residual < -2) residual = roundf(svalue * 0.5) * 2; + return residual + pred.guess; +} + +int QuantizeGradient(const int32_t* qrow, size_t onerow, size_t c, size_t x, + size_t y, size_t w, float value, float inv_factor) { + float svalue = value * inv_factor; + PredictionResult pred = + PredictNoTreeNoWP(w, qrow + x, onerow, x, y, Predictor::Gradient); + svalue -= pred.guess; + int residual = roundf(svalue); + if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2; + return residual + pred.guess; +} + +void ModularFrameEncoder::AddVarDCTDC(const Image3F& dc, size_t group_index, + bool nl_dc, + PassesEncoderState* enc_state) { + const Rect r = enc_state->shared.DCGroupRect(group_index); + extra_dc_precision[group_index] = nl_dc ? 1 : 0; + float mul = 1 << extra_dc_precision[group_index]; + + size_t stream_id = ModularStreamId::VarDCTDC(group_index).ID(frame_dim); + stream_options[stream_id].max_chan_size = 0xFFFFFF; + stream_options[stream_id].predictor = Predictor::Weighted; + stream_options[stream_id].wp_tree_mode = ModularOptions::TreeMode::kWPOnly; + if (cparams.speed_tier >= SpeedTier::kSquirrel) { + stream_options[stream_id].tree_kind = ModularOptions::TreeKind::kWPFixedDC; + } + if (cparams.decoding_speed_tier >= 1) { + stream_options[stream_id].tree_kind = + ModularOptions::TreeKind::kGradientFixedDC; + } + + stream_images[stream_id] = Image(r.xsize(), r.ysize(), 8, 3); + if (nl_dc && stream_options[stream_id].tree_kind == + ModularOptions::TreeKind::kGradientFixedDC) { + JXL_ASSERT(enc_state->shared.frame_header.chroma_subsampling.Is444()); + for (size_t c : {1, 0, 2}) { + float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul; + float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul; + float cfl_factor = enc_state->shared.cmap.DCFactors()[c]; + for (size_t y = 0; y < r.ysize(); y++) { + int32_t* quant_row = + stream_images[stream_id].channel[c < 2 ? 
c ^ 1 : c].plane.Row(y); + size_t stride = stream_images[stream_id] + .channel[c < 2 ? c ^ 1 : c] + .plane.PixelsPerRow(); + const float* row = r.ConstPlaneRow(dc, c, y); + if (c == 1) { + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = QuantizeGradient(quant_row, stride, c, x, y, + r.xsize(), row[x], inv_factor); + } + } else { + int32_t* quant_row_y = + stream_images[stream_id].channel[0].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = QuantizeGradient( + quant_row, stride, c, x, y, r.xsize(), + row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor); + } + } + } + } + } else if (nl_dc) { + JXL_ASSERT(enc_state->shared.frame_header.chroma_subsampling.Is444()); + for (size_t c : {1, 0, 2}) { + float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul; + float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul; + float cfl_factor = enc_state->shared.cmap.DCFactors()[c]; + weighted::Header header; + weighted::State wp_state(header, r.xsize(), r.ysize()); + for (size_t y = 0; y < r.ysize(); y++) { + int32_t* quant_row = + stream_images[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y); + size_t stride = stream_images[stream_id] + .channel[c < 2 ? 
c ^ 1 : c] + .plane.PixelsPerRow(); + const float* row = r.ConstPlaneRow(dc, c, y); + if (c == 1) { + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = QuantizeWP(quant_row, stride, c, x, y, r.xsize(), + &wp_state, row[x], inv_factor); + wp_state.UpdateErrors(quant_row[x], x, y, r.xsize()); + } + } else { + int32_t* quant_row_y = + stream_images[stream_id].channel[0].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = QuantizeWP( + quant_row, stride, c, x, y, r.xsize(), &wp_state, + row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor); + wp_state.UpdateErrors(quant_row[x], x, y, r.xsize()); + } + } + } + } + } else if (enc_state->shared.frame_header.chroma_subsampling.Is444()) { + for (size_t c : {1, 0, 2}) { + float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul; + float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul; + float cfl_factor = enc_state->shared.cmap.DCFactors()[c]; + for (size_t y = 0; y < r.ysize(); y++) { + int32_t* quant_row = + stream_images[stream_id].channel[c < 2 ? 
c ^ 1 : c].plane.Row(y); + const float* row = r.ConstPlaneRow(dc, c, y); + if (c == 1) { + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = roundf(row[x] * inv_factor); + } + } else { + int32_t* quant_row_y = + stream_images[stream_id].channel[0].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x++) { + quant_row[x] = + roundf((row[x] - quant_row_y[x] * (y_factor * cfl_factor)) * + inv_factor); + } + } + } + } + } else { + for (size_t c : {1, 0, 2}) { + Rect rect( + r.x0() >> enc_state->shared.frame_header.chroma_subsampling.HShift(c), + r.y0() >> enc_state->shared.frame_header.chroma_subsampling.VShift(c), + r.xsize() >> + enc_state->shared.frame_header.chroma_subsampling.HShift(c), + r.ysize() >> + enc_state->shared.frame_header.chroma_subsampling.VShift(c)); + float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul; + size_t ys = rect.ysize(); + size_t xs = rect.xsize(); + Channel& ch = stream_images[stream_id].channel[c < 2 ? c ^ 1 : c]; + ch.w = xs; + ch.h = ys; + ch.shrink(); + for (size_t y = 0; y < ys; y++) { + int32_t* quant_row = ch.plane.Row(y); + const float* row = rect.ConstPlaneRow(dc, c, y); + for (size_t x = 0; x < xs; x++) { + quant_row[x] = roundf(row[x] * inv_factor); + } + } + } + } + + DequantDC(r, &enc_state->shared.dc_storage, &enc_state->shared.quant_dc, + stream_images[stream_id], enc_state->shared.quantizer.MulDC(), + 1.0 / mul, enc_state->shared.cmap.DCFactors(), + enc_state->shared.frame_header.chroma_subsampling, + enc_state->shared.block_ctx_map); +} + +void ModularFrameEncoder::AddACMetadata(size_t group_index, bool jpeg_transcode, + PassesEncoderState* enc_state) { + const Rect r = enc_state->shared.DCGroupRect(group_index); + size_t stream_id = ModularStreamId::ACMetadata(group_index).ID(frame_dim); + stream_options[stream_id].max_chan_size = 0xFFFFFF; + stream_options[stream_id].wp_tree_mode = ModularOptions::TreeMode::kNoWP; + if (jpeg_transcode) { + stream_options[stream_id].tree_kind = + 
ModularOptions::TreeKind::kJpegTranscodeACMeta; + } else if (cparams.speed_tier >= SpeedTier::kFalcon) { + stream_options[stream_id].tree_kind = + ModularOptions::TreeKind::kFalconACMeta; + } else if (cparams.speed_tier > SpeedTier::kKitten) { + stream_options[stream_id].tree_kind = ModularOptions::TreeKind::kACMeta; + } + // If we are using a non-constant CfL field, and are in a slow enough mode, + // re-enable tree computation for it. + if (cparams.speed_tier < SpeedTier::kSquirrel && + cparams.force_cfl_jpeg_recompression) { + stream_options[stream_id].tree_kind = ModularOptions::TreeKind::kLearn; + } + // YToX, YToB, ACS + QF, EPF + Image& image = stream_images[stream_id]; + image = Image(r.xsize(), r.ysize(), 8, 4); + static_assert(kColorTileDimInBlocks == 8, "Color tile size changed"); + Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3); + image.channel[0] = Channel(cr.xsize(), cr.ysize(), 3, 3); + image.channel[1] = Channel(cr.xsize(), cr.ysize(), 3, 3); + image.channel[2] = Channel(r.xsize() * r.ysize(), 2, 0, 0); + ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytox_map, + Rect(image.channel[0].plane), &image.channel[0].plane); + ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytob_map, + Rect(image.channel[1].plane), &image.channel[1].plane); + size_t num = 0; + for (size_t y = 0; y < r.ysize(); y++) { + AcStrategyRow row_acs = enc_state->shared.ac_strategy.ConstRow(r, y); + const int* row_qf = r.ConstRow(enc_state->shared.raw_quant_field, y); + const uint8_t* row_epf = r.ConstRow(enc_state->shared.epf_sharpness, y); + int* out_acs = image.channel[2].plane.Row(0); + int* out_qf = image.channel[2].plane.Row(1); + int* row_out_epf = image.channel[3].plane.Row(y); + for (size_t x = 0; x < r.xsize(); x++) { + row_out_epf[x] = row_epf[x]; + if (!row_acs[x].IsFirstBlock()) continue; + out_acs[num] = row_acs[x].RawStrategy(); + out_qf[num] = row_qf[x] - 1; + num++; + } + } + image.channel[2].w = num; + ac_metadata_size[group_index] = 
num; +} + +void ModularFrameEncoder::EncodeQuantTable( + size_t size_x, size_t size_y, BitWriter* writer, + const QuantEncoding& encoding, size_t idx, + ModularFrameEncoder* modular_frame_encoder) { + JXL_ASSERT(encoding.qraw.qtable != nullptr); + JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size()); + JXL_CHECK(F16Coder::Write(encoding.qraw.qtable_den, writer)); + if (modular_frame_encoder) { + JXL_CHECK(modular_frame_encoder->EncodeStream( + writer, nullptr, 0, ModularStreamId::QuantTable(idx))); + return; + } + Image image(size_x, size_y, 8, 3); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < size_y; y++) { + int* JXL_RESTRICT row = image.channel[c].Row(y); + for (size_t x = 0; x < size_x; x++) { + row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x]; + } + } + } + ModularOptions cfopts; + JXL_CHECK(ModularGenericCompress(image, cfopts, writer)); +} + +void ModularFrameEncoder::AddQuantTable(size_t size_x, size_t size_y, + const QuantEncoding& encoding, + size_t idx) { + size_t stream_id = ModularStreamId::QuantTable(idx).ID(frame_dim); + JXL_ASSERT(encoding.qraw.qtable != nullptr); + JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size()); + Image& image = stream_images[stream_id]; + image = Image(size_x, size_y, 8, 3); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < size_y; y++) { + int* JXL_RESTRICT row = image.channel[c].Row(y); + for (size_t x = 0; x < size_x; x++) { + row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x]; + } + } + } +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.h new file mode 100644 index 0000000000..30a6610d6b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_modular.h @@ -0,0 +1,94 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_MODULAR_H_ +#define LIB_JXL_ENC_MODULAR_H_ + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +class ModularFrameEncoder { + public: + ModularFrameEncoder(const FrameHeader& frame_header, + const CompressParams& cparams_orig); + Status ComputeEncodingData(const FrameHeader& frame_header, + const ImageMetadata& metadata, + Image3F* JXL_RESTRICT color, + const std::vector& extra_channels, + PassesEncoderState* JXL_RESTRICT enc_state, + ThreadPool* pool, AuxOut* aux_out, bool do_color); + // Encodes global info (tree + histograms) in the `writer`. + Status EncodeGlobalInfo(BitWriter* writer, AuxOut* aux_out); + // Encodes a specific modular image (identified by `stream`) in the `writer`, + // assigning bits to the provided `layer`. + Status EncodeStream(BitWriter* writer, AuxOut* aux_out, size_t layer, + const ModularStreamId& stream); + // Creates a modular image for a given DC group of VarDCT mode. `dc` is the + // input DC image, not quantized; the group is specified by `group_index`, and + // `nl_dc` decides whether to apply a near-lossless processing to the DC or + // not. + void AddVarDCTDC(const Image3F& dc, size_t group_index, bool nl_dc, + PassesEncoderState* enc_state); + // Creates a modular image for the AC metadata of the given group + // (`group_index`). + void AddACMetadata(size_t group_index, bool jpeg_transcode, + PassesEncoderState* enc_state); + // Encodes a RAW quantization table in `writer`. 
If `modular_frame_encoder` is + // null, the quantization table in `encoding` is used, with dimensions `size_x + // x size_y`. Otherwise, the table with ID `idx` is encoded from the given + // `modular_frame_encoder`. + static void EncodeQuantTable(size_t size_x, size_t size_y, BitWriter* writer, + const QuantEncoding& encoding, size_t idx, + ModularFrameEncoder* modular_frame_encoder); + // Stores a quantization table for future usage with `EncodeQuantTable`. + void AddQuantTable(size_t size_x, size_t size_y, + const QuantEncoding& encoding, size_t idx); + + std::vector ac_metadata_size; + std::vector extra_dc_precision; + + std::vector stream_images; + std::vector stream_options; + + Tree tree; + std::vector> tree_tokens; + std::vector stream_headers; + std::vector> tokens; + EntropyEncodingData code; + std::vector context_map; + FrameDimensions frame_dim; + CompressParams cparams; + float quality = cparams.quality_pair.first; + float cquality = cparams.quality_pair.second; + std::vector tree_splits; + std::vector multiplier_info; + std::vector> gi_channel; + std::vector image_widths; + + private: + Status PrepareEncoding(ThreadPool* pool, const FrameDimensions& frame_dim, + EncoderHeuristics* heuristics, + AuxOut* aux_out = nullptr); + Status PrepareStreamParams(const Rect& rect, const CompressParams& cparams, + int minShift, int maxShift, + const ModularStreamId& stream, bool do_color); + +}; + +} // namespace jxl + +#endif // LIB_JXL_ENC_MODULAR_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc new file mode 100644 index 0000000000..383b7922f9 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.cc @@ -0,0 +1,378 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_noise.h" + +#include +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/robust_statistics.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/optimize.h" + +namespace jxl { +namespace { + +using OptimizeArray = optimize::Array; + +float GetScoreSumsOfAbsoluteDifferences(const Image3F& opsin, const int x, + const int y, const int block_size) { + const int small_bl_size_x = 3; + const int small_bl_size_y = 4; + const int kNumSAD = + (block_size - small_bl_size_x) * (block_size - small_bl_size_y); + // block_size x block_size reference pixels + int counter = 0; + const int offset = 2; + + std::vector sad(kNumSAD, 0); + for (int y_bl = 0; y_bl + small_bl_size_y < block_size; ++y_bl) { + for (int x_bl = 0; x_bl + small_bl_size_x < block_size; ++x_bl) { + float sad_sum = 0; + // size of the center patch, we compare all the patches inside window with + // the center one + for (int cy = 0; cy < small_bl_size_y; ++cy) { + for (int cx = 0; cx < small_bl_size_x; ++cx) { + float wnd = 0.5f * (opsin.PlaneRow(1, y + y_bl + cy)[x + x_bl + cx] + + opsin.PlaneRow(0, y + y_bl + cy)[x + x_bl + cx]); + float center = + 0.5f * (opsin.PlaneRow(1, y + offset + cy)[x + offset + cx] + + opsin.PlaneRow(0, y + offset + cy)[x + offset + cx]); + sad_sum += std::abs(center - wnd); + } + } + sad[counter++] = sad_sum; + } + } + const int kSamples = (kNumSAD) / 2; + // As with ROAD (rank order absolute distance), we keep the smallest half of + // the values in SAD (we use here the more robust patch SAD instead of + // absolute single-pixel differences). 
+ std::sort(sad.begin(), sad.end()); + const float total_sad_sum = + std::accumulate(sad.begin(), sad.begin() + kSamples, 0.0f); + return total_sad_sum / kSamples; +} + +class NoiseHistogram { + public: + static constexpr int kBins = 256; + + NoiseHistogram() { std::fill(bins, bins + kBins, 0); } + + void Increment(const float x) { bins[Index(x)] += 1; } + int Get(const float x) const { return bins[Index(x)]; } + int Bin(const size_t bin) const { return bins[bin]; } + + void Print() const { + for (unsigned int bin : bins) { + printf("%d\n", bin); + } + } + + int Mode() const { + uint32_t cdf[kBins]; + std::partial_sum(bins, bins + kBins, cdf); + return HalfRangeMode()(cdf, kBins); + } + + double Quantile(double q01) const { + const int64_t total = std::accumulate(bins, bins + kBins, int64_t{1}); + const int64_t target = static_cast(q01 * total); + // Until sum >= target: + int64_t sum = 0; + size_t i = 0; + for (; i < kBins; ++i) { + sum += bins[i]; + // Exact match: assume middle of bin i + if (sum == target) { + return i + 0.5; + } + if (sum > target) break; + } + + // Next non-empty bin (in case histogram is sparsely filled) + size_t next = i + 1; + while (next < kBins && bins[next] == 0) { + ++next; + } + + // Linear interpolation according to how far into next we went + const double excess = target - sum; + const double weight_next = bins[Index(next)] / excess; + return ClampX(next * weight_next + i * (1.0 - weight_next)); + } + + // Inter-quartile range + double IQR() const { return Quantile(0.75) - Quantile(0.25); } + + private: + template + T ClampX(const T x) const { + return std::min(std::max(T(0), x), T(kBins - 1)); + } + size_t Index(const float x) const { return ClampX(static_cast(x)); } + + uint32_t bins[kBins]; +}; + +std::vector GetSADScoresForPatches(const Image3F& opsin, + const size_t block_s, + const size_t num_bin, + NoiseHistogram* sad_histogram) { + std::vector sad_scores( + (opsin.ysize() / block_s) * (opsin.xsize() / block_s), 0.0f); + + 
int block_index = 0; + + for (size_t y = 0; y + block_s <= opsin.ysize(); y += block_s) { + for (size_t x = 0; x + block_s <= opsin.xsize(); x += block_s) { + float sad_sc = GetScoreSumsOfAbsoluteDifferences(opsin, x, y, block_s); + sad_scores[block_index++] = sad_sc; + sad_histogram->Increment(sad_sc * num_bin); + } + } + return sad_scores; +} + +float GetSADThreshold(const NoiseHistogram& histogram, const int num_bin) { + // Here we assume that the most patches with similar SAD value is a "flat" + // patches. However, some images might contain regular texture part and + // generate second strong peak at the histogram + // TODO(user) handle bimodal and heavy-tailed case + const int mode = histogram.Mode(); + return static_cast(mode) / NoiseHistogram::kBins; +} + +// loss = sum asym * (F(x) - nl)^2 + kReg * num_points * sum (w[i] - w[i+1])^2 +// where asym = 1 if F(x) < nl, kAsym if F(x) > nl. +struct LossFunction { + explicit LossFunction(std::vector nl0) : nl(std::move(nl0)) {} + + double Compute(const OptimizeArray& w, OptimizeArray* df, + bool skip_regularization = false) const { + constexpr double kReg = 0.005; + constexpr double kAsym = 1.1; + double loss_function = 0; + for (size_t i = 0; i < w.size(); i++) { + (*df)[i] = 0; + } + for (auto ind : nl) { + std::pair pos = IndexAndFrac(ind.intensity); + JXL_DASSERT(pos.first >= 0 && static_cast(pos.first) < + NoiseParams::kNumNoisePoints - 1); + double low = w[pos.first]; + double hi = w[pos.first + 1]; + double val = low * (1.0f - pos.second) + hi * pos.second; + double dist = val - ind.noise_level; + if (dist > 0) { + loss_function += kAsym * dist * dist; + (*df)[pos.first] -= kAsym * (1.0f - pos.second) * dist; + (*df)[pos.first + 1] -= kAsym * pos.second * dist; + } else { + loss_function += dist * dist; + (*df)[pos.first] -= (1.0f - pos.second) * dist; + (*df)[pos.first + 1] -= pos.second * dist; + } + } + if (skip_regularization) return loss_function; + for (size_t i = 0; i + 1 < w.size(); i++) { + double 
diff = w[i] - w[i + 1]; + loss_function += kReg * nl.size() * diff * diff; + (*df)[i] -= kReg * diff * nl.size(); + (*df)[i + 1] += kReg * diff * nl.size(); + } + return loss_function; + } + + std::vector nl; +}; + +void OptimizeNoiseParameters(const std::vector& noise_level, + NoiseParams* noise_params) { + constexpr double kMaxError = 1e-3; + static const double kPrecision = 1e-8; + static const int kMaxIter = 40; + + float avg = 0; + for (const NoiseLevel& nl : noise_level) { + avg += nl.noise_level; + } + avg /= noise_level.size(); + + LossFunction loss_function(noise_level); + OptimizeArray parameter_vector; + for (size_t i = 0; i < parameter_vector.size(); i++) { + parameter_vector[i] = avg; + } + + parameter_vector = optimize::OptimizeWithScaledConjugateGradientMethod( + loss_function, parameter_vector, kPrecision, kMaxIter); + + OptimizeArray df = parameter_vector; + float loss = loss_function.Compute(parameter_vector, &df, + /*skip_regularization=*/true) / + noise_level.size(); + + // Approximation went too badly: escape with no noise at all. 
+ if (loss > kMaxError) { + noise_params->Clear(); + return; + } + + for (size_t i = 0; i < parameter_vector.size(); i++) { + noise_params->lut[i] = std::max(parameter_vector[i], 0.0); + } +} + +std::vector GetNoiseLevel( + const Image3F& opsin, const std::vector& texture_strength, + const float threshold, const size_t block_s) { + std::vector noise_level_per_intensity; + + const int filt_size = 1; + static const float kLaplFilter[filt_size * 2 + 1][filt_size * 2 + 1] = { + {-0.25f, -1.0f, -0.25f}, + {-1.0f, 5.0f, -1.0f}, + {-0.25f, -1.0f, -0.25f}, + }; + + // The noise model is built based on channel 0.5 * (X+Y) as we notice that it + // is similar to the model 0.5 * (Y-X) + size_t patch_index = 0; + + for (size_t y = 0; y + block_s <= opsin.ysize(); y += block_s) { + for (size_t x = 0; x + block_s <= opsin.xsize(); x += block_s) { + if (texture_strength[patch_index] <= threshold) { + // Calculate mean value + float mean_int = 0; + for (size_t y_bl = 0; y_bl < block_s; ++y_bl) { + for (size_t x_bl = 0; x_bl < block_s; ++x_bl) { + mean_int += 0.5f * (opsin.PlaneRow(1, y + y_bl)[x + x_bl] + + opsin.PlaneRow(0, y + y_bl)[x + x_bl]); + } + } + mean_int /= block_s * block_s; + + // Calculate Noise level + float noise_level = 0; + size_t count = 0; + for (size_t y_bl = 0; y_bl < block_s; ++y_bl) { + for (size_t x_bl = 0; x_bl < block_s; ++x_bl) { + float filtered_value = 0; + for (int y_f = -1 * filt_size; y_f <= filt_size; ++y_f) { + if ((static_cast(y_bl) + y_f) >= 0 && + (y_bl + y_f) < block_s) { + for (int x_f = -1 * filt_size; x_f <= filt_size; ++x_f) { + if ((static_cast(x_bl) + x_f) >= 0 && + (x_bl + x_f) < block_s) { + filtered_value += + 0.5f * + (opsin.PlaneRow(1, y + y_bl + y_f)[x + x_bl + x_f] + + opsin.PlaneRow(0, y + y_bl + y_f)[x + x_bl + x_f]) * + kLaplFilter[y_f + filt_size][x_f + filt_size]; + } else { + filtered_value += + 0.5f * + (opsin.PlaneRow(1, y + y_bl + y_f)[x + x_bl - x_f] + + opsin.PlaneRow(0, y + y_bl + y_f)[x + x_bl - x_f]) * + 
kLaplFilter[y_f + filt_size][x_f + filt_size]; + } + } + } else { + for (int x_f = -1 * filt_size; x_f <= filt_size; ++x_f) { + if ((static_cast(x_bl) + x_f) >= 0 && + (x_bl + x_f) < block_s) { + filtered_value += + 0.5f * + (opsin.PlaneRow(1, y + y_bl - y_f)[x + x_bl + x_f] + + opsin.PlaneRow(0, y + y_bl - y_f)[x + x_bl + x_f]) * + kLaplFilter[y_f + filt_size][x_f + filt_size]; + } else { + filtered_value += + 0.5f * + (opsin.PlaneRow(1, y + y_bl - y_f)[x + x_bl - x_f] + + opsin.PlaneRow(0, y + y_bl - y_f)[x + x_bl - x_f]) * + kLaplFilter[y_f + filt_size][x_f + filt_size]; + } + } + } + } + noise_level += std::abs(filtered_value); + ++count; + } + } + noise_level /= count; + NoiseLevel nl; + nl.intensity = mean_int; + nl.noise_level = noise_level; + noise_level_per_intensity.push_back(nl); + } + ++patch_index; + } + } + return noise_level_per_intensity; +} + +void EncodeFloatParam(float val, float precision, BitWriter* writer) { + JXL_ASSERT(val >= 0); + const int absval_quant = static_cast(val * precision + 0.5f); + JXL_ASSERT(absval_quant < (1 << 10)); + writer->Write(10, absval_quant); +} + +} // namespace + +Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params, + float quality_coef) { + // The size of a patch in decoder might be different from encoder's patch + // size. + // For encoder: the patch size should be big enough to estimate + // noise level, but, at the same time, it should be not too big + // to be able to estimate intensity value of the patch + const size_t block_s = 8; + const size_t kNumBin = 256; + NoiseHistogram sad_histogram; + std::vector sad_scores = + GetSADScoresForPatches(opsin, block_s, kNumBin, &sad_histogram); + float sad_threshold = GetSADThreshold(sad_histogram, kNumBin); + // If threshold is too large, the image has a strong pattern. This pattern + // fools our model and it will add too much noise. 
Therefore, we do not add + // noise for such images + if (sad_threshold > 0.15f || sad_threshold <= 0.0f) { + noise_params->Clear(); + return false; + } + std::vector nl = + GetNoiseLevel(opsin, sad_scores, sad_threshold, block_s); + + OptimizeNoiseParameters(nl, noise_params); + for (float& i : noise_params->lut) { + i *= quality_coef * 1.4; + } + return noise_params->HasAny(); +} + +void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer, + size_t layer, AuxOut* aux_out) { + JXL_ASSERT(noise_params.HasAny()); + + BitWriter::Allotment allotment(writer, NoiseParams::kNumNoisePoints * 16); + for (float i : noise_params.lut) { + EncodeFloatParam(i, kNoisePrecision, writer); + } + ReclaimAndCharge(writer, &allotment, layer, aux_out); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.h new file mode 100644 index 0000000000..15fb07a8c8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_noise.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_NOISE_H_ +#define LIB_JXL_ENC_NOISE_H_ + +// Noise parameter estimation. + +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/image.h" +#include "lib/jxl/noise.h" + +namespace jxl { + +// Get parameters of the noise for NoiseParams model +// Returns whether a valid noise model (with HasAny()) is set. +Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params, + float quality_coef); + +// Does not write anything if `noise_params` are empty. Otherwise, caller must +// set FrameHeader.flags.kNoise. 
+void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer, + size_t layer, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_NOISE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_params.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_params.h new file mode 100644 index 0000000000..78a3a7cee0 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_params.h @@ -0,0 +1,270 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_PARAMS_H_ +#define LIB_JXL_ENC_PARAMS_H_ + +// Parameters and flags that govern JXL compression. + +#include +#include + +#include + +#include "lib/jxl/base/override.h" +#include "lib/jxl/butteraugli/butteraugli.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +enum class SpeedTier { + // Turns on FindBestQuantizationHQ loop. Equivalent to "guetzli" mode. + kTortoise = 1, + // Turns on FindBestQuantization butteraugli loop. + kKitten = 2, + // Turns on dots, patches, and spline detection by default, as well as full + // context clustering. Default. + kSquirrel = 3, + // Turns on error diffusion and full AC strategy heuristics. Equivalent to + // "fast" mode. + kWombat = 4, + // Turns on gaborish by default, non-default cmap, initial quant field. + kHare = 5, + // Turns on simple heuristics for AC strategy, quant field, and clustering; + // also enables coefficient reordering. + kCheetah = 6, + // Turns off most encoder features. Does context clustering. + // Modular: uses fixed tree with Weighted predictor. + kFalcon = 7, + // Currently fastest possible setting for VarDCT. + // Modular: uses fixed tree with Gradient predictor. + kThunder = 8, + // VarDCT: same as kThunder. 
+ // Modular: no tree, Gradient predictor, fast histograms + kLightning = 9 +}; + +inline bool ParseSpeedTier(const std::string& s, SpeedTier* out) { + if (s == "lightning") { + *out = SpeedTier::kLightning; + return true; + } else if (s == "thunder") { + *out = SpeedTier::kThunder; + return true; + } else if (s == "falcon") { + *out = SpeedTier::kFalcon; + return true; + } else if (s == "cheetah") { + *out = SpeedTier::kCheetah; + return true; + } else if (s == "hare") { + *out = SpeedTier::kHare; + return true; + } else if (s == "fast" || s == "wombat") { + *out = SpeedTier::kWombat; + return true; + } else if (s == "squirrel") { + *out = SpeedTier::kSquirrel; + return true; + } else if (s == "kitten") { + *out = SpeedTier::kKitten; + return true; + } else if (s == "guetzli" || s == "tortoise") { + *out = SpeedTier::kTortoise; + return true; + } + size_t st = 10 - static_cast(strtoull(s.c_str(), nullptr, 0)); + if (st <= static_cast(SpeedTier::kLightning) && + st >= static_cast(SpeedTier::kTortoise)) { + *out = SpeedTier(st); + return true; + } + return false; +} + +inline const char* SpeedTierName(SpeedTier speed_tier) { + switch (speed_tier) { + case SpeedTier::kLightning: + return "lightning"; + case SpeedTier::kThunder: + return "thunder"; + case SpeedTier::kFalcon: + return "falcon"; + case SpeedTier::kCheetah: + return "cheetah"; + case SpeedTier::kHare: + return "hare"; + case SpeedTier::kWombat: + return "wombat"; + case SpeedTier::kSquirrel: + return "squirrel"; + case SpeedTier::kKitten: + return "kitten"; + case SpeedTier::kTortoise: + return "tortoise"; + } + return "INVALID"; +} + +// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) +struct CompressParams { + float butteraugli_distance = 1.0f; + size_t target_size = 0; + float target_bitrate = 0.0f; + + // 0.0 means search for the adaptive quantization map that matches the + // butteraugli distance, positive values mean quantize everywhere with that + // value. 
+ float uniform_quant = 0.0f; + float quant_border_bias = 0.0f; + + // Try to achieve a maximum pixel-by-pixel error on each channel. + bool max_error_mode = false; + float max_error[3] = {0.0, 0.0, 0.0}; + + SpeedTier speed_tier = SpeedTier::kSquirrel; + + // 0 = default. + // 1 = slightly worse quality. + // 4 = fastest speed, lowest quality + // TODO(veluca): hook this up to the C API. + size_t decoding_speed_tier = 0; + + int max_butteraugli_iters = 4; + + int max_butteraugli_iters_guetzli_mode = 100; + + ColorTransform color_transform = ColorTransform::kXYB; + YCbCrChromaSubsampling chroma_subsampling; + + // If true, the "modular mode options" members below are used. + bool modular_mode = false; + + // Change group size in modular mode (0=128, 1=256, 2=512, 3=1024). + size_t modular_group_size_shift = 1; + + Override preview = Override::kDefault; + Override noise = Override::kDefault; + Override dots = Override::kDefault; + Override patches = Override::kDefault; + Override gaborish = Override::kDefault; + int epf = -1; + + // Progressive mode. + bool progressive_mode = false; + + // Quantized-progressive mode. + bool qprogressive_mode = false; + + // Put center groups first in the bitstream. + bool centerfirst = false; + + // Pixel coordinates of the center. First group will contain that center. + size_t center_x = static_cast(-1); + size_t center_y = static_cast(-1); + + int progressive_dc = -1; + + // If on: preserve color of invisible pixels (if off: don't care) + // Default: on for lossless, off for lossy + Override keep_invisible = Override::kDefault; + + // Progressive-mode saliency. + // + // How many progressive saliency-encoding steps to perform. + // - 1: Encode only DC and lowest-frequency AC. Does not need a saliency-map. + // - 2: Encode only DC+LF, dropping all HF AC data. + // Does not need a saliency-map. + // - 3: Encode DC+LF+{salient HF}, dropping all non-salient HF data. + // - 4: Encode DC+LF+{salient HF}+{other HF}. 
+ // - 5: Encode DC+LF+{quantized HF}+{low HF bits}. + size_t saliency_num_progressive_steps = 3; + // Every saliency-heatmap cell with saliency >= threshold will be considered + // as 'salient'. The default value of 0.0 will consider every AC-block + // as salient, hence not require a saliency-map, and not actually generate + // a 4th progressive step. + float saliency_threshold = 0.0f; + // Saliency-map (owned by caller). + ImageF* saliency_map = nullptr; + + // Input and output file name. Will be used to provide pluggable saliency + // extractor with paths. + const char* file_in = nullptr; + const char* file_out = nullptr; + + // Currently unused as of 2020-01. + bool clear_metadata = false; + + // Prints extra information during/after encoding. + bool verbose = false; + + ButteraugliParams ba_params; + + // Force usage of CfL when doing JPEG recompression. This can have unexpected + // effects on the decoded pixels, while still being JPEG-compliant and + // allowing reconstruction of the original JPEG. + bool force_cfl_jpeg_recompression = true; + + // Set the noise to what it would approximately be if shooting at the nominal + // exposure for a given ISO setting on a 35mm camera. + float photon_noise_iso = 0; + + // modular mode options below + ModularOptions options; + int responsive = -1; + // A pair of . 
+ std::pair quality_pair{100.f, 100.f}; + int colorspace = -1; + // Use Global channel palette if #colors < this percentage of range + float channel_colors_pre_transform_percent = 95.f; + // Use Local channel palette if #colors < this percentage of range + float channel_colors_percent = 80.f; + int palette_colors = 1 << 10; // up to 10-bit palette is probably worthwhile + bool lossy_palette = false; + + // Returns whether these params are lossless as defined by SetLossless(); + bool IsLossless() const { + return modular_mode && quality_pair.first == 100 && + quality_pair.second == 100 && + color_transform == jxl::ColorTransform::kNone; + } + + // Sets the parameters required to make the codec lossless. + void SetLossless() { + modular_mode = true; + quality_pair.first = 100; + quality_pair.second = 100; + color_transform = jxl::ColorTransform::kNone; + } + + bool use_new_heuristics = false; + + // Down/upsample the image before encoding / after decoding by this factor. + size_t resampling = 1; + size_t ec_resampling = 1; + // Skip the downsampling before encoding if this is true. + bool already_downsampled = false; +}; + +static constexpr float kMinButteraugliForDynamicAR = 0.5f; +static constexpr float kMinButteraugliForDots = 3.0f; +static constexpr float kMinButteraugliToSubtractOriginalPatches = 3.0f; +static constexpr float kMinButteraugliDistanceForProgressiveDc = 4.5f; + +// Always off +static constexpr float kMinButteraugliForNoise = 99.0f; + +// Minimum butteraugli distance the encoder accepts. +static constexpr float kMinButteraugliDistance = 0.01f; + +// Tile size for encoder-side processing. Must be equal to color tile dim in the +// current implementation. 
+static constexpr size_t kEncTileDim = 64; +static constexpr size_t kEncTileDimInBlocks = kEncTileDim / kBlockDim; + +} // namespace jxl + +#endif // LIB_JXL_ENC_PARAMS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc new file mode 100644 index 0000000000..5973acd63d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.cc @@ -0,0 +1,836 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_patch_dictionary.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_dot_dictionary.h" +#include "lib/jxl/enc_frame.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/patch_dictionary_internal.h" + +namespace jxl { + +// static +void PatchDictionaryEncoder::Encode(const PatchDictionary& pdic, + BitWriter* writer, size_t layer, + AuxOut* aux_out) { + JXL_ASSERT(pdic.HasAny()); + std::vector> tokens(1); + + auto add_num = [&](int context, size_t num) { + tokens[0].emplace_back(context, num); + }; + size_t num_ref_patch = 0; + for (size_t i = 0; i < pdic.positions_.size();) { + size_t i_start = i; + while (i < pdic.positions_.size() && + pdic.positions_[i].ref_pos == pdic.positions_[i_start].ref_pos) { + i++; + } + 
num_ref_patch++; + } + add_num(kNumRefPatchContext, num_ref_patch); + for (size_t i = 0; i < pdic.positions_.size();) { + size_t i_start = i; + while (i < pdic.positions_.size() && + pdic.positions_[i].ref_pos == pdic.positions_[i_start].ref_pos) { + i++; + } + size_t num = i - i_start; + JXL_ASSERT(num > 0); + add_num(kReferenceFrameContext, pdic.positions_[i_start].ref_pos.ref); + add_num(kPatchReferencePositionContext, + pdic.positions_[i_start].ref_pos.x0); + add_num(kPatchReferencePositionContext, + pdic.positions_[i_start].ref_pos.y0); + add_num(kPatchSizeContext, pdic.positions_[i_start].ref_pos.xsize - 1); + add_num(kPatchSizeContext, pdic.positions_[i_start].ref_pos.ysize - 1); + add_num(kPatchCountContext, num - 1); + for (size_t j = i_start; j < i; j++) { + const PatchPosition& pos = pdic.positions_[j]; + if (j == i_start) { + add_num(kPatchPositionContext, pos.x); + add_num(kPatchPositionContext, pos.y); + } else { + add_num(kPatchOffsetContext, + PackSigned(pos.x - pdic.positions_[j - 1].x)); + add_num(kPatchOffsetContext, + PackSigned(pos.y - pdic.positions_[j - 1].y)); + } + JXL_ASSERT(pdic.shared_->metadata->m.extra_channel_info.size() + 1 == + pos.blending.size()); + for (size_t i = 0; + i < pdic.shared_->metadata->m.extra_channel_info.size() + 1; i++) { + const PatchBlending& info = pos.blending[i]; + add_num(kPatchBlendModeContext, static_cast(info.mode)); + if (UsesAlpha(info.mode) && + pdic.shared_->metadata->m.extra_channel_info.size() > 1) { + add_num(kPatchAlphaChannelContext, info.alpha_channel); + } + if (UsesClamp(info.mode)) { + add_num(kPatchClampContext, info.clamp); + } + } + } + } + + EntropyEncodingData codes; + std::vector context_map; + BuildAndEncodeHistograms(HistogramParams(), kNumPatchDictionaryContexts, + tokens, &codes, &context_map, writer, layer, + aux_out); + WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out); +} + +// static +void PatchDictionaryEncoder::SubtractFrom(const PatchDictionary& pdic, + 
Image3F* opsin) { + // TODO(veluca): this can likely be optimized knowing it runs on full images. + for (size_t y = 0; y < opsin->ysize(); y++) { + if (y + 1 >= pdic.patch_starts_.size()) continue; + float* JXL_RESTRICT rows[3] = { + opsin->PlaneRow(0, y), + opsin->PlaneRow(1, y), + opsin->PlaneRow(2, y), + }; + for (size_t id = pdic.patch_starts_[y]; id < pdic.patch_starts_[y + 1]; + id++) { + const PatchPosition& pos = pdic.positions_[pdic.sorted_patches_[id]]; + size_t by = pos.y; + size_t bx = pos.x; + size_t xsize = pos.ref_pos.xsize; + JXL_DASSERT(y >= by); + JXL_DASSERT(y < by + pos.ref_pos.ysize); + size_t iy = y - by; + size_t ref = pos.ref_pos.ref; + const float* JXL_RESTRICT ref_rows[3] = { + pdic.shared_->reference_frames[ref].frame->color()->ConstPlaneRow( + 0, pos.ref_pos.y0 + iy) + + pos.ref_pos.x0, + pdic.shared_->reference_frames[ref].frame->color()->ConstPlaneRow( + 1, pos.ref_pos.y0 + iy) + + pos.ref_pos.x0, + pdic.shared_->reference_frames[ref].frame->color()->ConstPlaneRow( + 2, pos.ref_pos.y0 + iy) + + pos.ref_pos.x0, + }; + for (size_t ix = 0; ix < xsize; ix++) { + for (size_t c = 0; c < 3; c++) { + if (pos.blending[0].mode == PatchBlendMode::kAdd) { + rows[c][bx + ix] -= ref_rows[c][ix]; + } else if (pos.blending[0].mode == PatchBlendMode::kReplace) { + rows[c][bx + ix] = 0; + } else if (pos.blending[0].mode == PatchBlendMode::kNone) { + // Nothing to do. 
+ } else { + JXL_ABORT("Blending mode %u not yet implemented", + (uint32_t)pos.blending[0].mode); + } + } + } + } + } +} + +namespace { + +struct PatchColorspaceInfo { + float kChannelDequant[3]; + float kChannelWeights[3]; + + explicit PatchColorspaceInfo(bool is_xyb) { + if (is_xyb) { + kChannelDequant[0] = 0.01615; + kChannelDequant[1] = 0.08875; + kChannelDequant[2] = 0.1922; + kChannelWeights[0] = 30.0; + kChannelWeights[1] = 3.0; + kChannelWeights[2] = 1.0; + } else { + kChannelDequant[0] = 20.0f / 255; + kChannelDequant[1] = 22.0f / 255; + kChannelDequant[2] = 20.0f / 255; + kChannelWeights[0] = 0.017 * 255; + kChannelWeights[1] = 0.02 * 255; + kChannelWeights[2] = 0.017 * 255; + } + } + + float ScaleForQuantization(float val, size_t c) { + return val / kChannelDequant[c]; + } + + int Quantize(float val, size_t c) { + return truncf(ScaleForQuantization(val, c)); + } + + bool is_similar_v(const float v1[3], const float v2[3], float threshold) { + float distance = 0; + for (size_t c = 0; c < 3; c++) { + distance += std::fabs(v1[c] - v2[c]) * kChannelWeights[c]; + } + return distance <= threshold; + } +}; + +std::vector FindTextLikePatches( + const Image3F& opsin, const PassesEncoderState* JXL_RESTRICT state, + ThreadPool* pool, AuxOut* aux_out, bool is_xyb) { + if (state->cparams.patches == Override::kOff) return {}; + + PatchColorspaceInfo pci(is_xyb); + float kSimilarThreshold = 0.8f; + + auto is_similar_impl = [&pci](std::pair p1, + std::pair p2, + const float* JXL_RESTRICT rows[3], + size_t stride, float threshold) { + float v1[3], v2[3]; + for (size_t c = 0; c < 3; c++) { + v1[c] = rows[c][p1.second * stride + p1.first]; + v2[c] = rows[c][p2.second * stride + p2.first]; + } + return pci.is_similar_v(v1, v2, threshold); + }; + + std::atomic has_screenshot_areas{false}; + const size_t opsin_stride = opsin.PixelsPerRow(); + const float* JXL_RESTRICT opsin_rows[3] = {opsin.ConstPlaneRow(0, 0), + opsin.ConstPlaneRow(1, 0), + opsin.ConstPlaneRow(2, 0)}; + + 
auto is_same = [&opsin_rows, opsin_stride](std::pair p1, + std::pair p2) { + for (size_t c = 0; c < 3; c++) { + float v1 = opsin_rows[c][p1.second * opsin_stride + p1.first]; + float v2 = opsin_rows[c][p2.second * opsin_stride + p2.first]; + if (std::fabs(v1 - v2) > 1e-4) { + return false; + } + } + return true; + }; + + auto is_similar = [&](std::pair p1, + std::pair p2) { + return is_similar_impl(p1, p2, opsin_rows, opsin_stride, kSimilarThreshold); + }; + + constexpr int64_t kPatchSide = 4; + constexpr int64_t kExtraSide = 4; + + // Look for kPatchSide size squares, naturally aligned, that all have the same + // pixel values. + ImageB is_screenshot_like(DivCeil(opsin.xsize(), kPatchSide), + DivCeil(opsin.ysize(), kPatchSide)); + ZeroFillImage(&is_screenshot_like); + uint8_t* JXL_RESTRICT screenshot_row = is_screenshot_like.Row(0); + const size_t screenshot_stride = is_screenshot_like.PixelsPerRow(); + const auto process_row = [&](uint64_t y, int _) { + for (uint64_t x = 0; x < opsin.xsize() / kPatchSide; x++) { + bool all_same = true; + for (size_t iy = 0; iy < static_cast(kPatchSide); iy++) { + for (size_t ix = 0; ix < static_cast(kPatchSide); ix++) { + size_t cx = x * kPatchSide + ix; + size_t cy = y * kPatchSide + iy; + if (!is_same({cx, cy}, {x * kPatchSide, y * kPatchSide})) { + all_same = false; + break; + } + } + } + if (!all_same) continue; + size_t num = 0; + size_t num_same = 0; + for (int64_t iy = -kExtraSide; iy < kExtraSide + kPatchSide; iy++) { + for (int64_t ix = -kExtraSide; ix < kExtraSide + kPatchSide; ix++) { + int64_t cx = x * kPatchSide + ix; + int64_t cy = y * kPatchSide + iy; + if (cx < 0 || static_cast(cx) >= opsin.xsize() || // + cy < 0 || static_cast(cy) >= opsin.ysize()) { + continue; + } + num++; + if (is_same({cx, cy}, {x * kPatchSide, y * kPatchSide})) num_same++; + } + } + // Too few equal pixels nearby. 
+ if (num_same * 8 < num * 7) continue; + screenshot_row[y * screenshot_stride + x] = 1; + has_screenshot_areas = true; + } + }; + RunOnPool(pool, 0, opsin.ysize() / kPatchSide, ThreadPool::SkipInit(), + process_row, "IsScreenshotLike"); + + // TODO(veluca): also parallelize the rest of this function. + if (WantDebugOutput(aux_out)) { + aux_out->DumpPlaneNormalized("screenshot_like", is_screenshot_like); + } + + constexpr int kSearchRadius = 1; + + if (!ApplyOverride(state->cparams.patches, has_screenshot_areas)) { + return {}; + } + + // Search for "similar enough" pixels near the screenshot-like areas. + ImageB is_background(opsin.xsize(), opsin.ysize()); + ZeroFillImage(&is_background); + Image3F background(opsin.xsize(), opsin.ysize()); + ZeroFillImage(&background); + constexpr size_t kDistanceLimit = 50; + float* JXL_RESTRICT background_rows[3] = { + background.PlaneRow(0, 0), + background.PlaneRow(1, 0), + background.PlaneRow(2, 0), + }; + const size_t background_stride = background.PixelsPerRow(); + uint8_t* JXL_RESTRICT is_background_row = is_background.Row(0); + const size_t is_background_stride = is_background.PixelsPerRow(); + std::vector< + std::pair, std::pair>> + queue; + size_t queue_front = 0; + for (size_t y = 0; y < opsin.ysize(); y++) { + for (size_t x = 0; x < opsin.xsize(); x++) { + if (!screenshot_row[screenshot_stride * (y / kPatchSide) + + (x / kPatchSide)]) + continue; + queue.push_back({{x, y}, {x, y}}); + } + } + while (queue.size() != queue_front) { + std::pair cur = queue[queue_front].first; + std::pair src = queue[queue_front].second; + queue_front++; + if (is_background_row[cur.second * is_background_stride + cur.first]) + continue; + is_background_row[cur.second * is_background_stride + cur.first] = 1; + for (size_t c = 0; c < 3; c++) { + background_rows[c][cur.second * background_stride + cur.first] = + opsin_rows[c][src.second * opsin_stride + src.first]; + } + for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) { + for (int 
dy = -kSearchRadius; dy <= kSearchRadius; dy++) { + if (dx == 0 && dy == 0) continue; + int next_first = cur.first + dx; + int next_second = cur.second + dy; + if (next_first < 0 || next_second < 0 || + static_cast(next_first) >= opsin.xsize() || + static_cast(next_second) >= opsin.ysize()) { + continue; + } + if (static_cast( + std::abs(next_first - static_cast(src.first)) + + std::abs(next_second - static_cast(src.second))) > + kDistanceLimit) { + continue; + } + std::pair next{next_first, next_second}; + if (is_similar(src, next)) { + if (!screenshot_row[next.second / kPatchSide * screenshot_stride + + next.first / kPatchSide] || + is_same(src, next)) { + if (!is_background_row[next.second * is_background_stride + + next.first]) + queue.emplace_back(next, src); + } + } + } + } + } + queue.clear(); + + ImageF ccs; + std::mt19937 rng; + std::uniform_real_distribution dist(0.5, 1.0); + bool paint_ccs = false; + if (WantDebugOutput(aux_out)) { + aux_out->DumpPlaneNormalized("is_background", is_background); + if (is_xyb) { + aux_out->DumpXybImage("background", background); + } else { + aux_out->DumpImage("background", background); + } + ccs = ImageF(opsin.xsize(), opsin.ysize()); + ZeroFillImage(&ccs); + paint_ccs = true; + } + + constexpr float kVerySimilarThreshold = 0.03f; + constexpr float kHasSimilarThreshold = 0.03f; + + const float* JXL_RESTRICT const_background_rows[3] = { + background_rows[0], background_rows[1], background_rows[2]}; + auto is_similar_b = [&](std::pair p1, std::pair p2) { + return is_similar_impl(p1, p2, const_background_rows, background_stride, + kVerySimilarThreshold); + }; + + constexpr int kMinPeak = 2; + constexpr int kHasSimilarRadius = 2; + + std::vector info; + + // Find small CC outside the "similar enough" areas, compute bounding boxes, + // and run heuristics to exclude some patches. 
+ ImageB visited(opsin.xsize(), opsin.ysize()); + ZeroFillImage(&visited); + uint8_t* JXL_RESTRICT visited_row = visited.Row(0); + const size_t visited_stride = visited.PixelsPerRow(); + std::vector> cc; + std::vector> stack; + for (size_t y = 0; y < opsin.ysize(); y++) { + for (size_t x = 0; x < opsin.xsize(); x++) { + if (is_background_row[y * is_background_stride + x]) continue; + cc.clear(); + stack.clear(); + stack.emplace_back(x, y); + size_t min_x = x; + size_t max_x = x; + size_t min_y = y; + size_t max_y = y; + std::pair reference; + bool found_border = false; + bool all_similar = true; + while (!stack.empty()) { + std::pair cur = stack.back(); + stack.pop_back(); + if (visited_row[cur.second * visited_stride + cur.first]) continue; + visited_row[cur.second * visited_stride + cur.first] = 1; + if (cur.first < min_x) min_x = cur.first; + if (cur.first > max_x) max_x = cur.first; + if (cur.second < min_y) min_y = cur.second; + if (cur.second > max_y) max_y = cur.second; + if (paint_ccs) { + cc.push_back(cur); + } + for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) { + for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) { + if (dx == 0 && dy == 0) continue; + int next_first = static_cast(cur.first) + dx; + int next_second = static_cast(cur.second) + dy; + if (next_first < 0 || next_second < 0 || + static_cast(next_first) >= opsin.xsize() || + static_cast(next_second) >= opsin.ysize()) { + continue; + } + std::pair next{next_first, next_second}; + if (!is_background_row[next.second * is_background_stride + + next.first]) { + stack.push_back(next); + } else { + if (!found_border) { + reference = next; + found_border = true; + } else { + if (!is_similar_b(next, reference)) all_similar = false; + } + } + } + } + } + if (!found_border || !all_similar || max_x - min_x >= kMaxPatchSize || + max_y - min_y >= kMaxPatchSize) { + continue; + } + size_t bpos = background_stride * reference.second + reference.first; + float ref[3] = {background_rows[0][bpos], 
background_rows[1][bpos], + background_rows[2][bpos]}; + bool has_similar = false; + for (size_t iy = std::max( + static_cast(min_y) - kHasSimilarRadius, 0); + iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize()); iy++) { + for (size_t ix = std::max( + static_cast(min_x) - kHasSimilarRadius, 0); + ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize()); + ix++) { + size_t opos = opsin_stride * iy + ix; + float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos], + opsin_rows[2][opos]}; + if (pci.is_similar_v(ref, px, kHasSimilarThreshold)) { + has_similar = true; + } + } + } + if (!has_similar) continue; + info.emplace_back(); + info.back().second.emplace_back(min_x, min_y); + QuantizedPatch& patch = info.back().first; + patch.xsize = max_x - min_x + 1; + patch.ysize = max_y - min_y + 1; + int max_value = 0; + for (size_t c : {1, 0, 2}) { + for (size_t iy = min_y; iy <= max_y; iy++) { + for (size_t ix = min_x; ix <= max_x; ix++) { + size_t offset = (iy - min_y) * patch.xsize + ix - min_x; + patch.fpixels[c][offset] = + opsin_rows[c][iy * opsin_stride + ix] - ref[c]; + int val = pci.Quantize(patch.fpixels[c][offset], c); + patch.pixels[c][offset] = val; + if (std::abs(val) > max_value) max_value = std::abs(val); + } + } + } + if (max_value < kMinPeak) { + info.pop_back(); + continue; + } + if (paint_ccs) { + float cc_color = dist(rng); + for (std::pair p : cc) { + ccs.Row(p.second)[p.first] = cc_color; + } + } + } + } + + if (paint_ccs) { + JXL_ASSERT(WantDebugOutput(aux_out)); + aux_out->DumpPlaneNormalized("ccs", ccs); + } + if (info.empty()) { + return {}; + } + + // Remove duplicates. 
+ constexpr size_t kMinPatchOccurences = 2; + std::sort(info.begin(), info.end()); + size_t unique = 0; + for (size_t i = 1; i < info.size(); i++) { + if (info[i].first == info[unique].first) { + info[unique].second.insert(info[unique].second.end(), + info[i].second.begin(), info[i].second.end()); + } else { + if (info[unique].second.size() >= kMinPatchOccurences) { + unique++; + } + info[unique] = info[i]; + } + } + if (info[unique].second.size() >= kMinPatchOccurences) { + unique++; + } + info.resize(unique); + + size_t max_patch_size = 0; + + for (size_t i = 0; i < info.size(); i++) { + size_t pixels = info[i].first.xsize * info[i].first.ysize; + if (pixels > max_patch_size) max_patch_size = pixels; + } + + // don't use patches if all patches are smaller than this + constexpr size_t kMinMaxPatchSize = 20; + if (max_patch_size < kMinMaxPatchSize) return {}; + + // Ensure that the specified set of patches doesn't produce out-of-bounds + // pixels. + // TODO(veluca): figure out why this is still necessary even with RCTs that + // don't depend on bit depth. 
+ if (state->cparams.modular_mode && state->cparams.quality_pair.first >= 100) { + constexpr size_t kMaxPatchArea = kMaxPatchSize * kMaxPatchSize; + std::vector min_then_max_px(2 * kMaxPatchArea); + for (size_t i = 0; i < info.size(); i++) { + for (size_t c = 0; c < 3; c++) { + float* JXL_RESTRICT min_px = min_then_max_px.data(); + float* JXL_RESTRICT max_px = min_px + kMaxPatchArea; + std::fill(min_px, min_px + kMaxPatchArea, 1); + std::fill(max_px, max_px + kMaxPatchArea, 0); + size_t xsize = info[i].first.xsize; + for (size_t j = 0; j < info[i].second.size(); j++) { + size_t bx = info[i].second[j].first; + size_t by = info[i].second[j].second; + for (size_t iy = 0; iy < info[i].first.ysize; iy++) { + for (size_t ix = 0; ix < xsize; ix++) { + float v = opsin_rows[c][(by + iy) * opsin_stride + bx + ix]; + if (v < min_px[iy * xsize + ix]) min_px[iy * xsize + ix] = v; + if (v > max_px[iy * xsize + ix]) max_px[iy * xsize + ix] = v; + } + } + } + for (size_t iy = 0; iy < info[i].first.ysize; iy++) { + for (size_t ix = 0; ix < xsize; ix++) { + float smallest = min_px[iy * xsize + ix]; + float biggest = max_px[iy * xsize + ix]; + JXL_ASSERT(smallest <= biggest); + float& out = info[i].first.fpixels[c][iy * xsize + ix]; + // Clamp fpixels so that subtracting the patch never creates a + // negative value, or a value above 1. + JXL_ASSERT(biggest - 1 <= smallest); + out = std::max(smallest, out); + out = std::min(biggest - 1.f, out); + } + } + } + } + } + return info; +} + +} // namespace + +void FindBestPatchDictionary(const Image3F& opsin, + PassesEncoderState* JXL_RESTRICT state, + ThreadPool* pool, AuxOut* aux_out, bool is_xyb) { + state->shared.image_features.patches = PatchDictionary(); + state->shared.image_features.patches.SetPassesSharedState(&state->shared); + + std::vector info = + FindTextLikePatches(opsin, state, pool, aux_out, is_xyb); + + // TODO(veluca): this doesn't work if both dots and patches are enabled. 
+ // For now, since dots and patches are not likely to occur in the same kind of + // images, disable dots if some patches were found. + if (info.empty() && + ApplyOverride( + state->cparams.dots, + state->cparams.speed_tier <= SpeedTier::kSquirrel && + state->cparams.butteraugli_distance >= kMinButteraugliForDots)) { + info = FindDotDictionary(state->cparams, opsin, state->shared.cmap, pool); + } + + if (info.empty()) return; + + std::sort( + info.begin(), info.end(), [&](const PatchInfo& a, const PatchInfo& b) { + return a.first.xsize * a.first.ysize > b.first.xsize * b.first.ysize; + }); + + size_t max_x_size = 0; + size_t max_y_size = 0; + size_t total_pixels = 0; + + for (size_t i = 0; i < info.size(); i++) { + size_t pixels = info[i].first.xsize * info[i].first.ysize; + if (max_x_size < info[i].first.xsize) max_x_size = info[i].first.xsize; + if (max_y_size < info[i].first.ysize) max_y_size = info[i].first.ysize; + total_pixels += pixels; + } + + // Bin-packing & conversion of patches. + constexpr float kBinPackingSlackness = 1.05f; + size_t ref_xsize = std::max(max_x_size, std::sqrt(total_pixels)); + size_t ref_ysize = std::max(max_y_size, std::sqrt(total_pixels)); + std::vector> ref_positions(info.size()); + // TODO(veluca): allow partial overlaps of patches that have the same pixels. + size_t max_y = 0; + do { + max_y = 0; + // Increase packed image size. + ref_xsize = ref_xsize * kBinPackingSlackness + 1; + ref_ysize = ref_ysize * kBinPackingSlackness + 1; + + ImageB occupied(ref_xsize, ref_ysize); + ZeroFillImage(&occupied); + uint8_t* JXL_RESTRICT occupied_rows = occupied.Row(0); + size_t occupied_stride = occupied.PixelsPerRow(); + + bool success = true; + // For every patch... + for (size_t patch = 0; patch < info.size(); patch++) { + size_t x0 = 0; + size_t y0 = 0; + size_t xsize = info[patch].first.xsize; + size_t ysize = info[patch].first.ysize; + bool found = false; + // For every possible start position ... 
+ for (; y0 + ysize <= ref_ysize; y0++) { + x0 = 0; + for (; x0 + xsize <= ref_xsize; x0++) { + bool has_occupied_pixel = false; + size_t x = x0; + // Check if it is possible to place the patch in this position in the + // reference frame. + for (size_t y = y0; y < y0 + ysize; y++) { + x = x0; + for (; x < x0 + xsize; x++) { + if (occupied_rows[y * occupied_stride + x]) { + has_occupied_pixel = true; + break; + } + } + } // end of positioning check + if (!has_occupied_pixel) { + found = true; + break; + } + x0 = x; // Jump to next pixel after the occupied one. + } + if (found) break; + } // end of start position checking + + // We didn't find a possible position: repeat from the beginning with a + // larger reference frame size. + if (!found) { + success = false; + break; + } + + // We found a position: mark the corresponding positions in the reference + // image as used. + ref_positions[patch] = {x0, y0}; + for (size_t y = y0; y < y0 + ysize; y++) { + for (size_t x = x0; x < x0 + xsize; x++) { + occupied_rows[y * occupied_stride + x] = true; + } + } + max_y = std::max(max_y, y0 + ysize); + } + + if (success) break; + } while (true); + + JXL_ASSERT(ref_ysize >= max_y); + + ref_ysize = max_y; + + Image3F reference_frame(ref_xsize, ref_ysize); + // TODO(veluca): figure out a better way to fill the image. 
+ ZeroFillImage(&reference_frame); + std::vector positions; + float* JXL_RESTRICT ref_rows[3] = { + reference_frame.PlaneRow(0, 0), + reference_frame.PlaneRow(1, 0), + reference_frame.PlaneRow(2, 0), + }; + size_t ref_stride = reference_frame.PixelsPerRow(); + + for (size_t i = 0; i < info.size(); i++) { + PatchReferencePosition ref_pos; + ref_pos.xsize = info[i].first.xsize; + ref_pos.ysize = info[i].first.ysize; + ref_pos.x0 = ref_positions[i].first; + ref_pos.y0 = ref_positions[i].second; + ref_pos.ref = 0; + for (size_t y = 0; y < ref_pos.ysize; y++) { + for (size_t x = 0; x < ref_pos.xsize; x++) { + for (size_t c = 0; c < 3; c++) { + ref_rows[c][(y + ref_pos.y0) * ref_stride + x + ref_pos.x0] = + info[i].first.fpixels[c][y * ref_pos.xsize + x]; + } + } + } + // Add color channels, ignore other channels. + std::vector blending_info( + state->shared.metadata->m.extra_channel_info.size() + 1, + PatchBlending{PatchBlendMode::kNone, 0, false}); + blending_info[0].mode = PatchBlendMode::kAdd; + for (const auto& pos : info[i].second) { + positions.emplace_back( + PatchPosition{pos.first, pos.second, blending_info, ref_pos}); + } + } + + CompressParams cparams = state->cparams; + cparams.resampling = 1; + cparams.ec_resampling = 1; + // Recursive application of patches could create very weird issues. + cparams.patches = Override::kOff; + cparams.dots = Override::kOff; + cparams.noise = Override::kOff; + cparams.modular_mode = true; + cparams.responsive = 0; + cparams.progressive_dc = 0; + cparams.progressive_mode = false; + cparams.qprogressive_mode = false; + // Use gradient predictor and not Predictor::Best. + cparams.options.predictor = Predictor::Gradient; + // TODO(veluca): possibly change heuristics here. 
+ if (!cparams.modular_mode) { + cparams.quality_pair.first = cparams.quality_pair.second = + 80 - cparams.butteraugli_distance * 12; + } else { + cparams.quality_pair.first = (100 + 3 * cparams.quality_pair.first) * 0.25f; + cparams.quality_pair.second = + (100 + 3 * cparams.quality_pair.second) * 0.25f; + } + FrameInfo patch_frame_info; + patch_frame_info.save_as_reference = 0; // always saved. + patch_frame_info.frame_type = FrameType::kReferenceOnly; + patch_frame_info.save_before_color_transform = true; + + ImageBundle ib(&state->shared.metadata->m); + // TODO(veluca): metadata.color_encoding is a lie: ib is in XYB, but there is + // no simple way to express that yet. + patch_frame_info.ib_needs_color_transform = false; + patch_frame_info.save_as_reference = 0; + ib.SetFromImage(std::move(reference_frame), + state->shared.metadata->m.color_encoding); + if (!ib.metadata()->extra_channel_info.empty()) { + // Add dummy extra channels to the patch image: patches do not yet support + // extra channels, but the codec expects that the amount of extra channels + // in frames matches that in the metadata of the codestream. + std::vector extra_channels; + extra_channels.reserve(ib.metadata()->extra_channel_info.size()); + for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) { + extra_channels.emplace_back(ib.xsize(), ib.ysize()); + // Must initialize the image with data to not affect blending with + // uninitialized memory. + // TODO(lode): patches must copy and use the real extra channels instead. 
+ FillImage(1.0f, &extra_channels.back()); + } + ib.SetExtraChannels(std::move(extra_channels)); + } + + PassesEncoderState roundtrip_state; + auto special_frame = std::unique_ptr(new BitWriter()); + JXL_CHECK(EncodeFrame(cparams, patch_frame_info, state->shared.metadata, ib, + &roundtrip_state, pool, special_frame.get(), nullptr)); + const Span encoded = special_frame->GetSpan(); + state->special_frames.emplace_back(std::move(special_frame)); + if (cparams.butteraugli_distance < kMinButteraugliToSubtractOriginalPatches) { + BitReader br(encoded); + ImageBundle decoded(&state->shared.metadata->m); + PassesDecoderState dec_state; + JXL_CHECK(dec_state.output_encoding_info.Set( + *state->shared.metadata, + ColorEncoding::LinearSRGB( + state->shared.metadata->m.color_encoding.IsGray()))); + JXL_CHECK(DecodeFrame({}, &dec_state, pool, &br, &decoded, + *state->shared.metadata, /*constraints=*/nullptr)); + JXL_CHECK(br.Close()); + state->shared.reference_frames[0] = + std::move(dec_state.shared_storage.reference_frames[0]); + } else { + state->shared.reference_frames[0].storage = std::move(ib); + } + state->shared.reference_frames[0].frame = + &state->shared.reference_frames[0].storage; + // TODO(veluca): this assumes that applying patches is commutative, which is + // not true for all blending modes. This code only produces kAdd patches, so + // this works out. + std::sort(positions.begin(), positions.end()); + PatchDictionaryEncoder::SetPositions(&state->shared.image_features.patches, + std::move(positions)); +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.h new file mode 100644 index 0000000000..f26016f8de --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_patch_dictionary.h @@ -0,0 +1,56 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_PATCH_DICTIONARY_H_ +#define LIB_JXL_ENC_PATCH_DICTIONARY_H_ + +// Chooses reference patches, and avoids encoding them once per occurrence. + +#include +#include +#include + +#include +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { + +// Friend class of PatchDictionary. +class PatchDictionaryEncoder { + public: + // Only call if HasAny(). + static void Encode(const PatchDictionary& pdic, BitWriter* writer, + size_t layer, AuxOut* aux_out); + + // Moves `positions` into `pdic` and then calls ComputePatchCache() on it. + static void SetPositions(PatchDictionary* pdic, + std::vector positions) { + pdic->positions_ = std::move(positions); + pdic->ComputePatchCache(); + } + + // Removes the patches of `pdic` from `opsin` in place: kAdd patches are + // subtracted from the pixels, kReplace patches zero them, kNone patches + // leave them untouched; any other blend mode aborts. + static void SubtractFrom(const PatchDictionary& pdic, Image3F* opsin); +}; + +// Resets state->shared.image_features.patches, then searches `opsin` for +// text-like patches (or for dots, when no patches are found and dots are +// enabled). When something is found it is packed into a reference-only frame +// that is appended to state->special_frames, and the positions are stored in +// the dictionary; otherwise the dictionary stays empty. `is_xyb` selects the +// colorspace constants used by the patch search. 
+void FindBestPatchDictionary(const Image3F& opsin, + PassesEncoderState* JXL_RESTRICT state, + ThreadPool* pool, AuxOut* aux_out, + bool is_xyb = true); + +} // namespace jxl + +#endif // LIB_JXL_ENC_PATCH_DICTIONARY_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc new file mode 100644 index 0000000000..3786ef5cf5 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.cc @@ -0,0 +1,89 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_photon_noise.h" + +namespace jxl { + +namespace { + +// Assumes a daylight-like spectrum. +// https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s +constexpr float kPhotonsPerLxSPerUm2 = 11260; + +// Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into +// account. +constexpr float kEffectiveQuantumEfficiency = 0.20; + +// TODO(sboukortt): reevaluate whether these are good defaults, notably whether +// it would be worth making read noise higher at lower ISO settings. +constexpr float kPhotoResponseNonUniformity = 0.005; +constexpr float kInputReferredReadNoise = 3; + +// Assumes a 35mm sensor. +constexpr float kSensorAreaUm2 = 36000.f * 24000; + +template +inline constexpr T Square(const T x) { + return x * x; +} +template +inline constexpr T Cube(const T x) { + return x * x * x; +} + +} // namespace + +// Fills NoiseParams::lut: for each of the kNumNoisePoints entries, models the +// sensor noise in electrons (read noise, shot noise and photo response +// non-uniformity), scales it by the derivative of the cube-root opsin curve, +// normalizes it and clamps the result to [0, 1]. +NoiseParams SimulatePhotonNoise(const size_t xsize, const size_t ysize, + const float iso) { + const float kOpsinAbsorbanceBiasCbrt = std::cbrt(kOpsinAbsorbanceBias[1]); + + // Focal plane exposure for 18% of kDefaultIntensityTarget, in lx·s. 
+ // (ISO = 10 lx·s ÷ H) + const float h_18 = 10 / iso; + + const float pixel_area_um2 = kSensorAreaUm2 / (xsize * ysize); + + const float electrons_per_pixel_18 = kEffectiveQuantumEfficiency * + kPhotonsPerLxSPerUm2 * h_18 * + pixel_area_um2; + + NoiseParams params; + + for (size_t i = 0; i < NoiseParams::kNumNoisePoints; ++i) { + const float scaled_index = i / (NoiseParams::kNumNoisePoints - 2.f); + // scaled_index is used for XYB = (0, 2·scaled_index, 2·scaled_index) + const float y = 2 * scaled_index; + // 1 = default intensity target + const float linear = std::max( + 0.f, Cube(y - kOpsinAbsorbanceBiasCbrt) + kOpsinAbsorbanceBias[1]); + const float electrons_per_pixel = electrons_per_pixel_18 * (linear / 0.18f); + // Quadrature sum of read noise, photon shot noise (sqrt(S) so simply not + // squared here) and photo response non-uniformity. + // https://doi.org/10.1117/3.725073 + // Units are electrons rms. + const float noise = + std::sqrt(Square(kInputReferredReadNoise) + electrons_per_pixel + + Square(kPhotoResponseNonUniformity * electrons_per_pixel)); + const float linear_noise = noise * (0.18f / electrons_per_pixel_18); + const float opsin_derivative = + (1.f / 3) / Square(std::cbrt(linear - kOpsinAbsorbanceBias[1])); + const float opsin_noise = linear_noise * opsin_derivative; + + // TODO(sboukortt): verify more thoroughly whether the denominator is + // correct. 
+ params.lut[i] = + Clamp1(opsin_noise / + (0.22f // norm_const + * std::sqrt(2.f) // red_noise + green_noise + * 1.13f // standard deviation of a plane of generated noise + ), + 0.f, 1.f); + } + + return params; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.h new file mode 100644 index 0000000000..f43e14d560 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_PHOTON_NOISE_H_ +#define LIB_JXL_ENC_PHOTON_NOISE_H_ + +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/image.h" +#include "lib/jxl/noise.h" + +namespace jxl { + +// Constructs a NoiseParams representing the noise that would be seen at the +// selected nominal exposure on a last-decade (as of 2021) color camera with a +// 36×24mm sensor (“35mm format”). +// `xsize` and `ysize` are the image dimensions in pixels (the sensor area is +// divided by their product to get the per-pixel area); `iso` is the nominal +// ISO setting. +NoiseParams SimulatePhotonNoise(size_t xsize, size_t ysize, float iso); + +} // namespace jxl + +#endif // LIB_JXL_ENC_PHOTON_NOISE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise_test.cc new file mode 100644 index 0000000000..3790fdee99 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_photon_noise_test.cc @@ -0,0 +1,50 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+ +#include "lib/jxl/enc_photon_noise.h" + +#include "gmock/gmock.h" + +namespace jxl { +namespace { + +using ::testing::FloatNear; +using ::testing::Pointwise; + +// Matcher for Pointwise(): `arg` is an (actual, expected) tuple, compared with +// FloatNear using a tolerance of 1e-6. +MATCHER(AreApproximatelyEqual, "") { + constexpr float kTolerance = 1e-6; + const float actual = std::get<0>(arg); + const float expected = std::get<1>(arg); + return testing::ExplainMatchResult(FloatNear(expected, kTolerance), actual, + result_listener); +} + +TEST(EncPhotonNoiseTest, LUTs) { + EXPECT_THAT( + SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/100).lut, + Pointwise(AreApproximatelyEqual(), + {0.00259652, 0.0139648, 0.00681551, 0.00632582, 0.00694917, + 0.00803922, 0.00934574, 0.0107607})); + EXPECT_THAT( + SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/800).lut, + Pointwise(AreApproximatelyEqual(), + {0.02077220, 0.0420923, 0.01820690, 0.01439020, 0.01293670, + 0.01254030, 0.01277390, 0.0134161})); + EXPECT_THAT( + SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/6400).lut, + Pointwise(AreApproximatelyEqual(), + {0.1661770, 0.1691120, 0.05309080, 0.03963960, 0.03357410, + 0.03001650, 0.02776740, 0.0263478})); + + // Lower when measured on a per-pixel basis as there are fewer of them. 
+ EXPECT_THAT( + SimulatePhotonNoise(/*xsize=*/4000, /*ysize=*/3000, /*iso=*/6400).lut, + Pointwise(AreApproximatelyEqual(), + {0.0830886, 0.1008720, 0.0367748, 0.0280305, 0.0240236, + 0.0218040, 0.0205771, 0.0200058})); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc new file mode 100644 index 0000000000..33d0e47bae --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.cc @@ -0,0 +1,203 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_quant_weights.h" + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +namespace { + +// Writes the number of distance bands (minus one), then every band of each of +// the three channels as F16; the first band of a channel is scaled by 1/64. +Status EncodeDctParams(const DctQuantWeightParams& params, BitWriter* writer) { + JXL_ASSERT(params.num_distance_bands >= 1); + writer->Write(DctQuantWeightParams::kLog2MaxDistanceBands, + params.num_distance_bands - 1); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < params.num_distance_bands; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Write( + params.distance_bands[c][i] * (i == 0 ? (1 / 64.0f) : 1.0f), writer)); + } + } + return true; +} + +// Writes the mode of `encoding` followed by its mode-specific parameters. +// `size_x` and `size_y` are multiplied by kBlockDim on entry and are only used +// by the RAW mode. 
+Status EncodeQuant(const QuantEncoding& encoding, size_t idx, size_t size_x, + size_t size_y, BitWriter* writer, + ModularFrameEncoder* modular_frame_encoder) { + writer->Write(kLog2NumQuantModes, encoding.mode); + size_x *= kBlockDim; + size_y *= kBlockDim; + switch (encoding.mode) { + case QuantEncoding::kQuantModeLibrary: { + writer->Write(kCeilLog2NumPredefinedTables, encoding.predefined); + break; + } + case QuantEncoding::kQuantModeID: { + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 3; i++) { + JXL_RETURN_IF_ERROR( + F16Coder::Write(encoding.idweights[c][i] * (1.0f / 64), writer)); + } + } + break; + } + case QuantEncoding::kQuantModeDCT2: { + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 6; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Write( + encoding.dct2weights[c][i] * (1.0f / 64), writer)); + } + } + break; + } + case QuantEncoding::kQuantModeDCT4X8: { + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR( + 
 F16Coder::Write(encoding.dct4x8multipliers[c], writer)); + } + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer)); + break; + } + case QuantEncoding::kQuantModeDCT4: { + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 2; i++) { + JXL_RETURN_IF_ERROR( + F16Coder::Write(encoding.dct4multipliers[c][i], writer)); + } + } + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer)); + break; + } + case QuantEncoding::kQuantModeDCT: { + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer)); + break; + } + case QuantEncoding::kQuantModeRAW: { + // NOTE(review): unlike the other cases, the result of this call is not + // propagated - confirm that EncodeQuantTable cannot fail. + ModularFrameEncoder::EncodeQuantTable(size_x, size_y, writer, encoding, + idx, modular_frame_encoder); + break; + } + case QuantEncoding::kQuantModeAFV: { + // NOTE(review): both EncodeDctParams calls below sit inside the + // per-channel loop, so each parameter set is written three times - + // confirm that the decoder also reads them once per channel. + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 9; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Write( + encoding.afv_weights[c][i] * (i < 6 ? 1.0f / 64 : 1.0f), writer)); + } + JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer)); + JXL_RETURN_IF_ERROR( + EncodeDctParams(encoding.dct_params_afv_4x4, writer)); + } + break; + } + } + return true; +} + +} // namespace + +// Writes one "all default" bit; when any encoding is not predefined library +// table 0, every encoding is then written with EncodeQuant. The bits are +// charged to `layer` in `aux_out`. 
+Status DequantMatricesEncode(const DequantMatrices* matrices, BitWriter* writer, + size_t layer, AuxOut* aux_out, + ModularFrameEncoder* modular_frame_encoder) { + bool all_default = true; + const std::vector& encodings = matrices->encodings(); + + for (size_t i = 0; i < encodings.size(); i++) { + if (encodings[i].mode != QuantEncoding::kQuantModeLibrary || + encodings[i].predefined != 0) { + all_default = false; + } + } + // TODO(janwas): better bound + BitWriter::Allotment allotment(writer, 512 * 1024); + writer->Write(1, all_default); + if (!all_default) { + for (size_t i = 0; i < encodings.size(); i++) { + JXL_RETURN_IF_ERROR(EncodeQuant( + encodings[i], i, DequantMatrices::required_size_x[i], + DequantMatrices::required_size_y[i], writer, modular_frame_encoder)); + } + } + ReclaimAndCharge(writer, &allotment, layer, aux_out); + return true; +} + 
+// Writes one "all default" bit; when any DC quant differs from kDCQuant, the +// three values are then written as F16 scaled by 128. +Status DequantMatricesEncodeDC(const DequantMatrices* matrices, + BitWriter* writer, size_t layer, + AuxOut* aux_out) { + bool all_default = true; + const float* dc_quant = matrices->DCQuants(); + for (size_t c = 0; c < 3; c++) { + if (dc_quant[c] != kDCQuant[c]) { + all_default = false; + } + } + BitWriter::Allotment allotment(writer, 1 + sizeof(float) * kBitsPerByte * 3); + writer->Write(1, all_default); + if (!all_default) { + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR(F16Coder::Write(dc_quant[c] * 128.0f, writer)); + } + } + ReclaimAndCharge(writer, &allotment, layer, aux_out); + return true; +} + +void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc) { + matrices->SetDCQuant(dc); + // Roundtrip encode/decode DC to ensure same values as decoder. + BitWriter writer; + JXL_CHECK(DequantMatricesEncodeDC(matrices, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + BitReader br(writer.GetSpan()); + // Called only in the encoder: should fail only for programmer errors. + JXL_CHECK(matrices->DecodeDC(&br)); + JXL_CHECK(br.Close()); +} + +void DequantMatricesSetCustom(DequantMatrices* matrices, + const std::vector& encodings, + ModularFrameEncoder* encoder) { + JXL_ASSERT(encodings.size() == DequantMatrices::kNum); + matrices->SetEncodings(encodings); + for (size_t i = 0; i < encodings.size(); i++) { + if (encodings[i].mode == QuantEncodingInternal::kQuantModeRAW) { + encoder->AddQuantTable(DequantMatrices::required_size_x[i] * kBlockDim, + DequantMatrices::required_size_y[i] * kBlockDim, + encodings[i], i); + } + } + // Roundtrip encode/decode the matrices to ensure same values as decoder. + // Do not pass modular en/decoder, as they only change entropy and not + // values. + BitWriter writer; + JXL_CHECK(DequantMatricesEncode(matrices, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + BitReader br(writer.GetSpan()); + // Called only in the encoder: should fail only for programmer errors. 
+ JXL_CHECK(matrices->Decode(&br)); + JXL_CHECK(br.Close()); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.h new file mode 100644 index 0000000000..89033d8cbb --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_quant_weights.h @@ -0,0 +1,29 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_QUANT_WEIGHTS_H_ +#define LIB_JXL_ENC_QUANT_WEIGHTS_H_ + +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +Status DequantMatricesEncode( + const DequantMatrices* matrices, BitWriter* writer, size_t layer, + AuxOut* aux_out, ModularFrameEncoder* modular_frame_encoder = nullptr); +Status DequantMatricesEncodeDC(const DequantMatrices* matrices, + BitWriter* writer, size_t layer, + AuxOut* aux_out); +// For consistency with QuantEncoding, higher values correspond to more +// precision. +void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc); + +void DequantMatricesSetCustom(DequantMatrices* matrices, + const std::vector& encodings, + ModularFrameEncoder* encoder); + +} // namespace jxl + +#endif // LIB_JXL_ENC_QUANT_WEIGHTS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc new file mode 100644 index 0000000000..cdb797dc6a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.cc @@ -0,0 +1,96 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/splines.h" + +namespace jxl { + +class QuantizedSplineEncoder { + public: + // Only call if HasAny(). + static void Tokenize(const QuantizedSpline& spline, + std::vector* const tokens) { + tokens->emplace_back(kNumControlPointsContext, + spline.control_points_.size()); + for (const auto& point : spline.control_points_) { + tokens->emplace_back(kControlPointsContext, PackSigned(point.first)); + tokens->emplace_back(kControlPointsContext, PackSigned(point.second)); + } + const auto encode_dct = [tokens](const int dct[32]) { + for (int i = 0; i < 32; ++i) { + tokens->emplace_back(kDCTContext, PackSigned(dct[i])); + } + }; + for (int c = 0; c < 3; ++c) { + encode_dct(spline.color_dct_[c]); + } + encode_dct(spline.sigma_dct_); + } +}; + +namespace { + +void EncodeAllStartingPoints(const std::vector& points, + std::vector* tokens) { + int64_t last_x = 0; + int64_t last_y = 0; + for (size_t i = 0; i < points.size(); i++) { + const int64_t x = lroundf(points[i].x); + const int64_t y = lroundf(points[i].y); + if (i == 0) { + tokens->emplace_back(kStartingPositionContext, x); + tokens->emplace_back(kStartingPositionContext, y); + } else { + tokens->emplace_back(kStartingPositionContext, PackSigned(x - last_x)); + tokens->emplace_back(kStartingPositionContext, PackSigned(y - last_y)); + } + last_x = x; + last_y = y; + } +} + +} // namespace + +void EncodeSplines(const Splines& splines, BitWriter* writer, + const size_t layer, const HistogramParams& histogram_params, + AuxOut* aux_out) { + JXL_ASSERT(splines.HasAny()); + + const std::vector& quantized_splines = + splines.QuantizedSplines(); + std::vector> tokens(1); + tokens[0].emplace_back(kNumSplinesContext, 
quantized_splines.size() - 1); + EncodeAllStartingPoints(splines.StartingPoints(), &tokens[0]); + + tokens[0].emplace_back(kQuantizationAdjustmentContext, + PackSigned(splines.GetQuantizationAdjustment())); + + for (const QuantizedSpline& spline : quantized_splines) { + QuantizedSplineEncoder::Tokenize(spline, &tokens[0]); + } + + EntropyEncodingData codes; + std::vector context_map; + BuildAndEncodeHistograms(histogram_params, kNumSplineContexts, tokens, &codes, + &context_map, writer, layer, aux_out); + WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out); +} + +Splines FindSplines(const Image3F& opsin) { + // TODO: implement spline detection. + return {}; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.h new file mode 100644 index 0000000000..732d77ac2c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_splines.h @@ -0,0 +1,39 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_SPLINES_H_ +#define LIB_JXL_ENC_SPLINES_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image.h" +#include "lib/jxl/splines.h" + +namespace jxl { + +// Only call if splines.HasAny(). 
+void EncodeSplines(const Splines& splines, BitWriter* writer, + const size_t layer, const HistogramParams& histogram_params, + AuxOut* aux_out); + +Splines FindSplines(const Image3F& opsin); + +} // namespace jxl + +#endif // LIB_JXL_ENC_SPLINES_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc new file mode 100644 index 0000000000..c877b0c837 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.cc @@ -0,0 +1,46 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_toc.h" + +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/enc_coeff_order.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/toc.h" + +namespace jxl { +Status WriteGroupOffsets(const std::vector& group_codes, + const std::vector* permutation, + BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) { + BitWriter::Allotment allotment(writer, MaxBits(group_codes.size())); + if (permutation && !group_codes.empty()) { + // Don't write a permutation at all for an empty group_codes. 
+ writer->Write(1, 1); // permutation + JXL_DASSERT(permutation->size() == group_codes.size()); + EncodePermutation(permutation->data(), /*skip=*/0, permutation->size(), + writer, /* layer= */ 0, aux_out); + + } else { + writer->Write(1, 0); // no permutation + } + writer->ZeroPadToByte(); // before TOC entries + + for (size_t i = 0; i < group_codes.size(); i++) { + JXL_ASSERT(group_codes[i].BitsWritten() % kBitsPerByte == 0); + const size_t group_size = group_codes[i].BitsWritten() / kBitsPerByte; + JXL_RETURN_IF_ERROR(U32Coder::Write(kTocDist, group_size, writer)); + } + writer->ZeroPadToByte(); // before first group + ReclaimAndCharge(writer, &allotment, kLayerTOC, aux_out); + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.h new file mode 100644 index 0000000000..dc81a5d12e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_toc.h @@ -0,0 +1,29 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_TOC_H_ +#define LIB_JXL_ENC_TOC_H_ + +#include +#include + +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +// Writes the group offsets. If the permutation vector is nullptr, the identity +// permutation will be used. 
+Status WriteGroupOffsets(const std::vector& group_codes, + const std::vector* permutation, + BitWriter* JXL_RESTRICT writer, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_ENC_TOC_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms-inl.h new file mode 100644 index 0000000000..c2f8e61105 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms-inl.h @@ -0,0 +1,844 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_ENC_TRANSFORMS_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_ENC_TRANSFORMS_INL_H_ +#undef LIB_JXL_ENC_TRANSFORMS_INL_H_ +#else +#define LIB_JXL_ENC_TRANSFORMS_INL_H_ +#endif + +#include + +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/dct-inl.h" +#include "lib/jxl/dct_scales.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +template +struct DoIDCT { + template + void operator()(float* JXL_RESTRICT from, const To& to, + float* JXL_RESTRICT scratch_space) { + ComputeScaledIDCT()(from, to, scratch_space); + } +}; + +template +struct DoIDCT { + template + void operator()(float* JXL_RESTRICT from, const To& to, + float* JXL_RESTRICT scratch_space) const { + ComputeTransposedScaledIDCT()(from, to, scratch_space); + } +}; + +// Inverse of ReinterpretingDCT. 
+template +HWY_INLINE void ReinterpretingIDCT(const float* input, + const size_t input_stride, float* output, + const size_t output_stride) { + HWY_ALIGN float block[ROWS * COLS] = {}; + if (ROWS < COLS) { + for (size_t y = 0; y < LF_ROWS; y++) { + for (size_t x = 0; x < LF_COLS; x++) { + block[y * COLS + x] = input[y * input_stride + x] * + DCTTotalResampleScale(y) * + DCTTotalResampleScale(x); + } + } + } else { + for (size_t y = 0; y < LF_COLS; y++) { + for (size_t x = 0; x < LF_ROWS; x++) { + block[y * ROWS + x] = input[y * input_stride + x] * + DCTTotalResampleScale(y) * + DCTTotalResampleScale(x); + } + } + } + + // ROWS, COLS <= 8, so we can put scratch space on the stack. + HWY_ALIGN float scratch_space[ROWS * COLS]; + DoIDCT()(block, DCTTo(output, output_stride), scratch_space); +} + +template +void DCT2TopBlock(const float* block, size_t stride, float* out) { + static_assert(kBlockDim % S == 0, "S should be a divisor of kBlockDim"); + static_assert(S % 2 == 0, "S should be even"); + float temp[kDCTBlockSize]; + constexpr size_t num_2x2 = S / 2; + for (size_t y = 0; y < num_2x2; y++) { + for (size_t x = 0; x < num_2x2; x++) { + float c00 = block[y * 2 * stride + x * 2]; + float c01 = block[y * 2 * stride + x * 2 + 1]; + float c10 = block[(y * 2 + 1) * stride + x * 2]; + float c11 = block[(y * 2 + 1) * stride + x * 2 + 1]; + float r00 = c00 + c01 + c10 + c11; + float r01 = c00 + c01 - c10 - c11; + float r10 = c00 - c01 + c10 - c11; + float r11 = c00 - c01 - c10 + c11; + r00 *= 0.25f; + r01 *= 0.25f; + r10 *= 0.25f; + r11 *= 0.25f; + temp[y * kBlockDim + x] = r00; + temp[y * kBlockDim + num_2x2 + x] = r01; + temp[(y + num_2x2) * kBlockDim + x] = r10; + temp[(y + num_2x2) * kBlockDim + num_2x2 + x] = r11; + } + } + for (size_t y = 0; y < S; y++) { + for (size_t x = 0; x < S; x++) { + out[y * kBlockDim + x] = temp[y * kBlockDim + x]; + } + } +} + +void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs) { + HWY_ALIGN static constexpr float 
k4x4AFVBasisTranspose[16][16] = { + { + 0.2500000000000000, + 0.8769029297991420f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + -0.4105377591765233f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + }, + { + 0.2500000000000000, + 0.2206518106944235f, + 0.0000000000000000, + 0.0000000000000000, + -0.7071067811865474f, + 0.6235485373547691f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + 0.4067007583026075f, + -0.2125574805828875f, + 0.0000000000000000, + -0.0643507165794627f, + -0.4517556589999482f, + -0.3046847507248690f, + 0.3017929516615495f, + 0.4082482904638627f, + 0.1747866975480809f, + -0.2110560104933578f, + -0.1426608480880726f, + -0.1381354035075859f, + -0.1743760259965107f, + 0.1135498731499434f, + }, + { + 0.2500000000000000, + -0.1014005039375375f, + 0.4444481661973445f, + 0.3085497062849767f, + 0.0000000000000000f, + -0.0643507165794627f, + 0.1585450355184006f, + 0.5112616136591823f, + 0.2579236279634118f, + 0.0000000000000000, + 0.0812611176717539f, + 0.1856718091610980f, + -0.3416446842253372f, + 0.3302282550303788f, + 0.0702790691196284f, + -0.0741750459581035f, + }, + { + 0.2500000000000000, + 0.2206518106944236f, + 0.0000000000000000, + 0.0000000000000000, + 0.7071067811865476f, + 0.6235485373547694f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + }, + { + 0.2500000000000000, + -0.1014005039375378f, + 0.0000000000000000, + 0.4706702258572536f, + 
0.0000000000000000, + -0.0643507165794628f, + -0.0403851516082220f, + 0.0000000000000000, + 0.1627234014286620f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.7367497537172237f, + 0.0875511500058708f, + -0.2921026642334881f, + 0.1940289303259434f, + }, + { + 0.2500000000000000, + -0.1014005039375377f, + 0.1957439937204294f, + -0.1621205195722993f, + 0.0000000000000000, + -0.0643507165794628f, + 0.0074182263792424f, + -0.2904801297289980f, + 0.0952002265347504f, + 0.0000000000000000, + -0.3675398009862027f, + 0.4921585901373873f, + 0.2462710772207515f, + -0.0794670660590957f, + 0.3623817333531167f, + -0.4351904965232280f, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + 0.2929100136981264f, + 0.0000000000000000, + 0.0000000000000000, + -0.0643507165794627f, + 0.3935103426921017f, + -0.0657870154914280f, + 0.0000000000000000, + -0.4082482904638628f, + -0.3078822139579090f, + -0.3852501370925192f, + -0.0857401903551931f, + -0.4613374887461511f, + 0.0000000000000000, + 0.2191868483885747f, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + -0.4067007583026072f, + -0.2125574805828705f, + 0.0000000000000000, + -0.0643507165794627f, + -0.4517556589999464f, + 0.3046847507248840f, + 0.3017929516615503f, + -0.4082482904638635f, + -0.1747866975480813f, + 0.2110560104933581f, + -0.1426608480880734f, + -0.1381354035075829f, + -0.1743760259965108f, + 0.1135498731499426f, + }, + { + 0.2500000000000000, + -0.1014005039375377f, + -0.1957439937204287f, + -0.1621205195722833f, + 0.0000000000000000, + -0.0643507165794628f, + 0.0074182263792444f, + 0.2904801297290076f, + 0.0952002265347505f, + 0.0000000000000000, + 0.3675398009862011f, + -0.4921585901373891f, + 0.2462710772207514f, + -0.0794670660591026f, + 0.3623817333531165f, + -0.4351904965232251f, + }, + { + 0.2500000000000000, + -0.1014005039375375f, + 0.0000000000000000, + -0.4706702258572528f, + 0.0000000000000000, + -0.0643507165794627f, + 0.1107416575309343f, + 0.0000000000000000, + 
-0.1627234014286617f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 0.1488339922711357f, + 0.4972464710953509f, + 0.2921026642334879f, + 0.5550443808910661f, + }, + { + 0.2500000000000000, + -0.1014005039375377f, + 0.1137907446044809f, + -0.1464291867126764f, + 0.0000000000000000, + -0.0643507165794628f, + 0.0829816309488205f, + -0.2388977352334460f, + -0.3531238544981630f, + -0.4082482904638630f, + 0.4826689115059883f, + 0.1741941265991622f, + -0.0476868035022925f, + 0.1253805944856366f, + -0.4326608024727445f, + -0.2546827712406646f, + }, + { + 0.2500000000000000, + -0.1014005039375377f, + -0.4444481661973438f, + 0.3085497062849487f, + 0.0000000000000000, + -0.0643507165794628f, + 0.1585450355183970f, + -0.5112616136592012f, + 0.2579236279634129f, + 0.0000000000000000, + -0.0812611176717504f, + -0.1856718091610990f, + -0.3416446842253373f, + 0.3302282550303805f, + 0.0702790691196282f, + -0.0741750459581023f, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + -0.2929100136981264f, + 0.0000000000000000, + 0.0000000000000000, + -0.0643507165794627f, + 0.3935103426921022f, + 0.0657870154914254f, + 0.0000000000000000, + 0.4082482904638634f, + 0.3078822139579031f, + 0.3852501370925211f, + -0.0857401903551927f, + -0.4613374887461554f, + 0.0000000000000000, + 0.2191868483885728f, + }, + { + 0.2500000000000000, + -0.1014005039375376f, + -0.1137907446044814f, + -0.1464291867126654f, + 0.0000000000000000, + -0.0643507165794627f, + 0.0829816309488214f, + 0.2388977352334547f, + -0.3531238544981624f, + 0.4082482904638630f, + -0.4826689115059858f, + -0.1741941265991621f, + -0.0476868035022928f, + 0.1253805944856431f, + -0.4326608024727457f, + -0.2546827712406641f, + }, + { + 0.2500000000000000, + -0.1014005039375374f, + 0.0000000000000000, + 0.4251149611657548f, + 0.0000000000000000, + -0.0643507165794626f, + -0.4517556589999480f, + 0.0000000000000000, + -0.6035859033230976f, + 0.0000000000000000, + 0.0000000000000000, + 0.0000000000000000, + 
-0.1426608480880724f, + -0.1381354035075845f, + 0.3487520519930227f, + 0.1135498731499429f, + }, + }; + + const HWY_CAPPED(float, 16) d; + for (size_t i = 0; i < 16; i += Lanes(d)) { + auto scalar = Zero(d); + for (size_t j = 0; j < 16; j++) { + auto px = Set(d, pixels[j]); + auto basis = Load(d, k4x4AFVBasisTranspose[j] + i); + scalar = MulAdd(px, basis, scalar); + } + Store(scalar, d, coeffs + i); + } +} + +// Coefficient layout: +// - (even, even) positions hold AFV coefficients +// - (odd, even) positions hold DCT4x4 coefficients +// - (any, odd) positions hold DCT4x8 coefficients +template +void AFVTransformFromPixels(const float* JXL_RESTRICT pixels, + size_t pixels_stride, + float* JXL_RESTRICT coefficients) { + HWY_ALIGN float scratch_space[4 * 8 * 2]; + size_t afv_x = afv_kind & 1; + size_t afv_y = afv_kind / 2; + HWY_ALIGN float block[4 * 8]; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + block[(afv_y == 1 ? 3 - iy : iy) * 4 + (afv_x == 1 ? 3 - ix : ix)] = + pixels[(iy + 4 * afv_y) * pixels_stride + ix + 4 * afv_x]; + } + } + // AFV coefficients in (even, even) positions. + HWY_ALIGN float coeff[4 * 4]; + AFVDCT4x4(block, coeff); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + coefficients[iy * 2 * 8 + ix * 2] = coeff[iy * 4 + ix]; + } + } + // 4x4 DCT of the block with same y and different x. + ComputeTransposedScaledDCT<4>()( + DCTFrom(pixels + afv_y * 4 * pixels_stride + (afv_x == 1 ? 0 : 4), + pixels_stride), + block, scratch_space); + // ... in (odd, even) positions. + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + coefficients[iy * 2 * 8 + ix * 2 + 1] = block[iy * 4 + ix]; + } + } + // 4x8 DCT of the other half of the block. + ComputeScaledDCT<4, 8>()( + DCTFrom(pixels + (afv_y == 1 ? 
0 : 4) * pixels_stride, pixels_stride), + block, scratch_space); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + coefficients[(1 + iy * 2) * 8 + ix] = block[iy * 8 + ix]; + } + } + float block00 = coefficients[0] * 0.25f; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + coefficients[0] = (block00 + block01 + 2 * block10) * 0.25f; + coefficients[1] = (block00 - block01) * 0.5f; + coefficients[8] = (block00 + block01 - 2 * block10) * 0.25f; +} + +HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy, + const float* JXL_RESTRICT pixels, + size_t pixels_stride, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT scratch_space) { + using Type = AcStrategy::Type; + switch (strategy) { + case Type::IDENTITY: { + PROFILER_ZONE("DCT Identity"); + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + float block_dc = 0; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + block_dc += pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix]; + } + } + block_dc *= 1.0f / 16; + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + if (ix == 1 && iy == 1) continue; + coefficients[(y + iy * 2) * 8 + x + ix * 2] = + pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix] - + pixels[(y * 4 + 1) * pixels_stride + x * 4 + 1]; + } + } + coefficients[(y + 2) * 8 + x + 2] = coefficients[y * 8 + x]; + coefficients[y * 8 + x] = block_dc; + } + } + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + float block11 = coefficients[9]; + coefficients[0] = (block00 + block01 + block10 + block11) * 0.25f; + coefficients[1] = (block00 + block01 - block10 - block11) * 0.25f; + coefficients[8] = (block00 - block01 + block10 - block11) * 0.25f; + coefficients[9] = (block00 - block01 - block10 + block11) * 0.25f; + break; + } + case Type::DCT8X4: { + PROFILER_ZONE("DCT 8x4"); + for (size_t x = 0; x < 2; x++) { + HWY_ALIGN 
float block[4 * 8]; + ComputeScaledDCT<8, 4>()(DCTFrom(pixels + x * 4, pixels_stride), block, + scratch_space); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + // Store transposed. + coefficients[(x + iy * 2) * 8 + ix] = block[iy * 8 + ix]; + } + } + } + float block0 = coefficients[0]; + float block1 = coefficients[8]; + coefficients[0] = (block0 + block1) * 0.5f; + coefficients[8] = (block0 - block1) * 0.5f; + break; + } + case Type::DCT4X8: { + PROFILER_ZONE("DCT 4x8"); + for (size_t y = 0; y < 2; y++) { + HWY_ALIGN float block[4 * 8]; + ComputeScaledDCT<4, 8>()( + DCTFrom(pixels + y * 4 * pixels_stride, pixels_stride), block, + scratch_space); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 8; ix++) { + coefficients[(y + iy * 2) * 8 + ix] = block[iy * 8 + ix]; + } + } + } + float block0 = coefficients[0]; + float block1 = coefficients[8]; + coefficients[0] = (block0 + block1) * 0.5f; + coefficients[8] = (block0 - block1) * 0.5f; + break; + } + case Type::DCT4X4: { + PROFILER_ZONE("DCT 4"); + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + HWY_ALIGN float block[4 * 4]; + ComputeTransposedScaledDCT<4>()( + DCTFrom(pixels + y * 4 * pixels_stride + x * 4, pixels_stride), + block, scratch_space); + for (size_t iy = 0; iy < 4; iy++) { + for (size_t ix = 0; ix < 4; ix++) { + coefficients[(y + iy * 2) * 8 + x + ix * 2] = block[iy * 4 + ix]; + } + } + } + } + float block00 = coefficients[0]; + float block01 = coefficients[1]; + float block10 = coefficients[8]; + float block11 = coefficients[9]; + coefficients[0] = (block00 + block01 + block10 + block11) * 0.25f; + coefficients[1] = (block00 + block01 - block10 - block11) * 0.25f; + coefficients[8] = (block00 - block01 + block10 - block11) * 0.25f; + coefficients[9] = (block00 - block01 - block10 + block11) * 0.25f; + break; + } + case Type::DCT2X2: { + PROFILER_ZONE("DCT 2"); + DCT2TopBlock<8>(pixels, pixels_stride, coefficients); + 
DCT2TopBlock<4>(coefficients, kBlockDim, coefficients); + DCT2TopBlock<2>(coefficients, kBlockDim, coefficients); + break; + } + case Type::DCT16X16: { + PROFILER_ZONE("DCT 16"); + ComputeTransposedScaledDCT<16>()(DCTFrom(pixels, pixels_stride), + coefficients, scratch_space); + break; + } + case Type::DCT16X8: { + PROFILER_ZONE("DCT 16x8"); + ComputeScaledDCT<16, 8>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT8X16: { + PROFILER_ZONE("DCT 8x16"); + ComputeScaledDCT<8, 16>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT32X8: { + PROFILER_ZONE("DCT 32x8"); + ComputeScaledDCT<32, 8>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT8X32: { + PROFILER_ZONE("DCT 8x32"); + ComputeScaledDCT<8, 32>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT32X16: { + PROFILER_ZONE("DCT 32x16"); + ComputeScaledDCT<32, 16>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT16X32: { + PROFILER_ZONE("DCT 16x32"); + ComputeScaledDCT<16, 32>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT32X32: { + PROFILER_ZONE("DCT 32"); + ComputeTransposedScaledDCT<32>()(DCTFrom(pixels, pixels_stride), + coefficients, scratch_space); + break; + } + case Type::DCT: { + PROFILER_ZONE("DCT 8"); + ComputeTransposedScaledDCT<8>()(DCTFrom(pixels, pixels_stride), + coefficients, scratch_space); + break; + } + case Type::AFV0: { + PROFILER_ZONE("AFV0"); + AFVTransformFromPixels<0>(pixels, pixels_stride, coefficients); + break; + } + case Type::AFV1: { + PROFILER_ZONE("AFV1"); + AFVTransformFromPixels<1>(pixels, pixels_stride, coefficients); + break; + } + case Type::AFV2: { + PROFILER_ZONE("AFV2"); + AFVTransformFromPixels<2>(pixels, pixels_stride, coefficients); + break; + } + case Type::AFV3: { + PROFILER_ZONE("AFV3"); + 
AFVTransformFromPixels<3>(pixels, pixels_stride, coefficients); + break; + } + case Type::DCT64X64: { + PROFILER_ZONE("DCT 64x64"); + ComputeTransposedScaledDCT<64>()(DCTFrom(pixels, pixels_stride), + coefficients, scratch_space); + break; + } + case Type::DCT64X32: { + PROFILER_ZONE("DCT 64x32"); + ComputeScaledDCT<64, 32>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT32X64: { + PROFILER_ZONE("DCT 32x64"); + ComputeScaledDCT<32, 64>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT128X128: { + PROFILER_ZONE("DCT 128x128"); + ComputeTransposedScaledDCT<128>()(DCTFrom(pixels, pixels_stride), + coefficients, scratch_space); + break; + } + case Type::DCT128X64: { + PROFILER_ZONE("DCT 128x64"); + ComputeScaledDCT<128, 64>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT64X128: { + PROFILER_ZONE("DCT 64x128"); + ComputeScaledDCT<64, 128>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT256X256: { + PROFILER_ZONE("DCT 256x256"); + ComputeTransposedScaledDCT<256>()(DCTFrom(pixels, pixels_stride), + coefficients, scratch_space); + break; + } + case Type::DCT256X128: { + PROFILER_ZONE("DCT 256x128"); + ComputeScaledDCT<256, 128>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::DCT128X256: { + PROFILER_ZONE("DCT 128x256"); + ComputeScaledDCT<128, 256>()(DCTFrom(pixels, pixels_stride), coefficients, + scratch_space); + break; + } + case Type::kNumValidStrategies: + JXL_ABORT("Invalid strategy"); + } +} + +HWY_MAYBE_UNUSED void DCFromLowestFrequencies(const AcStrategy::Type strategy, + const float* block, float* dc, + size_t dc_stride) { + using Type = AcStrategy::Type; + switch (strategy) { + case Type::DCT16X8: { + ReinterpretingIDCT( + block, 2 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT8X16: { + ReinterpretingIDCT( + block, 2 * 
kBlockDim, dc, dc_stride); + break; + } + case Type::DCT16X16: { + ReinterpretingIDCT( + block, 2 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT32X8: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT8X32: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT32X16: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT16X32: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT32X32: { + ReinterpretingIDCT( + block, 4 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT64X32: { + ReinterpretingIDCT( + block, 8 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT32X64: { + ReinterpretingIDCT( + block, 8 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT64X64: { + ReinterpretingIDCT( + block, 8 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT128X64: { + ReinterpretingIDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/8, /*ROWS=*/16, /*COLS=*/8>( + block, 16 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT64X128: { + ReinterpretingIDCT< + /*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/8, /*LF_COLS=*/16, /*ROWS=*/8, /*COLS=*/16>( + block, 16 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT128X128: { + ReinterpretingIDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>( + block, 16 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT256X128: { + ReinterpretingIDCT< + /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim, + /*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>( + block, 32 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT128X256: { + ReinterpretingIDCT< + /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim, + /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>( + 
block, 32 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT256X256: { + ReinterpretingIDCT< + /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim, + /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>( + block, 32 * kBlockDim, dc, dc_stride); + break; + } + case Type::DCT: + case Type::DCT2X2: + case Type::DCT4X4: + case Type::DCT4X8: + case Type::DCT8X4: + case Type::AFV0: + case Type::AFV1: + case Type::AFV2: + case Type::AFV3: + case Type::IDENTITY: + dc[0] = block[0]; + break; + case Type::kNumValidStrategies: + JXL_ABORT("Invalid strategy"); + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_ENC_TRANSFORMS_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms.cc new file mode 100644 index 0000000000..8978ba1dcb --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms.cc @@ -0,0 +1,41 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/enc_transforms.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_transforms.cc" +#include +#include + +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/enc_transforms-inl.h" + +namespace jxl { + +#if HWY_ONCE +HWY_EXPORT(TransformFromPixels); +void TransformFromPixels(const AcStrategy::Type strategy, + const float* JXL_RESTRICT pixels, size_t pixels_stride, + float* JXL_RESTRICT coefficients, + float* scratch_space) { + return HWY_DYNAMIC_DISPATCH(TransformFromPixels)( + strategy, pixels, pixels_stride, coefficients, scratch_space); +} + +HWY_EXPORT(DCFromLowestFrequencies); +void DCFromLowestFrequencies(AcStrategy::Type strategy, const float* block, + float* dc, size_t dc_stride) { + return HWY_DYNAMIC_DISPATCH(DCFromLowestFrequencies)(strategy, block, dc, + dc_stride); +} + +HWY_EXPORT(AFVDCT4x4); +void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs) { + return HWY_DYNAMIC_DISPATCH(AFVDCT4x4)(pixels, coeffs); +} +#endif // HWY_ONCE + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms.h new file mode 100644 index 0000000000..039ccc3893 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_transforms.h @@ -0,0 +1,32 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_TRANSFORMS_H_ +#define LIB_JXL_ENC_TRANSFORMS_H_ + +// Facade for (non-inlined) integral transforms. + +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +void TransformFromPixels(const AcStrategy::Type strategy, + const float* JXL_RESTRICT pixels, size_t pixels_stride, + float* JXL_RESTRICT coefficients, + float* JXL_RESTRICT scratch_space); + +// Equivalent of the above for DC image. 
+void DCFromLowestFrequencies(AcStrategy::Type strategy, const float* block, + float* dc, size_t dc_stride); + +void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs); + +} // namespace jxl + +#endif // LIB_JXL_ENC_TRANSFORMS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc new file mode 100644 index 0000000000..57383b1b8e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.cc @@ -0,0 +1,437 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/enc_xyb.h" + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc" +#include +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/transfer_functions-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::ShiftRight; + +// Returns cbrt(x) + add with 6 ulp max error. +// Modified from vectormath_exp.h, Apache 2 license. +// https://www.agner.org/optimize/vectorclass.zip +template +V CubeRootAndAdd(const V x, const V add) { + const HWY_FULL(float) df; + const HWY_FULL(int32_t) di; + + const auto kExpBias = Set(di, 0x54800000); // cast(1.) + cast(1.) 
/ 3 + const auto kExpMul = Set(di, 0x002AAAAA); // shifted 1/3 + const auto k1_3 = Set(df, 1.0f / 3); + const auto k4_3 = Set(df, 4.0f / 3); + + const auto xa = x; // assume inputs never negative + const auto xa_3 = k1_3 * xa; + + // Multiply exponent by -1/3 + const auto m1 = BitCast(di, xa); + // Special case for 0. 0 is represented with an exponent of 0, so the + // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse() + // sets those values as 0, which prevents having NaNs in the computations + // below. + const auto m2 = + IfThenZeroElse(m1 == Zero(di), kExpBias - (ShiftRight<23>(m1)) * kExpMul); + auto r = BitCast(df, m2); + + // Newton-Raphson iterations + for (int i = 0; i < 3; i++) { + const auto r2 = r * r; + r = NegMulAdd(xa_3, r2 * r2, k4_3 * r); + } + // Final iteration + auto r2 = r * r; + r = MulAdd(k1_3, NegMulAdd(xa, r2 * r2, r), r); + r2 = r * r; + r = MulAdd(r2, x, add); + + return r; +} + +// Ensures infinity norm is bounded. +void TestCubeRoot() { + const HWY_FULL(float) d; + float max_err = 0.0f; + for (uint64_t x5 = 0; x5 < 2000000; x5++) { + const float x = x5 * 1E-5f; + const float expected = cbrtf(x); + HWY_ALIGN float approx[MaxLanes(d)]; + Store(CubeRootAndAdd(Set(d, x), Zero(d)), d, approx); + + // All lanes are same + for (size_t i = 1; i < Lanes(d); ++i) { + JXL_ASSERT(std::abs(approx[0] - approx[i]) <= 1.2E-7f); + } + + const float err = std::abs(approx[0] - expected); + max_err = std::max(max_err, err); + } + // printf("max err %e\n", max_err); + JXL_ASSERT(max_err < 8E-7f); +} + +// 4x3 matrix * 3x1 SIMD vectors +template +JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b, + const float* JXL_RESTRICT premul_absorb, + V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1, + V* JXL_RESTRICT mixed2) { + const float* bias = &kOpsinAbsorbanceBias[0]; + const HWY_FULL(float) d; + const size_t N = Lanes(d); + const auto m0 = Load(d, premul_absorb + 0 * N); + const auto m1 = Load(d, premul_absorb + 1 * N); + 
const auto m2 = Load(d, premul_absorb + 2 * N); + const auto m3 = Load(d, premul_absorb + 3 * N); + const auto m4 = Load(d, premul_absorb + 4 * N); + const auto m5 = Load(d, premul_absorb + 5 * N); + const auto m6 = Load(d, premul_absorb + 6 * N); + const auto m7 = Load(d, premul_absorb + 7 * N); + const auto m8 = Load(d, premul_absorb + 8 * N); + *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0])))); + *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1])))); + *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2])))); +} + +template +void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx, + float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) { + const HWY_FULL(float) d; + const V half = Set(d, 0.5f); + Store(half * (r - g), d, valx); + Store(half * (r + g), d, valy); + Store(b, d, valz); +} + +// Converts one RGB vector to XYB. +template +void LinearRGBToXYB(const V r, const V g, const V b, + const float* JXL_RESTRICT premul_absorb, + float* JXL_RESTRICT valx, float* JXL_RESTRICT valy, + float* JXL_RESTRICT valz) { + V mixed0, mixed1, mixed2; + OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2); + + // mixed* should be non-negative even for wide-gamut, so clamp to zero. + mixed0 = ZeroIfNegative(mixed0); + mixed1 = ZeroIfNegative(mixed1); + mixed2 = ZeroIfNegative(mixed2); + + const HWY_FULL(float) d; + const size_t N = Lanes(d); + mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N)); + mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N)); + mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N)); + StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz); + + // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative. +} + +// Input/output uses the codec.h scaling: nominally 0-1 if in-gamut. 
+template +V LinearFromSRGB(V encoded) { + return TF_SRGB().DisplayFromEncoded(encoded); +} + +void LinearSRGBToXYB(const Image3F& linear, + const float* JXL_RESTRICT premul_absorb, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb) { + const size_t xsize = linear.xsize(); + + const HWY_FULL(float) d; + RunOnPool( + pool, 0, static_cast(linear.ysize()), ThreadPool::SkipInit(), + [&](const int task, const int /*thread*/) { + const size_t y = static_cast(task); + const float* JXL_RESTRICT row_in0 = linear.ConstPlaneRow(0, y); + const float* JXL_RESTRICT row_in1 = linear.ConstPlaneRow(1, y); + const float* JXL_RESTRICT row_in2 = linear.ConstPlaneRow(2, y); + float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); + float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); + float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto in_r = Load(d, row_in0 + x); + const auto in_g = Load(d, row_in1 + x); + const auto in_b = Load(d, row_in2 + x); + LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, + row_xyb1 + x, row_xyb2 + x); + } + }, + "LinearToXYB"); +} + +void SRGBToXYB(const Image3F& srgb, const float* JXL_RESTRICT premul_absorb, + ThreadPool* pool, Image3F* JXL_RESTRICT xyb) { + const size_t xsize = srgb.xsize(); + + const HWY_FULL(float) d; + RunOnPool( + pool, 0, static_cast(srgb.ysize()), ThreadPool::SkipInit(), + [&](const int task, const int /*thread*/) { + const size_t y = static_cast(task); + const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y); + const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y); + const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y); + float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); + float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); + float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x)); + const auto in_g = LinearFromSRGB(Load(d, row_srgb1 
+ x)); + const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x)); + LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, + row_xyb1 + x, row_xyb2 + x); + } + }, + "SRGBToXYB"); +} + +void SRGBToXYBAndLinear(const Image3F& srgb, + const float* JXL_RESTRICT premul_absorb, + ThreadPool* pool, Image3F* JXL_RESTRICT xyb, + Image3F* JXL_RESTRICT linear) { + const size_t xsize = srgb.xsize(); + + const HWY_FULL(float) d; + RunOnPool( + pool, 0, static_cast(srgb.ysize()), ThreadPool::SkipInit(), + [&](const int task, const int /*thread*/) { + const size_t y = static_cast(task); + const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y); + const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y); + const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y); + + float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y); + float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y); + float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y); + + float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); + float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); + float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); + + for (size_t x = 0; x < xsize; x += Lanes(d)) { + const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x)); + const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x)); + const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x)); + + Store(in_r, d, row_linear0 + x); + Store(in_g, d, row_linear1 + x); + Store(in_b, d, row_linear2 + x); + + LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, + row_xyb1 + x, row_xyb2 + x); + } + }, + "SRGBToXYBAndLinear"); +} + +// This is different from Butteraugli's OpsinDynamicsImage() in the sense that +// it does not contain a sensitivity multiplier based on the blurred image. 
+const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, + ImageBundle* const JXL_RESTRICT linear) { + PROFILER_FUNC; + + const size_t xsize = in.xsize(); + const size_t ysize = in.ysize(); + JXL_ASSERT(SameSize(in, *xyb)); + + const HWY_FULL(float) d; + // Pre-broadcasted constants + HWY_ALIGN float premul_absorb[MaxLanes(d) * 12]; + const size_t N = Lanes(d); + for (size_t i = 0; i < 9; ++i) { + const auto absorb = Set(d, kOpsinAbsorbanceMatrix[i] * + (in.metadata()->IntensityTarget() / 255.0f)); + Store(absorb, d, premul_absorb + i * N); + } + for (size_t i = 0; i < 3; ++i) { + const auto neg_bias_cbrt = Set(d, -cbrtf(kOpsinAbsorbanceBias[i])); + Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N); + } + + const bool want_linear = linear != nullptr; + + const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(in.IsGray()); + // Linear sRGB inputs are rare but can be useful for the fastest encoders, for + // which undoing the sRGB transfer function would be a large part of the cost. + if (c_linear_srgb.SameColorEncoding(in.c_current())) { + LinearSRGBToXYB(in.color(), premul_absorb, pool, xyb); + // This only happens if kitten or slower, moving ImageBundle might be + // possible but the encoder is much slower than this copy. + if (want_linear) { + *linear = in.Copy(); + return linear; + } + return ∈ + } + + // Common case: already sRGB, can avoid the color transform + if (in.IsSRGB()) { + // Common case: can avoid allocating/copying + if (!want_linear) { + SRGBToXYB(in.color(), premul_absorb, pool, xyb); + return ∈ + } + + // Slow encoder also wants linear sRGB. + linear->SetFromImage(Image3F(xsize, ysize), c_linear_srgb); + SRGBToXYBAndLinear(in.color(), premul_absorb, pool, xyb, linear->color()); + return linear; + } + + // General case: not sRGB, need color transform. + ImageBundle linear_storage; // Local storage only used if !want_linear. 
+ + ImageBundle* linear_storage_ptr; + if (want_linear) { + // Caller asked for linear, use that storage directly. + linear_storage_ptr = linear; + } else { + // Caller didn't ask for linear, create our own local storage + // OK to reuse metadata, it will not be changed. + linear_storage = ImageBundle(const_cast(in.metadata())); + linear_storage_ptr = &linear_storage; + } + + const ImageBundle* ptr; + JXL_CHECK( + TransformIfNeeded(in, c_linear_srgb, pool, linear_storage_ptr, &ptr)); + // If no transform was necessary, should have taken the above codepath. + JXL_ASSERT(ptr == linear_storage_ptr); + + LinearSRGBToXYB(*linear_storage_ptr->color(), premul_absorb, pool, xyb); + return want_linear ? linear : ∈ +} + +// Transform RGB to YCbCr. +// Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B). +void RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, + const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, + ImageF* cr_plane, ThreadPool* pool) { + const HWY_FULL(float) df; + const size_t S = Lanes(df); // Step. 
+ + const size_t xsize = r_plane.xsize(); + const size_t ysize = r_plane.ysize(); + if ((xsize == 0) || (ysize == 0)) return; + + // Full-range BT.601 as defined by JFIF Clause 7: + // https://www.itu.int/rec/T-REC-T.871-201105-I/en + const auto k128 = Set(df, 128.0f / 255); + const auto kR = Set(df, 0.299f); // NTSC luma + const auto kG = Set(df, 0.587f); + const auto kB = Set(df, 0.114f); + const auto kAmpR = Set(df, 0.701f); + const auto kAmpB = Set(df, 0.886f); + const auto kDiffR = kAmpR + kR; + const auto kDiffB = kAmpB + kB; + const auto kNormR = Set(df, 1.0f) / (kAmpR + kG + kB); + const auto kNormB = Set(df, 1.0f) / (kR + kG + kAmpB); + + constexpr size_t kGroupArea = kGroupDim * kGroupDim; + const size_t lines_per_group = DivCeil(kGroupArea, xsize); + const size_t num_stripes = DivCeil(ysize, lines_per_group); + const auto transform = [&](int idx, int /* thread*/) { + const size_t y0 = idx * lines_per_group; + const size_t y1 = std::min(y0 + lines_per_group, ysize); + for (size_t y = y0; y < y1; ++y) { + const float* r_row = r_plane.ConstRow(y); + const float* g_row = g_plane.ConstRow(y); + const float* b_row = b_plane.ConstRow(y); + float* y_row = y_plane->Row(y); + float* cb_row = cb_plane->Row(y); + float* cr_row = cr_plane->Row(y); + for (size_t x = 0; x < xsize; x += S) { + const auto r = Load(df, r_row + x); + const auto g = Load(df, g_row + x); + const auto b = Load(df, b_row + x); + const auto r_base = r * kR; + const auto r_diff = r * kDiffR; + const auto g_base = g * kG; + const auto b_base = b * kB; + const auto b_diff = b * kDiffB; + const auto y_base = r_base + g_base + b_base; + const auto y_vec = y_base - k128; + const auto cb_vec = (b_diff - y_base) * kNormB; + const auto cr_vec = (r_diff - y_base) * kNormR; + Store(y_vec, df, y_row + x); + Store(cb_vec, df, cb_row + x); + Store(cr_vec, df, cr_row + x); + } + } + }; + RunOnPool(pool, 0, static_cast(num_stripes), ThreadPool::SkipInit(), + transform, "RgbToYcbCr"); +} + +// 
NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(ToXYB); +const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, + ImageBundle* JXL_RESTRICT linear_storage) { + return HWY_DYNAMIC_DISPATCH(ToXYB)(in, pool, xyb, linear_storage); +} + +HWY_EXPORT(RgbToYcbcr); +void RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, + const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, + ImageF* cr_plane, ThreadPool* pool) { + return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane, + cb_plane, cr_plane, pool); +} + +HWY_EXPORT(TestCubeRoot); +void TestCubeRoot() { return HWY_DYNAMIC_DISPATCH(TestCubeRoot)(); } + +// DEPRECATED +Image3F OpsinDynamicsImage(const Image3B& srgb8) { + ImageMetadata metadata; + metadata.SetUintSamples(8); + metadata.color_encoding = ColorEncoding::SRGB(); + ImageBundle ib(&metadata); + ib.SetFromImage(ConvertToFloat(srgb8), metadata.color_encoding); + JXL_CHECK(ib.TransformTo(ColorEncoding::LinearSRGB(ib.IsGray()))); + ThreadPool* null_pool = nullptr; + Image3F xyb(srgb8.xsize(), srgb8.ysize()); + + ImageBundle linear_storage(&metadata); + (void)ToXYB(ib, null_pool, &xyb, &linear_storage); + return xyb; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.h new file mode 100644 index 0000000000..f30ae2f68b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/enc_xyb.h @@ -0,0 +1,45 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ENC_XYB_H_ +#define LIB_JXL_ENC_XYB_H_ + +// Converts to XYB color space. 
+ +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" + +namespace jxl { + +// Converts any color space to XYB. If `linear` is not null, returns `linear` +// after filling it with a linear sRGB copy of `in`. Otherwise, returns `&in`. +// +// NOTE this return value can avoid an extra color conversion if `in` would +// later be passed to JxlButteraugliComparator. +const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool, + Image3F* JXL_RESTRICT xyb, + ImageBundle* JXL_RESTRICT linear = nullptr); + +// Bt.601 to match JPEG/JFIF. Outputs _signed_ YCbCr values suitable for DCT, +// see F.1.1.3 of T.81 (because our data type is float, there is no need to add +// a bias to make the values unsigned). +void RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, + const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, + ImageF* cr_plane, ThreadPool* pool); + +// DEPRECATED, used by opsin_image_wrapper. +Image3F OpsinDynamicsImage(const Image3B& srgb8); + +// For opsin_image_test. +void TestCubeRoot(); + +} // namespace jxl + +#endif // LIB_JXL_ENC_XYB_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode.cc new file mode 100644 index 0000000000..f4e94d1412 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode.cc @@ -0,0 +1,471 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "jxl/encode.h" + +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_icc_codec.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" + +#define JPEGXL_MAJOR_VERSION 0 +#define JPEGXL_MINOR_VERSION 5 +#define JPEGXL_PATCH_VERSION 0 + +// Debug-printing failure macro similar to JXL_FAILURE, but for the status code +// JXL_ENC_ERROR +#ifdef JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(format, ...) \ + (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort(), JXL_ENC_ERROR) +#else // JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(format, ...) \ + (((JXL_DEBUG_ON_ERROR) && \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \ + JXL_ENC_ERROR) +#endif // JXL_CRASH_ON_ERROR + +namespace jxl {} // namespace jxl + +uint32_t JxlEncoderVersion(void) { + return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 + + JPEGXL_PATCH_VERSION; +} + +JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() { + jxl::MemoryManagerUniquePtr input_frame = + std::move(input_frame_queue[0]); + input_frame_queue.erase(input_frame_queue.begin()); + + // TODO(zond): If the frame queue is empty and the input_closed is true, + // then mark this frame as the last. 
+ + jxl::BitWriter writer; + + if (!wrote_bytes) { + if (use_container) { + output_byte_queue.insert( + output_byte_queue.end(), jxl::kContainerHeader, + jxl::kContainerHeader + sizeof(jxl::kContainerHeader)); + if (store_jpeg_metadata && jpeg_metadata.size() > 0) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_metadata.size(), + false, &output_byte_queue); + output_byte_queue.insert(output_byte_queue.end(), jpeg_metadata.begin(), + jpeg_metadata.end()); + } + } + if (!WriteHeaders(&metadata, &writer, nullptr)) { + return JXL_ENC_ERROR; + } + // Only send ICC (at least several hundred bytes) if fields aren't enough. + if (metadata.m.color_encoding.WantICC()) { + if (!jxl::WriteICC(metadata.m.color_encoding.ICC(), &writer, + jxl::kLayerHeader, nullptr)) { + return JXL_ENC_ERROR; + } + } + + // TODO(lode): preview should be added here if a preview image is added + + // Each frame should start on byte boundaries. + writer.ZeroPadToByte(); + } + + // TODO(zond): Handle progressive mode like EncodeFile does it. + // TODO(zond): Handle animation like EncodeFile does it, by checking if + // JxlEncoderCloseInput has been called and if the frame queue is + // empty (to see if it's the last animation frame). + + if (metadata.m.xyb_encoded) { + input_frame->option_values.cparams.color_transform = + jxl::ColorTransform::kXYB; + } else { + // TODO(zond): Figure out when to use kYCbCr instead. 
+ input_frame->option_values.cparams.color_transform = + jxl::ColorTransform::kNone; + } + + jxl::PassesEncoderState enc_state; + if (!jxl::EncodeFrame(input_frame->option_values.cparams, jxl::FrameInfo{}, + &metadata, input_frame->frame, &enc_state, + thread_pool.get(), &writer, + /*aux_out=*/nullptr)) { + return JXL_ENC_ERROR; + } + + jxl::PaddedBytes bytes = std::move(writer).TakeBytes(); + + if (use_container && !wrote_bytes) { + if (input_closed && input_frame_queue.empty()) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), bytes.size(), + /*unbounded=*/false, &output_byte_queue); + } else { + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), 0, /*unbounded=*/true, + &output_byte_queue); + } + } + + output_byte_queue.insert(output_byte_queue.end(), bytes.data(), + bytes.data() + bytes.size()); + wrote_bytes = true; + + last_used_cparams = input_frame->option_values.cparams; + + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetColorEncoding(JxlEncoder* enc, + const JxlColorEncoding* color) { + if (enc->color_encoding_set) { + // Already set + return JXL_ENC_ERROR; + } + if (!jxl::ConvertExternalToInternalColorEncoding( + *color, &enc->metadata.m.color_encoding)) { + return JXL_ENC_ERROR; + } + enc->color_encoding_set = true; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc, + const uint8_t* icc_profile, + size_t size) { + if (enc->color_encoding_set) { + // Already set + return JXL_ENC_ERROR; + } + jxl::PaddedBytes icc; + icc.assign(icc_profile, icc_profile + size); + if (!enc->metadata.m.color_encoding.SetICCRaw(std::move(icc))) { + return JXL_ENC_ERROR; + } + enc->color_encoding_set = true; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc, + const JxlBasicInfo* info) { + if (!enc->metadata.size.Set(info->xsize, info->ysize)) { + return JXL_ENC_ERROR; + } + if (info->exponent_bits_per_sample) { + if (info->exponent_bits_per_sample != 8) return JXL_ENC_NOT_SUPPORTED; + if 
(info->bits_per_sample == 32) { + enc->metadata.m.SetFloat32Samples(); + } else { + return JXL_ENC_NOT_SUPPORTED; + } + } else { + switch (info->bits_per_sample) { + case 32: + case 16: + case 8: + enc->metadata.m.SetUintSamples(info->bits_per_sample); + break; + default: + return JXL_ENC_ERROR; + break; + } + } + if (info->alpha_bits > 0 && info->alpha_exponent_bits > 0) { + return JXL_ENC_NOT_SUPPORTED; + } + switch (info->alpha_bits) { + case 0: + break; + case 32: + case 16: + enc->metadata.m.SetAlphaBits(16); + break; + case 8: + enc->metadata.m.SetAlphaBits(info->alpha_bits); + break; + default: + return JXL_ENC_ERROR; + break; + } + enc->metadata.m.xyb_encoded = !info->uses_original_profile; + enc->basic_info_set = true; + return JXL_ENC_SUCCESS; +} + +JxlEncoderOptions* JxlEncoderOptionsCreate(JxlEncoder* enc, + const JxlEncoderOptions* source) { + auto opts = + jxl::MemoryManagerMakeUnique(&enc->memory_manager); + if (!opts) return nullptr; + opts->enc = enc; + if (source != nullptr) { + opts->values = source->values; + } else { + opts->values.lossless = false; + } + JxlEncoderOptions* ret = opts.get(); + enc->encoder_options.emplace_back(std::move(opts)); + return ret; +} + +JxlEncoderStatus JxlEncoderOptionsSetLossless(JxlEncoderOptions* options, + const JXL_BOOL lossless) { + options->values.lossless = lossless; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderOptionsSetEffort(JxlEncoderOptions* options, + const int effort) { + if (effort < 3 || effort > 9) { + return JXL_ENC_ERROR; + } + options->values.cparams.speed_tier = static_cast(10 - effort); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderOptionsSetDistance(JxlEncoderOptions* options, + float distance) { + if (distance < 0 || distance > 15) { + return JXL_ENC_ERROR; + } + options->values.cparams.butteraugli_distance = distance; + return JXL_ENC_SUCCESS; +} + +JxlEncoder* JxlEncoderCreate(const JxlMemoryManager* memory_manager) { + JxlMemoryManager local_memory_manager; + if 
(!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) { + return nullptr; + } + + void* alloc = + jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlEncoder)); + if (!alloc) return nullptr; + JxlEncoder* enc = new (alloc) JxlEncoder(); + enc->memory_manager = local_memory_manager; + + return enc; +} + +void JxlEncoderReset(JxlEncoder* enc) { + enc->thread_pool.reset(); + enc->input_frame_queue.clear(); + enc->encoder_options.clear(); + enc->output_byte_queue.clear(); + enc->wrote_bytes = false; + enc->metadata = jxl::CodecMetadata(); + enc->last_used_cparams = jxl::CompressParams(); + enc->input_closed = false; + enc->basic_info_set = false; + enc->color_encoding_set = false; +} + +void JxlEncoderDestroy(JxlEncoder* enc) { + if (enc) { + // Call destructor directly since custom free function is used. + enc->~JxlEncoder(); + jxl::MemoryManagerFree(&enc->memory_manager, enc); + } +} + +JxlEncoderStatus JxlEncoderUseContainer(JxlEncoder* enc, + JXL_BOOL use_container) { + enc->use_container = static_cast(use_container); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderStoreJPEGMetadata(JxlEncoder* enc, + JXL_BOOL store_jpeg_metadata) { + enc->store_jpeg_metadata = static_cast(store_jpeg_metadata); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetParallelRunner(JxlEncoder* enc, + JxlParallelRunner parallel_runner, + void* parallel_runner_opaque) { + if (enc->thread_pool) return JXL_API_ERROR("parallel runner already set"); + enc->thread_pool = jxl::MemoryManagerMakeUnique( + &enc->memory_manager, parallel_runner, parallel_runner_opaque); + if (!enc->thread_pool) { + return JXL_ENC_ERROR; + } + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderAddJPEGFrame(const JxlEncoderOptions* options, + const uint8_t* buffer, size_t size) { + if (options->enc->input_closed) { + return JXL_ENC_ERROR; + } + + jxl::CodecInOut io; + if (!jxl::jpeg::DecodeImageJPG(jxl::Span(buffer, size), &io)) { + return JXL_ENC_ERROR; + } + + if 
(!options->enc->color_encoding_set) { + if (!SetColorEncodingFromJpegData( + *io.Main().jpeg_data, &options->enc->metadata.m.color_encoding)) { + return JXL_ENC_ERROR; + } + } + + if (!options->enc->basic_info_set) { + JxlBasicInfo basic_info; + basic_info.exponent_bits_per_sample = 0; + basic_info.bits_per_sample = 8; + basic_info.alpha_bits = 0; + basic_info.alpha_exponent_bits = 0; + basic_info.xsize = io.Main().jpeg_data->width; + basic_info.ysize = io.Main().jpeg_data->height; + basic_info.uses_original_profile = true; + if (JxlEncoderSetBasicInfo(options->enc, &basic_info) != JXL_ENC_SUCCESS) { + return JXL_ENC_ERROR; + } + } + + if (options->enc->metadata.m.xyb_encoded) { + // Can't XYB encode a lossless JPEG. + return JXL_ENC_ERROR; + } + + if (options->enc->store_jpeg_metadata) { + jxl::jpeg::JPEGData data_in = *io.Main().jpeg_data; + jxl::PaddedBytes jpeg_data; + if (!EncodeJPEGData(data_in, &jpeg_data)) { + return JXL_ENC_ERROR; + } + options->enc->jpeg_metadata = std::vector( + jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + } + + auto queued_frame = jxl::MemoryManagerMakeUnique( + &options->enc->memory_manager, + // JxlEncoderQueuedFrame is a struct with no constructors, so we use the + // default move constructor there. 
+ jxl::JxlEncoderQueuedFrame{options->values, + jxl::ImageBundle(&options->enc->metadata.m)}); + if (!queued_frame) { + return JXL_ENC_ERROR; + } + queued_frame->frame.SetFromImage(std::move(*io.Main().color()), + io.Main().c_current()); + queued_frame->frame.jpeg_data = std::move(io.Main().jpeg_data); + queued_frame->frame.color_transform = io.Main().color_transform; + queued_frame->frame.chroma_subsampling = io.Main().chroma_subsampling; + + if (options->values.lossless) { + queued_frame->option_values.cparams.SetLossless(); + } + + options->enc->input_frame_queue.emplace_back(std::move(queued_frame)); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderAddImageFrame(const JxlEncoderOptions* options, + const JxlPixelFormat* pixel_format, + const void* buffer, size_t size) { + if (!options->enc->basic_info_set || !options->enc->color_encoding_set) { + return JXL_ENC_ERROR; + } + + if (options->enc->input_closed) { + return JXL_ENC_ERROR; + } + + auto queued_frame = jxl::MemoryManagerMakeUnique( + &options->enc->memory_manager, + // JxlEncoderQueuedFrame is a struct with no constructors, so we use the + // default move constructor there. 
+ jxl::JxlEncoderQueuedFrame{options->values, + jxl::ImageBundle(&options->enc->metadata.m)}); + if (!queued_frame) { + return JXL_ENC_ERROR; + } + + if (pixel_format->data_type == JXL_TYPE_FLOAT16) { + // float16 is currently only supported in the decoder + return JXL_ENC_ERROR; + } + + jxl::ColorEncoding c_current; + if (options->enc->metadata.m.xyb_encoded) { + if (pixel_format->data_type == JXL_TYPE_FLOAT) { + c_current = + jxl::ColorEncoding::LinearSRGB(pixel_format->num_channels < 3); + } else { + c_current = jxl::ColorEncoding::SRGB(pixel_format->num_channels < 3); + } + } else { + c_current = options->enc->metadata.m.color_encoding; + } + + if (!jxl::BufferToImageBundle(*pixel_format, options->enc->metadata.xsize(), + options->enc->metadata.ysize(), buffer, size, + options->enc->thread_pool.get(), c_current, + &(queued_frame->frame))) { + return JXL_ENC_ERROR; + } + + if (options->values.lossless) { + queued_frame->option_values.cparams.SetLossless(); + } + + options->enc->input_frame_queue.emplace_back(std::move(queued_frame)); + return JXL_ENC_SUCCESS; +} + +void JxlEncoderCloseInput(JxlEncoder* enc) { enc->input_closed = true; } + +JxlEncoderStatus JxlEncoderProcessOutput(JxlEncoder* enc, uint8_t** next_out, + size_t* avail_out) { + while (*avail_out > 0 && + (!enc->output_byte_queue.empty() || !enc->input_frame_queue.empty())) { + if (!enc->output_byte_queue.empty()) { + size_t to_copy = std::min(*avail_out, enc->output_byte_queue.size()); + memcpy(static_cast(*next_out), enc->output_byte_queue.data(), + to_copy); + *next_out += to_copy; + *avail_out -= to_copy; + enc->output_byte_queue.erase(enc->output_byte_queue.begin(), + enc->output_byte_queue.begin() + to_copy); + } else if (!enc->input_frame_queue.empty()) { + if (enc->RefillOutputByteQueue() != JXL_ENC_SUCCESS) { + return JXL_ENC_ERROR; + } + } + } + + if (!enc->output_byte_queue.empty() || !enc->input_frame_queue.empty()) { + return JXL_ENC_NEED_MORE_OUTPUT; + } + return JXL_ENC_SUCCESS; +} + 
+// Stores the requested decoding speed tier in the options' compression
+// parameters. Only tiers 0 through 4 (inclusive) are accepted; any other value
+// leaves the options untouched and returns JXL_ENC_ERROR.
+JxlEncoderStatus JxlEncoderOptionsSetDecodingSpeed(JxlEncoderOptions* options,
+                                                   int tier) {
+  if (tier < 0 || tier > 4) {
+    return JXL_ENC_ERROR;
+  }
+  options->values.cparams.decoding_speed_tier = tier;
+  return JXL_ENC_SUCCESS;
+}
+
+// Fills `*color_encoding` with the external form of
+// jxl::ColorEncoding::SRGB(is_gray).
+void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding,
+                               JXL_BOOL is_gray) {
+  ConvertInternalToExternalColorEncoding(jxl::ColorEncoding::SRGB(is_gray),
+                                         color_encoding);
+}
+
+// Fills `*color_encoding` with the external form of
+// jxl::ColorEncoding::LinearSRGB(is_gray).
+void JxlColorEncodingSetToLinearSRGB(JxlColorEncoding* color_encoding,
+                                     JXL_BOOL is_gray) {
+  ConvertInternalToExternalColorEncoding(
+      jxl::ColorEncoding::LinearSRGB(is_gray), color_encoding);
+}
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode_internal.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode_internal.h
new file mode 100644
index 0000000000..f4ade2872a
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode_internal.h
@@ -0,0 +1,120 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+#ifndef LIB_JXL_ENCODE_INTERNAL_H_
+#define LIB_JXL_ENCODE_INTERNAL_H_
+
+// NOTE(review): the header name was lost in extraction; <vector> is inferred
+// from the std::vector members declared in this header -- confirm.
+#include <vector>
+
+#include "jxl/encode.h"
+#include "jxl/memory_manager.h"
+#include "jxl/parallel_runner.h"
+#include "jxl/types.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/memory_manager_internal.h"
+
+namespace jxl {
+
+typedef struct JxlEncoderOptionsValuesStruct {
+  // lossless is a separate setting from cparams because it is a combination
+  // setting that overrides multiple settings inside of cparams.
+  bool lossless;
+  jxl::CompressParams cparams;
+} JxlEncoderOptionsValues;
+
+// A frame waiting to be encoded, together with the option values that were
+// captured when it was queued.
+typedef struct JxlEncoderQueuedFrame {
+  JxlEncoderOptionsValues option_values;
+  jxl::ImageBundle frame;
+} JxlEncoderQueuedFrame;
+
+// Four-byte box type code.
+// NOTE(review): the template arguments were lost in extraction; <uint8_t, 4>
+// is inferred from MakeBoxType building it from four characters -- confirm.
+typedef std::array<uint8_t, 4> BoxType;
+
+// Utility function that makes a BoxType from a null terminated string literal.
+constexpr BoxType MakeBoxType(const char (&type)[5]) {
+  // Only the four type characters are copied; the terminating NUL is dropped.
+  return BoxType({static_cast<uint8_t>(type[0]), static_cast<uint8_t>(type[1]),
+                  static_cast<uint8_t>(type[2]),
+                  static_cast<uint8_t>(type[3])});
+}
+
+// Fixed prefix of a container file: a 12-byte "JXL " signature box followed by
+// a 20-byte "ftyp" box.
+constexpr unsigned char kContainerHeader[] = {
+    0,   0,   0, 0xc, 'J',  'X', 'L', ' ', 0xd, 0xa, 0x87,
+    0xa, 0,   0, 0,   0x14, 'f', 't', 'y', 'p', 'j', 'x',
+    'l', ' ', 0, 0,   0,    0,   'j', 'x', 'l', ' '};
+
+namespace {
+// Grows *vec by `size` zero bytes and returns a pointer to the first new byte.
+template <typename T>
+uint8_t* Extend(T* vec, size_t size) {
+  vec->resize(vec->size() + size, 0);
+  return vec->data() + vec->size() - size;
+}
+}  // namespace
+
+// Appends a JXL container box header with given type, size, and unbounded
+// properties to output.
+//
+// `size` is the payload size, excluding the header. The bytes written are a
+// 4-byte big-endian size, the 4-byte type and, for large boxes only, an 8-byte
+// big-endian size. The first size field is 0 for an unbounded box and 1 when
+// the 64-bit size follows. Every recorded size counts the header itself.
+template <typename T>
+void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded,
+                     T* output) {
+  uint64_t box_size = 0;
+  bool large_size = false;
+  if (!unbounded) {
+    // Widen before adding so the sum cannot wrap where size_t is 32 bits.
+    box_size = static_cast<uint64_t>(size) + 8;
+    if (box_size >= 0x100000000ull) {
+      large_size = true;
+      // The 64-bit size field is part of the header, so a large box has a
+      // 16-byte header rather than an 8-byte one.
+      box_size += 8;
+    }
+  }
+
+  StoreBE32(large_size ? 1u : static_cast<uint32_t>(box_size),
+            Extend(output, 4));
+
+  for (size_t i = 0; i < 4; i++) {
+    output->push_back(*(type.data() + i));
+  }
+
+  if (large_size) {
+    StoreBE64(box_size, Extend(output, 8));
+  }
+}
+
+}  // namespace jxl
+
+// NOTE(review): the template arguments on the members below were lost in
+// extraction and are reconstructed from how the members are used -- confirm,
+// in particular the element type of jpeg_metadata.
+struct JxlEncoderStruct {
+  JxlMemoryManager memory_manager;
+  jxl::MemoryManagerUniquePtr<jxl::ThreadPool> thread_pool{
+      nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)};
+  std::vector<jxl::MemoryManagerUniquePtr<JxlEncoderOptions>> encoder_options;
+
+  std::vector<jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedFrame>>
+      input_frame_queue;
+  std::vector<uint8_t> output_byte_queue;
+
+  bool use_container = false;
+  bool store_jpeg_metadata = false;
+  jxl::CodecMetadata metadata;
+  std::vector<uint8_t> jpeg_metadata;
+
+  bool wrote_bytes = false;
+  jxl::CompressParams last_used_cparams;
+
+  bool input_closed = false;
+  bool basic_info_set = false;
+  bool color_encoding_set = false;
+
+  // Takes the first frame in the input_frame_queue, encodes it, and appends the
+  // bytes to the output_byte_queue.
+  JxlEncoderStatus RefillOutputByteQueue();
+
+  // Appends the bytes of a JXL box header with the provided type and size to
+  // the end of the output_byte_queue.
If unbounded is true, the size won't be + // added to the header and the box will be assumed to continue until EOF. + void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded); +}; + +struct JxlEncoderOptionsStruct { + JxlEncoder* enc; + jxl::JxlEncoderOptionsValues values; +}; + +#endif // LIB_JXL_ENCODE_INTERNAL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode_test.cc new file mode 100644 index 0000000000..22425a8292 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/encode_test.cc @@ -0,0 +1,597 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "jxl/encode.h" + +#include "gtest/gtest.h" +#include "jxl/encode_cxx.h" +#include "lib/extras/codec.h" +#include "lib/jxl/dec_file.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/jpeg/dec_jpeg_data.h" +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" + +TEST(EncodeTest, AddFrameAfterCloseInputTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + JxlEncoderCloseInput(enc.get()); + + size_t xsize = 64; + size_t ysize = 64; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + 
/*is_gray=*/pixel_format.num_channels < 3); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddImageFrame(options, &pixel_format, pixels.data(), + pixels.size())); +} + +TEST(EncodeTest, AddJPEGAfterCloseTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + JxlEncoderCloseInput(enc.get()); + + const std::string jpeg_path = + "imagecompression.info/flower_foveon.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path); + jxl::CodecInOut orig_io; + ASSERT_TRUE( + SetFromBytes(jxl::Span(orig), &orig_io, /*pool=*/nullptr)); + + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddJPEGFrame(options, orig.data(), orig.size())); +} + +TEST(EncodeTest, AddFrameBeforeColorEncodingTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + size_t xsize = 64; + size_t ysize = 64; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddImageFrame(options, &pixel_format, pixels.data(), + pixels.size())); +} + +TEST(EncodeTest, AddFrameBeforeBasicInfoTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + + size_t xsize = 64; + size_t ysize = 64; + JxlPixelFormat 
pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/pixel_format.num_channels < 3); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_ERROR, + JxlEncoderAddImageFrame(options, &pixel_format, pixels.data(), + pixels.size())); +} + +TEST(EncodeTest, DefaultAllocTest) { + JxlEncoder* enc = JxlEncoderCreate(nullptr); + EXPECT_NE(nullptr, enc); + JxlEncoderDestroy(enc); +} + +TEST(EncodeTest, CustomAllocTest) { + struct CalledCounters { + int allocs = 0; + int frees = 0; + } counters; + + JxlMemoryManager mm; + mm.opaque = &counters; + mm.alloc = [](void* opaque, size_t size) { + reinterpret_cast(opaque)->allocs++; + return malloc(size); + }; + mm.free = [](void* opaque, void* address) { + reinterpret_cast(opaque)->frees++; + free(address); + }; + + { + JxlEncoderPtr enc = JxlEncoderMake(&mm); + EXPECT_NE(nullptr, enc.get()); + EXPECT_LE(1, counters.allocs); + EXPECT_EQ(0, counters.frees); + } + EXPECT_LE(1, counters.frees); +} + +TEST(EncodeTest, DefaultParallelRunnerTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetParallelRunner(enc.get(), nullptr, nullptr)); +} + +void VerifyFrameEncoding(size_t xsize, size_t ysize, JxlEncoder* enc, + const JxlEncoderOptions* options) { + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + jxl::CodecInOut input_io = + jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, 
&pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + if (options->values.lossless) { + basic_info.uses_original_profile = true; + } else { + basic_info.uses_original_profile = false; + } + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/pixel_format.num_channels < 3); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(options, &pixel_format, pixels.data(), + pixels.size())); + JxlEncoderCloseInput(enc); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + jxl::DecompressParams dparams; + jxl::CodecInOut decoded_io; + EXPECT_TRUE(jxl::DecodeFile( + dparams, jxl::Span(compressed.data(), compressed.size()), + &decoded_io, /*pool=*/nullptr)); + + jxl::ButteraugliParams ba; + EXPECT_LE(ButteraugliDistance(input_io, decoded_io, ba, + /*distmap=*/nullptr, nullptr), + 3.0f); +} + +void VerifyFrameEncoding(JxlEncoder* enc, const JxlEncoderOptions* options) { + VerifyFrameEncoding(63, 129, enc, options); +} + +TEST(EncodeTest, FrameEncodingTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + VerifyFrameEncoding(enc.get(), JxlEncoderOptionsCreate(enc.get(), nullptr)); +} + +TEST(EncodeTest, 
EncoderResetTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + VerifyFrameEncoding(50, 200, enc.get(), + JxlEncoderOptionsCreate(enc.get(), nullptr)); + // Encoder should become reusable for a new image from scratch after using + // reset. + JxlEncoderReset(enc.get()); + VerifyFrameEncoding(157, 77, enc.get(), + JxlEncoderOptionsCreate(enc.get(), nullptr)); +} + +TEST(EncodeTest, OptionsTest) { + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderOptionsSetEffort(options, 5)); + VerifyFrameEncoding(enc.get(), options); + EXPECT_EQ(jxl::SpeedTier::kHare, enc->last_used_cparams.speed_tier); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + // Lower than currently supported values + EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderOptionsSetEffort(options, 2)); + // Higher than currently supported values + EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderOptionsSetEffort(options, 10)); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderOptionsSetLossless(options, JXL_TRUE)); + VerifyFrameEncoding(enc.get(), options); + EXPECT_EQ(true, enc->last_used_cparams.IsLossless()); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderOptionsSetDistance(options, 0.5)); + VerifyFrameEncoding(enc.get(), options); + EXPECT_EQ(0.5, enc->last_used_cparams.butteraugli_distance); + } + + { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + // Disallowed 
negative distance
+    EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderOptionsSetDistance(options, -1));
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderOptionsSetDecodingSpeed(options, 2));
+    VerifyFrameEncoding(enc.get(), options);
+    EXPECT_EQ(2, enc->last_used_cparams.decoding_speed_tier);
+  }
+}
+
+namespace {
+// Returns a copy of buf from offset to offset+size, or a new zeroed vector if
+// the result would have been out of bounds taking integer overflow into
+// account.
+// NOTE(review): the Span/vector element types were lost in extraction and are
+// reconstructed from the uint8_t buffers these helpers are used with -- confirm.
+const std::vector<uint8_t> SliceSpan(const jxl::Span<const uint8_t>& buf,
+                                     size_t offset, size_t size) {
+  // Reject a wrapped sum first so the bounds comparison below is meaningful.
+  if (offset + size < offset) {
+    return std::vector<uint8_t>(size, 0);
+  }
+  // A slice that ends exactly at buf.size() is still in bounds.
+  if (offset + size > buf.size()) {
+    return std::vector<uint8_t>(size, 0);
+  }
+  return std::vector<uint8_t>(buf.data() + offset, buf.data() + offset + size);
+}
+
+struct Box {
+  // The type of the box.
+  // If "uuid", use extended_type instead
+  char type[4] = {0, 0, 0, 0};
+
+  // The extended_type is only used when type == "uuid".
+  // Extended types are not used in JXL. However, the box format itself
+  // supports this so they are handled correctly.
+  char extended_type[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // Box data.
+  jxl::Span<const uint8_t> data = jxl::Span<const uint8_t>(nullptr, 0);
+
+  // If the size is not given, the datasize extends to the end of the file.
+  // If this field is false, the size field is not encoded when the box is
+  // serialized.
+  bool data_size_given = true;
+
+  // If successful, returns true and sets `in` to be the rest data (if any).
+  // If `in` contains a box with a size larger than `in.size()`, will not
+  // modify `in`, and will return true but the data `Span` will
+  // remain set to nullptr.
+  // If unsuccessful, returns error and doesn't modify `in`.
+  jxl::Status Decode(jxl::Span<const uint8_t>* in) {
+    // Total box_size including this header itself.
+ uint64_t box_size = LoadBE32(SliceSpan(*in, 0, 4).data()); + size_t pos = 4; + + memcpy(type, SliceSpan(*in, pos, 4).data(), 4); + pos += 4; + + if (box_size == 1) { + // If the size is 1, it indicates extended size read from 64-bit integer. + box_size = LoadBE64(SliceSpan(*in, pos, 8).data()); + pos += 8; + } + + if (!memcmp("uuid", type, 4)) { + memcpy(extended_type, SliceSpan(*in, pos, 16).data(), 16); + pos += 16; + } + + // This is the end of the box header, the box data begins here. Handle + // the data size now. + const size_t header_size = pos; + + if (box_size != 0) { + if (box_size < header_size) { + return JXL_FAILURE("Invalid box size"); + } + if (box_size > in->size()) { + // The box is fine, but the input is too short. + return true; + } + data_size_given = true; + data = jxl::Span(in->data() + header_size, + box_size - header_size); + } else { + data_size_given = false; + data = jxl::Span(in->data() + header_size, + in->size() - header_size); + } + + *in = jxl::Span(in->data() + header_size + data.size(), + in->size() - header_size - data.size()); + return true; + } +}; + +struct Container { + std::vector boxes; + + // If successful, returns true and sets `in` to be the rest data (if any). + // If unsuccessful, returns error and doesn't modify `in`. 
+ jxl::Status Decode(jxl::Span* in) { + boxes.clear(); + + Box signature_box; + JXL_RETURN_IF_ERROR(signature_box.Decode(in)); + if (memcmp("JXL ", signature_box.type, 4) != 0) { + return JXL_FAILURE("Invalid magic signature"); + } + if (signature_box.data.size() != 4) + return JXL_FAILURE("Invalid magic signature"); + if (signature_box.data[0] != 0xd || signature_box.data[1] != 0xa || + signature_box.data[2] != 0x87 || signature_box.data[3] != 0xa) { + return JXL_FAILURE("Invalid magic signature"); + } + + Box ftyp_box; + JXL_RETURN_IF_ERROR(ftyp_box.Decode(in)); + if (memcmp("ftyp", ftyp_box.type, 4) != 0) { + return JXL_FAILURE("Invalid ftyp"); + } + if (ftyp_box.data.size() != 12) return JXL_FAILURE("Invalid ftyp"); + const char* expected = "jxl \0\0\0\0jxl "; + if (memcmp(expected, ftyp_box.data.data(), 12) != 0) + return JXL_FAILURE("Invalid ftyp"); + + while (in->size() > 0) { + Box box = {}; + JXL_RETURN_IF_ERROR(box.Decode(in)); + if (box.data.data() == nullptr) { + // The decoding encountered a box, but not enough data yet. 
+ return true; + } + boxes.emplace_back(box); + } + + return true; + } +}; + +} // namespace + +TEST(EncodeTest, SingleFrameBoundedJXLCTest) { + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + EXPECT_NE(nullptr, enc.get()); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), + true)); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + + size_t xsize = 71; + size_t ysize = 23; + JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}; + std::vector pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0); + + JxlBasicInfo basic_info; + jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format); + basic_info.xsize = xsize; + basic_info.ysize = ysize; + basic_info.uses_original_profile = false; + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, + /*is_gray=*/false); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddImageFrame(options, &pixel_format, pixels.data(), + pixels.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + Container container = {}; + jxl::Span encoded_span = + jxl::Span(compressed.data(), compressed.size()); + 
EXPECT_TRUE(container.Decode(&encoded_span)); + EXPECT_EQ(0, encoded_span.size()); + EXPECT_EQ(0, memcmp("jxlc", container.boxes[0].type, 4)); + EXPECT_EQ(true, container.boxes[0].data_size_given); +} + +TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) { + const std::string jpeg_path = + "imagecompression.info/flower_foveon.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path); + jxl::CodecInOut orig_io; + ASSERT_TRUE( + SetFromBytes(jxl::Span(orig), &orig_io, /*pool=*/nullptr)); + + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), JXL_TRUE)); + EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE)); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddJPEGFrame(options, orig.data(), orig.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + Container container = {}; + jxl::Span encoded_span = + jxl::Span(compressed.data(), compressed.size()); + EXPECT_TRUE(container.Decode(&encoded_span)); + EXPECT_EQ(0, encoded_span.size()); + EXPECT_EQ(0, memcmp("jbrd", container.boxes[0].type, 4)); + EXPECT_EQ(0, memcmp("jxlc", container.boxes[1].type, 4)); + + jxl::CodecInOut decoded_io; + decoded_io.Main().jpeg_data = 
jxl::make_unique(); + EXPECT_TRUE(jxl::jpeg::DecodeJPEGData(container.boxes[0].data, + decoded_io.Main().jpeg_data.get())); + + jxl::DecompressParams dparams; + dparams.keep_dct = true; + EXPECT_TRUE( + jxl::DecodeFile(dparams, container.boxes[1].data, &decoded_io, nullptr)); + + std::vector decoded_jpeg_bytes; + auto write = [&decoded_jpeg_bytes](const uint8_t* buf, size_t len) { + decoded_jpeg_bytes.insert(decoded_jpeg_bytes.end(), buf, buf + len); + return len; + }; + EXPECT_TRUE(jxl::jpeg::WriteJpeg(*decoded_io.Main().jpeg_data, write)); + + EXPECT_EQ(decoded_jpeg_bytes.size(), orig.size()); + EXPECT_EQ(0, memcmp(decoded_jpeg_bytes.data(), orig.data(), orig.size())); +} + +TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGFrameTest)) { + for (int skip_basic_info = 0; skip_basic_info < 2; skip_basic_info++) { + for (int skip_color_encoding = 0; skip_color_encoding < 2; + skip_color_encoding++) { + const std::string jpeg_path = + "imagecompression.info/flower_foveon.png.im_q85_420.jpg"; + const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path); + jxl::CodecInOut orig_io; + ASSERT_TRUE(SetFromBytes(jxl::Span(orig), &orig_io, + /*pool=*/nullptr)); + + JxlEncoderPtr enc = JxlEncoderMake(nullptr); + JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL); + + if (!skip_basic_info) { + JxlBasicInfo basic_info; + basic_info.exponent_bits_per_sample = 0; + basic_info.bits_per_sample = 8; + basic_info.alpha_bits = 0; + basic_info.alpha_exponent_bits = 0; + basic_info.xsize = orig_io.xsize(); + basic_info.ysize = orig_io.ysize(); + basic_info.uses_original_profile = true; + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetBasicInfo(enc.get(), &basic_info)); + } + if (!skip_color_encoding) { + JxlColorEncoding color_encoding; + JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false); + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderSetColorEncoding(enc.get(), &color_encoding)); + } + EXPECT_EQ(JXL_ENC_SUCCESS, + JxlEncoderAddJPEGFrame(options, orig.data(), 
orig.size())); + JxlEncoderCloseInput(enc.get()); + + std::vector compressed = std::vector(64); + uint8_t* next_out = compressed.data(); + size_t avail_out = compressed.size() - (next_out - compressed.data()); + JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT; + while (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + process_result = + JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out); + if (process_result == JXL_ENC_NEED_MORE_OUTPUT) { + size_t offset = next_out - compressed.data(); + compressed.resize(compressed.size() * 2); + next_out = compressed.data() + offset; + avail_out = compressed.size() - offset; + } + } + compressed.resize(next_out - compressed.data()); + EXPECT_EQ(JXL_ENC_SUCCESS, process_result); + + jxl::DecompressParams dparams; + jxl::CodecInOut decoded_io; + EXPECT_TRUE(jxl::DecodeFile( + dparams, + jxl::Span(compressed.data(), compressed.size()), + &decoded_io, /*pool=*/nullptr)); + + jxl::ButteraugliParams ba; + EXPECT_LE(ButteraugliDistance(orig_io, decoded_io, ba, + /*distmap=*/nullptr, nullptr), + 2.5f); + } + } +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc new file mode 100644 index 0000000000..40edd10445 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.cc @@ -0,0 +1,70 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+#include "lib/jxl/entropy_coder.h"
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_context_map.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+// Reads a block context map from `br` into `*block_ctx_map`.
+//
+// A leading 1 bit selects the default-constructed map. Otherwise the stream
+// carries three lists of DC thresholds, one list of QF thresholds and the
+// context map itself. Fails when the DC-context count times the QF-segment
+// count exceeds 64, when the context map cannot be decoded, or when the map
+// uses more than 16 distinct contexts.
+Status DecodeBlockCtxMap(BitReader* br, BlockCtxMap* block_ctx_map) {
+  if (br->ReadFixedBits<1>()) {
+    *block_ctx_map = BlockCtxMap();
+    return true;
+  }
+
+  auto& dc_thresholds = block_ctx_map->dc_thresholds;
+  auto& qf_thresholds = block_ctx_map->qf_thresholds;
+
+  // Each threshold list splits its range into size() + 1 segments; the DC
+  // context count is the product of the three per-list segment counts.
+  block_ctx_map->num_dc_ctxs = 1;
+  for (size_t c = 0; c < 3; ++c) {
+    auto& thresholds = dc_thresholds[c];
+    thresholds.resize(br->ReadFixedBits<4>());
+    block_ctx_map->num_dc_ctxs *= thresholds.size() + 1;
+    for (size_t k = 0; k < thresholds.size(); ++k) {
+      thresholds[k] = UnpackSigned(U32Coder::Read(kDCThresholdDist, br));
+    }
+  }
+
+  qf_thresholds.resize(br->ReadFixedBits<4>());
+  for (size_t k = 0; k < qf_thresholds.size(); ++k) {
+    qf_thresholds[k] = U32Coder::Read(kQFThresholdDist, br) + 1;
+  }
+
+  if (block_ctx_map->num_dc_ctxs * (qf_thresholds.size() + 1) > 64) {
+    return JXL_FAILURE("Invalid block context map: too big");
+  }
+
+  auto& map = block_ctx_map->ctx_map;
+  map.resize(3 * kNumOrders * block_ctx_map->num_dc_ctxs *
+             (qf_thresholds.size() + 1));
+  JXL_RETURN_IF_ERROR(DecodeContextMap(&map, &block_ctx_map->num_ctxs, br));
+  if (block_ctx_map->num_ctxs > 16) {
+    return JXL_FAILURE("Invalid block context map: too many distinct contexts");
+  }
+  return true;
+}
+
+// constexpr uint8_t jxl::kDefaultCtxMap[]; // from ac_context.h
+
+} // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.h
new file mode 100644
index
0000000000..e4afa7a631
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder.h
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENTROPY_CODER_H_
+#define LIB_JXL_ENTROPY_CODER_H_
+
+#include
+#include
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+// Entropy coding and context modeling of DC and AC coefficients, as well as AC
+// strategy and quantization field.
+
+namespace jxl {
+
+// Predicts the value at column `x` of `row` from its top and left neighbours.
+// `row_top` may be nullptr when there is no row above.
+//   - top and left both present: their average, computed as (a + b + 1) / 2;
+//   - only left present: row[x - 1];
+//   - only top present (x == 0): row_top[x];
+//   - neither present: default_val.
+static JXL_INLINE int32_t PredictFromTopAndLeft(
+    const int32_t* const JXL_RESTRICT row_top,
+    const int32_t* const JXL_RESTRICT row, size_t x, int32_t default_val) {
+  const bool has_top = (row_top != nullptr);
+  if (x != 0) {
+    const int32_t left = row[x - 1];
+    return has_top ? (row_top[x] + left + 1) / 2 : left;
+  }
+  return has_top ? row_top[x] : default_val;
+}
+
+// Distribution used when reading the DC thresholds of a block context map.
+static constexpr U32Enc kDCThresholdDist(Bits(4), BitsOffset(8, 16),
+                                         BitsOffset(16, 272),
+                                         BitsOffset(32, 65808));
+
+// Distribution used when reading the QF thresholds of a block context map.
+static constexpr U32Enc kQFThresholdDist(Bits(2), BitsOffset(3, 4),
+                                         BitsOffset(5, 12), BitsOffset(8, 44));
+
+Status DecodeBlockCtxMap(BitReader* br, BlockCtxMap* block_ctx_map);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENTROPY_CODER_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder_test.cc
new file mode 100644
index 0000000000..cce1713d2b
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/entropy_coder_test.cc
@@ -0,0 +1,70 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO(deymo): Move these tests to dec_ans.h and common.h
+
+// NOTE(review): the two header names below were lost in extraction; they are
+// inferred from the uint32_t and std::mt19937 uses in this file -- confirm.
+#include <stdint.h>
+
+#include <random>
+
+#include "gtest/gtest.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+
+namespace jxl {
+namespace {
+
+// PackSigned/UnpackSigned must round-trip every value in [-31, 31], and the
+// packed form must stay below 63.
+TEST(EntropyCoderTest, PackUnpack) {
+  for (int32_t i = -31; i < 32; ++i) {
+    uint32_t packed = PackSigned(i);
+    EXPECT_LT(packed, 63);
+    int32_t unpacked = UnpackSigned(packed);
+    EXPECT_EQ(i, unpacked);
+  }
+}
+
+// Stand-in bit reader that serves one pre-recorded (nbits, bits) pair and
+// checks that the decoder asks for exactly `nbits` bits.
+struct DummyBitReader {
+  uint32_t nbits, bits;
+  void Consume(uint32_t nbits) {}
+  uint32_t PeekBits(uint32_t n) {
+    EXPECT_EQ(n, nbits);
+    return bits;
+  }
+};
+
+// Encodes 2^20 pseudo-random integers drawn from [0, limit] with `config`,
+// then checks that ANSSymbolReader::ReadHybridUintConfig recovers each one
+// from its token and extra bits. The RNG seed is fixed, so runs are
+// reproducible.
+// NOTE(review): the <uint32_t> template arguments were lost in extraction;
+// they are inferred from DummyBitReader's uint32_t fields -- confirm.
+void HybridUintRoundtrip(HybridUintConfig config, size_t limit = 1 << 24) {
+  std::mt19937 rng(0);
+  std::uniform_int_distribution<uint32_t> dist(0, limit);
+  constexpr size_t kNumIntegers = 1 << 20;
+  std::vector<uint32_t> integers(kNumIntegers);
+  std::vector<uint32_t> token(kNumIntegers);
+  std::vector<uint32_t> nbits(kNumIntegers);
+  std::vector<uint32_t> bits(kNumIntegers);
+  for (size_t i = 0; i < kNumIntegers; i++) {
+    integers[i] = dist(rng);
+    config.Encode(integers[i], &token[i], &nbits[i], &bits[i]);
+  }
+  for (size_t i = 0; i < kNumIntegers; i++) {
+    DummyBitReader br{nbits[i], bits[i]};
+    EXPECT_EQ(integers[i],
+              ANSSymbolReader::ReadHybridUintConfig(config, token[i], &br));
+  }
+}
+
+TEST(HybridUintTest, Test000) {
+  HybridUintRoundtrip(HybridUintConfig{0, 0, 0});
+}
+TEST(HybridUintTest, Test411) {
+  HybridUintRoundtrip(HybridUintConfig{4, 1, 1});
+}
+TEST(HybridUintTest, Test420) {
+  HybridUintRoundtrip(HybridUintConfig{4, 2, 0});
+}
+TEST(HybridUintTest, Test421) {
+  HybridUintRoundtrip(HybridUintConfig{4, 2, 1}, 256);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc
new file mode 100644
index 0000000000..1701203d8d
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/epf.cc
@@ -0,0 +1,684 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Edge-preserving smoothing: weighted average based on L1 patch similarity. + +#include "lib/jxl/epf.h" + +#include +#include +#include +#include +#include + +#include +#include +#include // std::accumulate +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/epf.cc" +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/filters.h" +#include "lib/jxl/filters_internal.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Vec; + +// The EPF logic treats 8x8 blocks as one unit, each with their own sigma. +// It should be possible to do two blocks at a time in AVX3 vectors, at some +// increase in complexity (broadcasting sigma0/1 to lanes 0..7 and 8..15). +using DF = HWY_CAPPED(float, GroupBorderAssigner::kPaddingXRound); +using DU = HWY_CAPPED(uint32_t, GroupBorderAssigner::kPaddingXRound); + +// kInvSigmaNum / 0.3 +constexpr float kMinSigma = -3.90524291751269967465540850526868f; + +DF df; + +JXL_INLINE Vec Weight(Vec sad, Vec inv_sigma, Vec thres) { + auto v = MulAdd(sad, inv_sigma, Set(DF(), 1.0f)); + auto v2 = v * v; + return IfThenZeroElse(v <= thres, v2); +} + +template +JXL_INLINE void AddPixelStep1(int row, const FilterRows& rows, size_t x, + Vec sad, Vec inv_sigma, + const LoopFilter& lf, Vec* JXL_RESTRICT X, + Vec* JXL_RESTRICT Y, Vec* JXL_RESTRICT B, + Vec* JXL_RESTRICT w) { + auto cx = aligned ? 
Load(DF(), rows.GetInputRow(row, 0) + x) + : LoadU(DF(), rows.GetInputRow(row, 0) + x); + auto cy = aligned ? Load(DF(), rows.GetInputRow(row, 1) + x) + : LoadU(DF(), rows.GetInputRow(row, 1) + x); + auto cb = aligned ? Load(DF(), rows.GetInputRow(row, 2) + x) + : LoadU(DF(), rows.GetInputRow(row, 2) + x); + + auto weight = Weight(sad, inv_sigma, Set(df, lf.epf_pass1_zeroflush)); + *w += weight; + *X = MulAdd(weight, cx, *X); + *Y = MulAdd(weight, cy, *Y); + *B = MulAdd(weight, cb, *B); +} + +template +JXL_INLINE void AddPixelStep2(int row, const FilterRows& rows, size_t x, + Vec rx, Vec ry, Vec rb, + Vec inv_sigma, const LoopFilter& lf, + Vec* JXL_RESTRICT X, Vec* JXL_RESTRICT Y, + Vec* JXL_RESTRICT B, + Vec* JXL_RESTRICT w) { + auto cx = aligned ? Load(DF(), rows.GetInputRow(row, 0) + x) + : LoadU(DF(), rows.GetInputRow(row, 0) + x); + auto cy = aligned ? Load(DF(), rows.GetInputRow(row, 1) + x) + : LoadU(DF(), rows.GetInputRow(row, 1) + x); + auto cb = aligned ? Load(DF(), rows.GetInputRow(row, 2) + x) + : LoadU(DF(), rows.GetInputRow(row, 2) + x); + + auto sad = AbsDiff(cx, rx) * Set(df, lf.epf_channel_scale[0]); + sad = MulAdd(AbsDiff(cy, ry), Set(df, lf.epf_channel_scale[1]), sad); + sad = MulAdd(AbsDiff(cb, rb), Set(df, lf.epf_channel_scale[2]), sad); + + auto weight = Weight(sad, inv_sigma, Set(df, lf.epf_pass2_zeroflush)); + + *w += weight; + *X = MulAdd(weight, cx, *X); + *Y = MulAdd(weight, cy, *Y); + *B = MulAdd(weight, cb, *B); +} + +template +void GaborishVector(const D df, const float* JXL_RESTRICT row_t, + const float* JXL_RESTRICT row_m, + const float* JXL_RESTRICT row_b, const V w0, const V w1, + const V w2, float* JXL_RESTRICT row_out) { +// Filter x0 is only aligned to blocks (8 floats = 32 bytes). For larger +// vectors, treat loads as unaligned (we manually align the Store). 
+#undef LoadMaybeU +#if HWY_CAP_GE512 +#define LoadMaybeU LoadU +#else +#define LoadMaybeU Load +#endif + + const auto t = LoadMaybeU(df, row_t); + const auto tl = LoadU(df, row_t - 1); + const auto tr = LoadU(df, row_t + 1); + const auto m = LoadMaybeU(df, row_m); + const auto l = LoadU(df, row_m - 1); + const auto r = LoadU(df, row_m + 1); + const auto b = LoadMaybeU(df, row_b); + const auto bl = LoadU(df, row_b - 1); + const auto br = LoadU(df, row_b + 1); + const auto sum0 = m; + const auto sum1 = (l + r) + (t + b); + const auto sum2 = (tl + tr) + (bl + br); + auto pixels = MulAdd(sum2, w2, MulAdd(sum1, w1, sum0 * w0)); + Store(pixels, df, row_out); +} + +void GaborishRow(const FilterRows& rows, const LoopFilter& /* lf */, + const FilterWeights& filter_weights, size_t x0, size_t x1, + size_t /*sigma_x_offset*/, size_t /* image_y_mod_8 */) { + JXL_DASSERT(x0 % Lanes(df) == 0); + + const float* JXL_RESTRICT gab_weights = filter_weights.gab_weights; + for (size_t c = 0; c < 3; c++) { + const float* JXL_RESTRICT row_t = rows.GetInputRow(-1, c); + const float* JXL_RESTRICT row_m = rows.GetInputRow(0, c); + const float* JXL_RESTRICT row_b = rows.GetInputRow(1, c); + float* JXL_RESTRICT row_out = rows.GetOutputRow(c); + + size_t ix = x0; + +#if HWY_CAP_GE512 + const HWY_FULL(float) dfull; // Gaborish is not block-dependent. + + // For AVX3, x0 might only be aligned to 8, not 16; if so, do a capped + // vector first to ensure full (Store-only!) alignment, then full vectors. 
+ const uintptr_t addr = reinterpret_cast(row_out + ix); + if ((addr % 64) != 0 && ix < x1) { + const auto w0 = Set(df, gab_weights[3 * c + 0]); + const auto w1 = Set(df, gab_weights[3 * c + 1]); + const auto w2 = Set(df, gab_weights[3 * c + 2]); + GaborishVector(df, row_t + ix, row_m + ix, row_b + ix, w0, w1, w2, + row_out + ix); + ix += Lanes(df); + } + + const auto wfull0 = Set(dfull, gab_weights[3 * c + 0]); + const auto wfull1 = Set(dfull, gab_weights[3 * c + 1]); + const auto wfull2 = Set(dfull, gab_weights[3 * c + 2]); + for (; ix + Lanes(dfull) <= x1; ix += Lanes(dfull)) { + GaborishVector(dfull, row_t + ix, row_m + ix, row_b + ix, wfull0, wfull1, + wfull2, row_out + ix); + } +#endif + + // Non-AVX3 loop, or last capped vector for AVX3, if necessary + const auto w0 = Set(df, gab_weights[3 * c + 0]); + const auto w1 = Set(df, gab_weights[3 * c + 1]); + const auto w2 = Set(df, gab_weights[3 * c + 2]); + for (; ix < x1; ix += Lanes(df)) { + GaborishVector(df, row_t + ix, row_m + ix, row_b + ix, w0, w1, w2, + row_out + ix); + } + } +} + +// Step 0: 5x5 plus-shaped kernel with 5 SADs per pixel (3x3 +// plus-shaped). So this makes this filter a 7x7 filter. 
+void Epf0Row(const FilterRows& rows, const LoopFilter& lf, + const FilterWeights& filter_weights, size_t x0, size_t x1, + size_t sigma_x_offset, size_t image_y_mod_8) { + JXL_DASSERT(x0 % Lanes(df) == 0); + const float* JXL_RESTRICT row_sigma = rows.GetSigmaRow(); + + float sm = lf.epf_pass0_sigma_scale; + float bsm = sm * lf.epf_border_sad_mul; + + HWY_ALIGN float sad_mul[kBlockDim] = {bsm, sm, sm, sm, sm, sm, sm, bsm}; + + if (image_y_mod_8 == 0 || image_y_mod_8 == kBlockDim - 1) { + for (size_t i = 0; i < kBlockDim; i += Lanes(df)) { + Store(Set(df, bsm), df, sad_mul + i); + } + } + + for (size_t x = x0; x < x1; x += Lanes(df)) { + size_t bx = (x + sigma_x_offset) / kBlockDim; + size_t ix = (x + sigma_x_offset) % kBlockDim; + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows.GetInputRow(0, c) + x); + Store(px, df, rows.GetOutputRow(c) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Set(DF(), row_sigma[bx]) * sm; + + decltype(Zero(df)) sads[12]; + for (size_t i = 0; i < 12; i++) sads[i] = Zero(df); + constexpr std::array sads_off[12] = { + {-2, 0}, {-1, -1}, {-1, 0}, {-1, 1}, {0, -2}, {0, -1}, + {0, 1}, {0, 2}, {1, -1}, {1, 0}, {1, 1}, {2, 0}, + }; + + // compute sads + // TODO(veluca): consider unrolling and optimizing this. 
+ for (size_t c = 0; c < 3; c++) { + auto scale = Set(df, lf.epf_channel_scale[c]); + for (size_t i = 0; i < 12; i++) { + auto sad = Zero(df); + constexpr std::array plus_off[] = { + {0, 0}, {-1, 0}, {0, -1}, {1, 0}, {0, 1}}; + for (size_t j = 0; j < 5; j++) { + const auto r11 = LoadU( + df, rows.GetInputRow(plus_off[j][0], c) + x + plus_off[j][1]); + const auto c11 = + LoadU(df, rows.GetInputRow(sads_off[i][0] + plus_off[j][0], c) + + x + sads_off[i][1] + plus_off[j][1]); + sad += AbsDiff(r11, c11); + } + sads[i] = MulAdd(sad, scale, sads[i]); + } + } + const auto x_cc = LoadU(df, rows.GetInputRow(0, 0) + x); + const auto y_cc = LoadU(df, rows.GetInputRow(0, 1) + x); + const auto b_cc = LoadU(df, rows.GetInputRow(0, 2) + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + for (size_t i = 0; i < 12; i++) { + AddPixelStep1(/*row=*/sads_off[i][0], rows, + x + sads_off[i][1], sads[i], inv_sigma, + lf, &X, &Y, &B, &w); + } + +#if JXL_HIGH_PRECISION + auto inv_w = Set(df, 1.0f) / w; +#else + auto inv_w = ApproximateReciprocal(w); +#endif + Store(X * inv_w, df, rows.GetOutputRow(0) + x); + Store(Y * inv_w, df, rows.GetOutputRow(1) + x); + Store(B * inv_w, df, rows.GetOutputRow(2) + x); + } +} + +// Step 1: 3x3 plus-shaped kernel with 5 SADs per pixel (also 3x3 +// plus-shaped). So this makes this filter a 5x5 filter. 
+void Epf1Row(const FilterRows& rows, const LoopFilter& lf, + const FilterWeights& filter_weights, size_t x0, size_t x1, + size_t sigma_x_offset, size_t image_y_mod_8) { + JXL_DASSERT(x0 % Lanes(df) == 0); + const float* JXL_RESTRICT row_sigma = rows.GetSigmaRow(); + + float sm = 1.0f; + float bsm = sm * lf.epf_border_sad_mul; + + HWY_ALIGN float sad_mul[kBlockDim] = {bsm, sm, sm, sm, sm, sm, sm, bsm}; + + if (image_y_mod_8 == 0 || image_y_mod_8 == kBlockDim - 1) { + for (size_t i = 0; i < kBlockDim; i += Lanes(df)) { + Store(Set(df, bsm), df, sad_mul + i); + } + } + + for (size_t x = x0; x < x1; x += Lanes(df)) { + size_t bx = (x + sigma_x_offset) / kBlockDim; + size_t ix = (x + sigma_x_offset) % kBlockDim; + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows.GetInputRow(0, c) + x); + Store(px, df, rows.GetOutputRow(c) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Set(DF(), row_sigma[bx]) * sm; + auto sad0 = Zero(df); + auto sad1 = Zero(df); + auto sad2 = Zero(df); + auto sad3 = Zero(df); + + // compute sads + for (size_t c = 0; c < 3; c++) { + // center px = 22, px above = 21 + auto t = Undefined(df); + + const auto p20 = Load(df, rows.GetInputRow(-2, c) + x); + const auto p21 = Load(df, rows.GetInputRow(-1, c) + x); + auto sad0c = AbsDiff(p20, p21); // SAD 2, 1 + + const auto p11 = LoadU(df, rows.GetInputRow(-1, c) + x - 1); + auto sad1c = AbsDiff(p11, p21); // SAD 1, 2 + + const auto p31 = LoadU(df, rows.GetInputRow(-1, c) + x + 1); + auto sad2c = AbsDiff(p31, p21); // SAD 3, 2 + + const auto p02 = LoadU(df, rows.GetInputRow(0, c) + x - 2); + const auto p12 = LoadU(df, rows.GetInputRow(0, c) + x - 1); + sad1c += AbsDiff(p02, p12); // SAD 1, 2 + sad0c += AbsDiff(p11, p12); // SAD 2, 1 + + const auto p22 = LoadU(df, rows.GetInputRow(0, c) + x); + t = AbsDiff(p12, p22); + sad1c += t; // SAD 1, 2 + sad2c += t; // SAD 3, 2 + t = AbsDiff(p22, p21); + auto sad3c = t; // SAD 2, 
3 + sad0c += t; // SAD 2, 1 + + const auto p32 = LoadU(df, rows.GetInputRow(0, c) + x + 1); + sad0c += AbsDiff(p31, p32); // SAD 2, 1 + t = AbsDiff(p22, p32); + sad1c += t; // SAD 1, 2 + sad2c += t; // SAD 3, 2 + + const auto p42 = LoadU(df, rows.GetInputRow(0, c) + x + 2); + sad2c += AbsDiff(p42, p32); // SAD 3, 2 + + const auto p13 = LoadU(df, rows.GetInputRow(1, c) + x - 1); + sad3c += AbsDiff(p13, p12); // SAD 2, 3 + + const auto p23 = Load(df, rows.GetInputRow(1, c) + x); + t = AbsDiff(p22, p23); + sad0c += t; // SAD 2, 1 + sad3c += t; // SAD 2, 3 + sad1c += AbsDiff(p13, p23); // SAD 1, 2 + + const auto p33 = LoadU(df, rows.GetInputRow(1, c) + x + 1); + sad2c += AbsDiff(p33, p23); // SAD 3, 2 + sad3c += AbsDiff(p33, p32); // SAD 2, 3 + + const auto p24 = Load(df, rows.GetInputRow(2, c) + x); + sad3c += AbsDiff(p24, p23); // SAD 2, 3 + + auto scale = Set(df, lf.epf_channel_scale[c]); + sad0 = MulAdd(sad0c, scale, sad0); + sad1 = MulAdd(sad1c, scale, sad1); + sad2 = MulAdd(sad2c, scale, sad2); + sad3 = MulAdd(sad3c, scale, sad3); + } + const auto x_cc = Load(df, rows.GetInputRow(0, 0) + x); + const auto y_cc = Load(df, rows.GetInputRow(0, 1) + x); + const auto b_cc = Load(df, rows.GetInputRow(0, 2) + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + // Top row + AddPixelStep1(/*row=*/-1, rows, x, sad0, inv_sigma, lf, + &X, &Y, &B, &w); + // Center + AddPixelStep1(/*row=*/0, rows, x - 1, sad1, inv_sigma, + lf, &X, &Y, &B, &w); + AddPixelStep1(/*row=*/0, rows, x + 1, sad2, inv_sigma, + lf, &X, &Y, &B, &w); + // Bottom + AddPixelStep1(/*row=*/1, rows, x, sad3, inv_sigma, lf, &X, + &Y, &B, &w); +#if JXL_HIGH_PRECISION + auto inv_w = Set(df, 1.0f) / w; +#else + auto inv_w = ApproximateReciprocal(w); +#endif + Store(X * inv_w, df, rows.GetOutputRow(0) + x); + Store(Y * inv_w, df, rows.GetOutputRow(1) + x); + Store(B * inv_w, df, rows.GetOutputRow(2) + x); + } +} + +// Step 2: 3x3 plus-shaped kernel with a single reference pixel, ran 
on +// the output of the previous step. +void Epf2Row(const FilterRows& rows, const LoopFilter& lf, + const FilterWeights& filter_weights, size_t x0, size_t x1, + size_t sigma_x_offset, size_t image_y_mod_8) { + JXL_DASSERT(x0 % Lanes(df) == 0); + const float* JXL_RESTRICT row_sigma = rows.GetSigmaRow(); + + float sm = lf.epf_pass2_sigma_scale; + float bsm = sm * lf.epf_border_sad_mul; + + HWY_ALIGN float sad_mul[kBlockDim] = {bsm, sm, sm, sm, sm, sm, sm, bsm}; + + if (image_y_mod_8 == 0 || image_y_mod_8 == kBlockDim - 1) { + for (size_t i = 0; i < kBlockDim; i += Lanes(df)) { + Store(Set(df, bsm), df, sad_mul + i); + } + } + + for (size_t x = x0; x < x1; x += Lanes(df)) { + size_t bx = (x + sigma_x_offset) / kBlockDim; + size_t ix = (x + sigma_x_offset) % kBlockDim; + + if (row_sigma[bx] < kMinSigma) { + for (size_t c = 0; c < 3; c++) { + auto px = Load(df, rows.GetInputRow(0, c) + x); + Store(px, df, rows.GetOutputRow(c) + x); + } + continue; + } + + const auto sm = Load(df, sad_mul + ix); + const auto inv_sigma = Set(DF(), row_sigma[bx]) * sm; + + const auto x_cc = Load(df, rows.GetInputRow(0, 0) + x); + const auto y_cc = Load(df, rows.GetInputRow(0, 1) + x); + const auto b_cc = Load(df, rows.GetInputRow(0, 2) + x); + + auto w = Set(df, 1); + auto X = x_cc; + auto Y = y_cc; + auto B = b_cc; + + // Top row + AddPixelStep2(/*row=*/-1, rows, x, x_cc, y_cc, b_cc, + inv_sigma, lf, &X, &Y, &B, &w); + // Center + AddPixelStep2(/*row=*/0, rows, x - 1, x_cc, y_cc, b_cc, + inv_sigma, lf, &X, &Y, &B, &w); + AddPixelStep2(/*row=*/0, rows, x + 1, x_cc, y_cc, b_cc, + inv_sigma, lf, &X, &Y, &B, &w); + // Bottom + AddPixelStep2(/*row=*/1, rows, x, x_cc, y_cc, b_cc, + inv_sigma, lf, &X, &Y, &B, &w); + +#if JXL_HIGH_PRECISION + auto inv_w = Set(df, 1.0f) / w; +#else + auto inv_w = ApproximateReciprocal(w); +#endif + Store(X * inv_w, df, rows.GetOutputRow(0) + x); + Store(Y * inv_w, df, rows.GetOutputRow(1) + x); + Store(B * inv_w, df, rows.GetOutputRow(2) + x); + } +} + 
+constexpr FilterDefinition kGaborishFilter{&GaborishRow, 1}; +constexpr FilterDefinition kEpf0Filter{&Epf0Row, 3}; +constexpr FilterDefinition kEpf1Filter{&Epf1Row, 2}; +constexpr FilterDefinition kEpf2Filter{&Epf2Row, 1}; + +void FilterPipelineInit(FilterPipeline* fp, const LoopFilter& lf, + const Image3F& in, const Rect& in_rect, + const Rect& image_rect, size_t image_ysize, + Image3F* out, const Rect& out_rect) { + JXL_DASSERT(lf.gab || lf.epf_iters > 0); + // All EPF filters use sigma so we need to compute it. + fp->compute_sigma = lf.epf_iters > 0; + + fp->num_filters = 0; + fp->storage_rows_used = 0; + // First filter always uses the input image. + fp->filters[0].SetInput(&in, in_rect, image_rect, image_ysize); + + if (lf.gab) { + fp->AddStep(kGaborishFilter); + } + + if (lf.epf_iters == 1) { + fp->AddStep(kEpf1Filter); + } else if (lf.epf_iters == 2) { + fp->AddStep(kEpf1Filter); + fp->AddStep(kEpf2Filter); + } else if (lf.epf_iters == 3) { + fp->AddStep(kEpf0Filter); + fp->AddStep(kEpf1Filter); + fp->AddStep(kEpf2Filter); + } + + // At least one of the filters was enabled so "num_filters" must be non-zero. + JXL_DASSERT(fp->num_filters > 0); + + // Set the output of the last filter as the output image. + fp->filters[fp->num_filters - 1].SetOutput(out, out_rect); + + // Walk the list of filters backwards to compute how many rows are needed. + size_t col_border = 0; + for (int i = fp->num_filters - 1; i >= 0; i--) { + // Compute the region where we need to apply this filter. Depending on the + // step we might need to compute a larger portion than the original rect + // because of the border needed by other stages. This is the range of valid + // output values we produce, however we run the filter over a larger region + // to make those values multiple of Lanes(df). 
+ const size_t x0 = + FilterPipeline::FilterStep::MaxLeftPadding(image_rect.x0()) - + col_border; + const size_t x1 = + FilterPipeline::FilterStep::MaxLeftPadding(image_rect.x0()) + + image_rect.xsize() + col_border; + + fp->filters[i].filter_x0 = x0 - x0 % Lanes(df); + fp->filters[i].filter_x1 = RoundUpTo(x1, Lanes(df)); + + // The extra border needed for future filtering. + fp->filters[i].output_col_border = col_border; + col_border += fp->filters[i].filter_def.border; + } + fp->total_border = col_border; + JXL_ASSERT(fp->total_border == lf.Padding()); + JXL_ASSERT(fp->total_border <= kMaxFilterBorder); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(FilterPipelineInit); // Local function + +// Mirror n floats starting at *p and store them before p. +JXL_INLINE void LeftMirror(float* p, size_t n) { + for (size_t i = 0; i < n; i++) { + *(p - 1 - i) = p[i]; + } +} + +// Mirror n floats starting at *(p - n) and store them at *p. 
+JXL_INLINE void RightMirror(float* p, size_t n) { + for (size_t i = 0; i < n; i++) { + p[i] = *(p - 1 - i); + } +} + +void ComputeSigma(const Rect& block_rect, PassesDecoderState* state) { + const LoopFilter& lf = state->shared->frame_header.loop_filter; + JXL_CHECK(lf.epf_iters > 0); + const AcStrategyImage& ac_strategy = state->shared->ac_strategy; + const float quant_scale = state->shared->quantizer.Scale(); + + const size_t sigma_stride = state->filter_weights.sigma.PixelsPerRow(); + const size_t sharpness_stride = state->shared->epf_sharpness.PixelsPerRow(); + + for (size_t by = 0; by < block_rect.ysize(); ++by) { + float* JXL_RESTRICT sigma_row = + block_rect.Row(&state->filter_weights.sigma, by); + const uint8_t* JXL_RESTRICT sharpness_row = + block_rect.ConstRow(state->shared->epf_sharpness, by); + AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by); + const int* const JXL_RESTRICT row_quant = + block_rect.ConstRow(state->shared->raw_quant_field, by); + + for (size_t bx = 0; bx < block_rect.xsize(); bx++) { + AcStrategy acs = acs_row[bx]; + size_t llf_x = acs.covered_blocks_x(); + if (!acs.IsFirstBlock()) continue; + // quant_scale is smaller for low quality. + // quant_scale is roughly 0.08 / butteraugli score. + // + // row_quant is smaller for low quality. + // row_quant is a quantization multiplier of form 1.0 / + // row_quant[bx] + // + // lf.epf_quant_mul is a parameter in the format + // kInvSigmaNum is a constant + float sigma_quant = + lf.epf_quant_mul / (quant_scale * row_quant[bx] * kInvSigmaNum); + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) { + float sigma = + sigma_quant * + lf.epf_sharp_lut[sharpness_row[bx + ix + iy * sharpness_stride]]; + // Avoid infinities. + sigma = std::min(-1e-4f, sigma); // TODO(veluca): remove this. + sigma_row[bx + ix + kSigmaPadding + + (iy + kSigmaPadding) * sigma_stride] = 1.0f / sigma; + } + } + // TODO(veluca): remove this padding. 
+ // Left padding with mirroring. + if (bx + block_rect.x0() == 0) { + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + LeftMirror( + sigma_row + kSigmaPadding + (iy + kSigmaPadding) * sigma_stride, + kSigmaBorder); + } + } + // Right padding with mirroring. + if (bx + block_rect.x0() + llf_x == + state->shared->frame_dim.xsize_blocks) { + for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { + RightMirror(sigma_row + kSigmaPadding + bx + llf_x + + (iy + kSigmaPadding) * sigma_stride, + kSigmaBorder); + } + } + // Offsets for row copying, in blocks. + size_t offset_before = bx + block_rect.x0() == 0 ? 1 : bx + kSigmaPadding; + size_t offset_after = + bx + block_rect.x0() + llf_x == state->shared->frame_dim.xsize_blocks + ? kSigmaPadding + llf_x + bx + kSigmaBorder + : kSigmaPadding + llf_x + bx; + size_t num = offset_after - offset_before; + // Above + if (by + block_rect.y0() == 0) { + for (size_t iy = 0; iy < kSigmaBorder; iy++) { + memcpy( + sigma_row + offset_before + + (kSigmaPadding - 1 - iy) * sigma_stride, + sigma_row + offset_before + (kSigmaPadding + iy) * sigma_stride, + num * sizeof(*sigma_row)); + } + } + // Below + if (by + block_rect.y0() + acs.covered_blocks_y() == + state->shared->frame_dim.ysize_blocks) { + for (size_t iy = 0; iy < kSigmaBorder; iy++) { + memcpy( + sigma_row + offset_before + + sigma_stride * (acs.covered_blocks_y() + kSigmaPadding + iy), + sigma_row + offset_before + + sigma_stride * + (acs.covered_blocks_y() + kSigmaPadding - 1 - iy), + num * sizeof(*sigma_row)); + } + } + } + } +} + +FilterPipeline* PrepareFilterPipeline( + PassesDecoderState* dec_state, const Rect& image_rect, const Image3F& input, + const Rect& input_rect, size_t image_ysize, size_t thread, + Image3F* JXL_RESTRICT out, const Rect& output_rect) { + const LoopFilter& lf = dec_state->shared->frame_header.loop_filter; + // image_rect, input and output must all have the same kPaddingXRound + // alignment for SIMD, but it doesn't need to be 0. 
+ JXL_DASSERT(image_rect.x0() % GroupBorderAssigner::kPaddingXRound == + input_rect.x0() % GroupBorderAssigner::kPaddingXRound); + JXL_DASSERT(image_rect.x0() % GroupBorderAssigner::kPaddingXRound == + output_rect.x0() % GroupBorderAssigner::kPaddingXRound); + + // We need enough pixels to access the padding and the rounding to + // GroupBorderAssigner::kPaddingXRound to the left of the image. + JXL_DASSERT(input_rect.x0() >= + input_rect.x0() % GroupBorderAssigner::kPaddingXRound + + lf.Padding()); + + JXL_DASSERT(image_rect.xsize() == input_rect.xsize()); + JXL_DASSERT(image_rect.xsize() == output_rect.xsize()); + FilterPipeline* fp = &(dec_state->filter_pipelines[thread]); + fp->image_rect = image_rect; + + HWY_DYNAMIC_DISPATCH(FilterPipelineInit) + (fp, lf, input, input_rect, image_rect, image_ysize, out, output_rect); + return fp; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/epf.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/epf.h new file mode 100644 index 0000000000..a2fd9d16f4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/epf.h @@ -0,0 +1,55 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_EPF_H_ +#define LIB_JXL_EPF_H_ + +// Fast SIMD "in-loop" edge preserving filter (adaptive, nonlinear). + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_cache.h" +#include "lib/jxl/filters.h" +#include "lib/jxl/passes_state.h" + +namespace jxl { + +// 4 * (sqrt(0.5)-1), so that Weight(sigma) = 0.5. +static constexpr float kInvSigmaNum = -1.1715728752538099024f; + +// Fills the `state->filter_weights.sigma` image with the precomputed sigma +// values in the area inside `block_rect`. Accesses the AC strategy, quant field +// and epf_sharpness fields in the corresponding positions. 
+void ComputeSigma(const Rect& block_rect, PassesDecoderState* state); + +// Applies Gaborish + EPF to the given `image_rect` part of the image (used to +// select the sigma values). Input pixels are taken from `input:input_rect`, and +// the filtering result is written to `out:output_rect`. `dec_state->sigma` must +// be padded with `kMaxFilterPadding/kBlockDim` values along the x axis. +// All rects must have the same alignment module +// GroupBorderAssigner::kPaddingXRound pixels. +// `input_rect`, `output_rect` and `image_rect` must all have the same size. +// At least `lf.Padding()` pixels must be accessible and contain valid values +// outside of `image_rect` in `input`. Also, depending on the implementation, +// more pixels in the input up to a vector size boundary should be accessible +// but may contain uninitialized data. +// +// This function only prepares and returns the pipeline, to perform the +// filtering process it must be called on all row from -lf.Padding() to +// image_rect.ysize() + lf.Padding() . +// +// Note: if the output_rect x0 or x1 are not a multiple of kPaddingXRound more +// pixels with potentially uninitialized data will be written to the output left +// and right of the requested rect up to a multiple of kPaddingXRound pixels. +FilterPipeline* PrepareFilterPipeline( + PassesDecoderState* dec_state, const Rect& image_rect, const Image3F& input, + const Rect& input_rect, size_t image_ysize, size_t thread, + Image3F* JXL_RESTRICT out, const Rect& output_rect); + +} // namespace jxl + +#endif // LIB_JXL_EPF_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fast_math-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fast_math-inl.h new file mode 100644 index 0000000000..60be66829a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fast_math-inl.h @@ -0,0 +1,175 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fast SIMD math ops (log2, encoder only, cos, erf for splines) + +#if defined(LIB_JXL_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_FAST_MATH_INL_H_ +#undef LIB_JXL_FAST_MATH_INL_H_ +#else +#define LIB_JXL_FAST_MATH_INL_H_ +#endif + +#include + +#include "lib/jxl/common.h" +#include "lib/jxl/rational_polynomial-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; + +// Computes base-2 logarithm like std::log2. Undefined if negative / NaN. +// L1 error ~3.9E-6 +template +V FastLog2f(const DF df, V x) { + // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2). + HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f), + HWY_REP4(1.4287160470083755E+00f), + HWY_REP4(7.4245873327820566E-01f)}; + HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f), + HWY_REP4(1.0096718572241148E+00f), + HWY_REP4(1.7409343003366853E-01f)}; + + const Rebind di; + const auto x_bits = BitCast(di, x); + + // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops + const auto exp_bits = x_bits - Set(di, 0x3f2aaaab); // = 2/3 + // Shifted exponent = log2; also used to clear mantissa. 
+ const auto exp_shifted = ShiftRight<23>(exp_bits); + const auto mantissa = BitCast(df, x_bits - ShiftLeft<23>(exp_shifted)); + const auto exp_val = ConvertTo(df, exp_shifted); + return EvalRationalPolynomial(df, mantissa - Set(df, 1.0f), p, q) + exp_val; +} + +// max relative error ~3e-7 +template +V FastPow2f(const DF df, V x) { + const Rebind di; + auto floorx = Floor(x); + auto exp = BitCast(df, ShiftLeft<23>(ConvertTo(di, floorx) + Set(di, 127))); + auto frac = x - floorx; + auto num = frac + Set(df, 1.01749063e+01); + num = MulAdd(num, frac, Set(df, 4.88687798e+01)); + num = MulAdd(num, frac, Set(df, 9.85506591e+01)); + num = num * exp; + auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02)); + den = MulAdd(den, frac, Set(df, -1.94414990e+01)); + den = MulAdd(den, frac, Set(df, 9.85506633e+01)); + return num / den; +} + +// max relative error ~3e-5 +template +V FastPowf(const DF df, V base, V exponent) { + return FastPow2f(df, FastLog2f(df, base) * exponent); +} + +// Computes cosine like std::cos. +// L1 error 7e-5. +template +V FastCosf(const DF df, V x) { + // Step 1: range reduction to [0, 2pi) + const auto pi2 = Set(df, kPi * 2.0f); + const auto pi2_inv = Set(df, 0.5f / kPi); + const auto npi2 = Floor(x * pi2_inv) * pi2; + const auto xmodpi2 = x - npi2; + // Step 2: range reduction to [0, pi] + const auto x_pi = Min(xmodpi2, pi2 - xmodpi2); + // Step 3: range reduction to [0, pi/2] + const auto above_pihalf = x_pi >= Set(df, kPi / 2.0f); + const auto x_pihalf = IfThenElse(above_pihalf, Set(df, kPi) - x_pi, x_pi); + // Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle + // duplication steps faster, on x/4. + const auto xs = x_pihalf * Set(df, 0.25f); + const auto x2 = xs * xs; + const auto x4 = x2 * x2; + const auto cosx_prescaling = + MulAdd(x4, Set(df, 0.06960438), + MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268))); + // Step 5: angle duplication. 
+ const auto cosx_scale1 = + MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562)); + const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1)); + // Step 6: change sign if needed. + const Rebind du; + auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf))); + return BitCast(df, signbit ^ BitCast(du, cosx_scale2)); +} + +// Computes the error function like std::erf. +// L1 error 7e-4. +template +V FastErff(const DF df, V x) { + // Formula from + // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations + // but constants have been recomputed. + const auto xle0 = x <= Zero(df); + const auto absx = Abs(x); + // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4 + const auto denom1 = + MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04)); + const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01)); + const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01)); + const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f)); + const auto denom5 = denom4 * denom4; + const auto inv_denom5 = Set(df, 1.0f) / denom5; + const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f)); + // Change sign if needed. 
+ const Rebind du; + auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0))); + return BitCast(df, signbit ^ BitCast(du, result)); +} + +inline float FastLog2f(float f) { + HWY_CAPPED(float, 1) D; + return GetLane(FastLog2f(D, Set(D, f))); +} + +inline float FastPow2f(float f) { + HWY_CAPPED(float, 1) D; + return GetLane(FastPow2f(D, Set(D, f))); +} + +inline float FastPowf(float b, float e) { + HWY_CAPPED(float, 1) D; + return GetLane(FastPowf(D, Set(D, b), Set(D, e))); +} + +inline float FastCosf(float f) { + HWY_CAPPED(float, 1) D; + return GetLane(FastCosf(D, Set(D, f))); +} + +inline float FastErff(float f) { + HWY_CAPPED(float, 1) D; + return GetLane(FastErff(D, Set(D, f))); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_FAST_MATH_INL_H_ + +#if HWY_ONCE + +namespace jxl { +inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); } +inline float FastPow2f(float f) { return HWY_STATIC_DISPATCH(FastPow2f)(f); } +inline float FastPowf(float b, float e) { + return HWY_STATIC_DISPATCH(FastPowf)(b, e); +} +inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); } +inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); } +} // namespace jxl + +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fast_math_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fast_math_test.cc new file mode 100644 index 0000000000..50c3bbb03a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fast_math_test.cc @@ -0,0 +1,280 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include + +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/fast_math_test.cc" +#include + +#include "lib/jxl/dec_xyb-inl.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/transfer_functions-inl.h" + +// Test utils +#include +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +HWY_NOINLINE void TestFastLog2() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(1e-7f, 1e3f); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const auto actual_v = FastLog2f(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float abs_err = std::abs(std::log2(f) - actual); + EXPECT_LT(abs_err, 2.9E-6) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastPow2() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(-100, 100); + float max_rel_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const auto actual_v = FastPow2f(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float expected = std::pow(2, f); + const float rel_err = std::abs(expected - actual) / expected; + EXPECT_LT(rel_err, 3.1E-7) << "f = " << f; + max_rel_err = std::max(max_rel_err, rel_err); + } + printf("max rel err %e\n", static_cast(max_rel_err)); +} + +HWY_NOINLINE void TestFastPow() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution distb(1e-3f, 1e3f); + std::uniform_real_distribution diste(-10, 10); + float max_rel_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float b = distb(rng); + const float e = diste(rng); + const auto actual_v = FastPowf(d, Set(d, b), Set(d, e)); + 
const float actual = GetLane(actual_v); + const float expected = std::pow(b, e); + const float rel_err = std::abs(expected - actual) / expected; + EXPECT_LT(rel_err, 3E-5) << "b = " << b << " e = " << e; + max_rel_err = std::max(max_rel_err, rel_err); + } + printf("max rel err %e\n", static_cast(max_rel_err)); +} + +HWY_NOINLINE void TestFastCos() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(-1e3f, 1e3f); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const auto actual_v = FastCosf(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float abs_err = std::abs(std::cos(f) - actual); + EXPECT_LT(abs_err, 7E-5) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastErf() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(-5.f, 5.f); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const auto actual_v = FastErff(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float abs_err = std::abs(std::erf(f) - actual); + EXPECT_LT(abs_err, 7E-4) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastSRGB() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(0.0f, 1.0f); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const auto actual_v = FastLinearToSRGB(d, Set(d, f)); + const float actual = GetLane(actual_v); + const float expected = GetLane(TF_SRGB().EncodedFromDisplay(d, Set(d, f))); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 1.2E-4) << "f = " << 
f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastPQEFD() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(0.0f, 1.0f); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const float actual = GetLane(TF_PQ().EncodedFromDisplay(d, Set(d, f))); + const float expected = TF_PQ().EncodedFromDisplay(f); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 7e-7) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastHLGEFD() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(0.0f, 1.0f); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const float actual = GetLane(TF_HLG().EncodedFromDisplay(d, Set(d, f))); + const float expected = TF_HLG().EncodedFromDisplay(f); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 5e-7) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFast709EFD() { + constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(0.0f, 1.0f); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const float actual = GetLane(TF_709().EncodedFromDisplay(d, Set(d, f))); + const float expected = TF_709().EncodedFromDisplay(f); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 2e-6) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastPQDFE() { + 
constexpr size_t kNumTrials = 1 << 23; + std::mt19937 rng(1); + std::uniform_real_distribution dist(0.0f, 1.0f); + float max_abs_err = 0; + HWY_FULL(float) d; + for (size_t i = 0; i < kNumTrials; i++) { + const float f = dist(rng); + const float actual = GetLane(TF_PQ().DisplayFromEncoded(d, Set(d, f))); + const float expected = TF_PQ().DisplayFromEncoded(f); + const float abs_err = std::abs(expected - actual); + EXPECT_LT(abs_err, 3E-6) << "f = " << f; + max_abs_err = std::max(max_abs_err, abs_err); + } + printf("max abs err %e\n", static_cast(max_abs_err)); +} + +HWY_NOINLINE void TestFastXYB() { + if (!HasFastXYBTosRGB8()) return; + ImageMetadata metadata; + ImageBundle ib(&metadata); + int scaling = 1; + int n = 256 * scaling; + float inv_scaling = 1.0f / scaling; + int kChunk = 32; + // The image is divided in chunks to reduce total memory usage. + for (int cr = 0; cr < n; cr += kChunk) { + for (int cg = 0; cg < n; cg += kChunk) { + for (int cb = 0; cb < n; cb += kChunk) { + Image3F chunk(kChunk * kChunk, kChunk); + for (int ir = 0; ir < kChunk; ir++) { + for (int ig = 0; ig < kChunk; ig++) { + for (int ib = 0; ib < kChunk; ib++) { + float r = (cr + ir) * inv_scaling; + float g = (cg + ig) * inv_scaling; + float b = (cb + ib) * inv_scaling; + chunk.PlaneRow(0, ir)[ig * kChunk + ib] = r * (1.0f / 255); + chunk.PlaneRow(1, ir)[ig * kChunk + ib] = g * (1.0f / 255); + chunk.PlaneRow(2, ir)[ig * kChunk + ib] = b * (1.0f / 255); + } + } + } + ib.SetFromImage(std::move(chunk), ColorEncoding::SRGB()); + Image3F xyb(kChunk * kChunk, kChunk); + std::vector roundtrip(kChunk * kChunk * kChunk * 3); + ToXYB(ib, nullptr, &xyb); + jxl::HWY_NAMESPACE::FastXYBTosRGB8( + xyb, Rect(xyb), Rect(xyb), nullptr, Rect(), /*is_rgba=*/false, + roundtrip.data(), xyb.xsize(), xyb.xsize() * 3); + for (int ir = 0; ir < kChunk; ir++) { + for (int ig = 0; ig < kChunk; ig++) { + for (int ib = 0; ib < kChunk; ib++) { + float r = (cr + ir) * inv_scaling; + float g = (cg + ig) * inv_scaling; + 
float b = (cb + ib) * inv_scaling; + size_t idx = ir * kChunk * kChunk + ig * kChunk + ib; + int rr = roundtrip[3 * idx]; + int rg = roundtrip[3 * idx + 1]; + int rb = roundtrip[3 * idx + 2]; + EXPECT_LT(abs(r - rr), 2) << "expected " << r << " got " << rr; + EXPECT_LT(abs(g - rg), 2) << "expected " << g << " got " << rg; + EXPECT_LT(abs(b - rb), 2) << "expected " << b << " got " << rb; + } + } + } + } + } + } +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class FastMathTargetTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(FastMathTargetTest); + +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastLog2); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPow2); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPow); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastCos); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastErf); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastSRGB); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQDFE); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQEFD); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastHLGEFD); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFast709EFD); +HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastXYB); + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/field_encodings.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/field_encodings.h new file mode 100644 index 0000000000..00d0880c71 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/field_encodings.h @@ -0,0 +1,123 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_FIELD_ENCODINGS_H_ +#define LIB_JXL_FIELD_ENCODINGS_H_ + +// Constants needed to encode/decode fields; avoids including the full fields.h. + +#include +#include + +#include + +#include "hwy/base.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +class Visitor; +class Fields { + public: + virtual ~Fields() = default; + virtual const char* Name() const = 0; + virtual Status VisitFields(Visitor* JXL_RESTRICT visitor) = 0; +}; + +// Distribution of U32 values for one particular selector. Represents either a +// power of two-sized range, or a single value. A separate type ensures this is +// only passed to the U32Enc ctor. +struct U32Distr { + // No need to validate - all `d` are legitimate. + constexpr explicit U32Distr(uint32_t d) : d(d) {} + + static constexpr uint32_t kDirect = 0x80000000u; + + constexpr bool IsDirect() const { return (d & kDirect) != 0; } + + // Only call if IsDirect(). + constexpr uint32_t Direct() const { return d & (kDirect - 1); } + + // Only call if !IsDirect(). + constexpr size_t ExtraBits() const { return (d & 0x1F) + 1; } + uint32_t Offset() const { return (d >> 5) & 0x3FFFFFF; } + + uint32_t d; +}; + +// A direct-coded 31-bit value occupying 2 bits in the bitstream. +constexpr U32Distr Val(uint32_t value) { + return U32Distr(value | U32Distr::kDirect); +} + +// Value - `offset` will be signaled in `bits` extra bits. +constexpr U32Distr BitsOffset(uint32_t bits, uint32_t offset) { + return U32Distr(((bits - 1) & 0x1F) + ((offset & 0x3FFFFFF) << 5)); +} + +// Value will be signaled in `bits` extra bits. +constexpr U32Distr Bits(uint32_t bits) { return BitsOffset(bits, 0); } + +// See U32Coder documentation in fields.h. +class U32Enc { + public: + constexpr U32Enc(const U32Distr d0, const U32Distr d1, const U32Distr d2, + const U32Distr d3) + : d_{d0, d1, d2, d3} {} + + // Returns the U32Distr at `selector` = 0..3, least-significant first. 
+ U32Distr GetDistr(const uint32_t selector) const { + JXL_ASSERT(selector < 4); + return d_[selector]; + } + + private: + U32Distr d_[4]; +}; + +// Returns bit with the given `index` (0 = least significant). +template +static inline constexpr uint64_t MakeBit(T index) { + return 1ULL << static_cast(index); +} + +// Returns vector of all possible values of an Enum type. Relies on each Enum +// providing an overload of EnumBits() that returns a bit array of its values, +// which implies values must be in [0, 64). +template +std::vector Values() { + uint64_t bits = EnumBits(Enum()); + + std::vector values; + values.reserve(hwy::PopCount(bits)); + + // For each 1-bit in bits: add its index as value + while (bits != 0) { + const int index = Num0BitsBelowLS1Bit_Nonzero(bits); + values.push_back(static_cast(index)); + bits &= bits - 1; // clear least-significant bit + } + return values; +} + +// Returns true if value is one of Values(). +template +Status EnumValid(const Enum value) { + if (static_cast(value) >= 64) { + return JXL_FAILURE("Value %u too large for %s\n", + static_cast(value), EnumName(Enum())); + } + const uint64_t bit = MakeBit(value); + if ((EnumBits(Enum()) & bit) == 0) { + return JXL_FAILURE("Invalid value %u for %s\n", + static_cast(value), EnumName(Enum())); + } + return true; +} + +} // namespace jxl + +#endif // LIB_JXL_FIELD_ENCODINGS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc new file mode 100644 index 0000000000..7f00c44610 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields.cc @@ -0,0 +1,985 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/fields.h" + +#include + +#include +#include + +#include "hwy/base.h" +#include "lib/jxl/base/bits.h" + +namespace jxl { + +namespace { + +// A bundle can be in one of three states concerning extensions: not-begun, +// active, ended. Bundles may be nested, so we need a stack of states. +class ExtensionStates { + public: + void Push() { + // Initial state = not-begun. + begun_ <<= 1; + ended_ <<= 1; + } + + // Clears current state; caller must check IsEnded beforehand. + void Pop() { + begun_ >>= 1; + ended_ >>= 1; + } + + // Returns true if state == active || state == ended. + Status IsBegun() const { return (begun_ & 1) != 0; } + // Returns true if state != not-begun && state != active. + Status IsEnded() const { return (ended_ & 1) != 0; } + + void Begin() { + JXL_ASSERT(!IsBegun()); + JXL_ASSERT(!IsEnded()); + begun_ += 1; + } + + void End() { + JXL_ASSERT(IsBegun()); + JXL_ASSERT(!IsEnded()); + ended_ += 1; + } + + private: + // Current state := least-significant bit of begun_ and ended_. + uint64_t begun_ = 0; + uint64_t ended_ = 0; +}; + +// Visitors generate Init/AllDefault/Read/Write logic for all fields. Each +// bundle's VisitFields member function calls visitor->U32 etc. We do not +// overload operator() because a function name is easier to search for. + +class VisitorBase : public Visitor { + public: + explicit VisitorBase(bool print_bundles = false) + : print_bundles_(print_bundles) {} + ~VisitorBase() override { JXL_ASSERT(depth_ == 0); } + + // This is the only call site of Fields::VisitFields. Adds tracing and + // ensures EndExtensions was called. + Status Visit(Fields* fields, const char* visitor_name) override { + fputs(visitor_name, stdout); // No newline; no effect if empty + if (print_bundles_) { + Trace("%s\n", print_bundles_ ? 
fields->Name() : ""); + } + + depth_ += 1; + JXL_ASSERT(depth_ <= Bundle::kMaxExtensions); + extension_states_.Push(); + + const Status ok = fields->VisitFields(this); + + if (ok) { + // If VisitFields called BeginExtensions, must also call + // EndExtensions. + JXL_ASSERT(!extension_states_.IsBegun() || extension_states_.IsEnded()); + } else { + // Failed, undefined state: don't care whether EndExtensions was + // called. + } + + extension_states_.Pop(); + JXL_ASSERT(depth_ != 0); + depth_ -= 1; + + return ok; + } + + // For visitors accepting a const Visitor, need to const-cast so we can call + // the non-const Visitor::VisitFields. NOTE: C is not modified except the + // `all_default` field by CanEncodeVisitor. + Status VisitConst(const Fields& t, const char* message) { + return Visit(const_cast(&t), message); + } + + // Derived types (overridden by InitVisitor because it is unsafe to read + // from *value there) + + Status Bool(bool default_value, bool* JXL_RESTRICT value) override { + uint32_t bits = *value ? 1 : 0; + JXL_RETURN_IF_ERROR(Bits(1, static_cast(default_value), &bits)); + JXL_DASSERT(bits <= 1); + *value = bits == 1; + return true; + } + + // Overridden by ReadVisitor and WriteVisitor. + // Called before any conditional visit based on "extensions". + // Overridden by ReadVisitor, CanEncodeVisitor and WriteVisitor. + Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override { + JXL_RETURN_IF_ERROR(U64(0, extensions)); + + extension_states_.Begin(); + return true; + } + + // Called after all extension fields (if any). Although non-extension + // fields could be visited afterward, we prefer the convention that + // extension fields are always the last to be visited. Overridden by + // ReadVisitor. + Status EndExtensions() override { + extension_states_.End(); + return true; + } + + protected: + // Prints indentation, . + JXL_FORMAT(2, 3) // 1-based plus one because member function + void Trace(const char* format, ...) 
const { + // Indentation. + printf("%*s", static_cast(2 * depth_), ""); + + va_list args; + va_start(args, format); + vfprintf(stdout, format, args); + va_end(args); + } + + private: + size_t depth_ = 0; // for indentation. + ExtensionStates extension_states_; + const bool print_bundles_; +}; + +struct InitVisitor : public VisitorBase { + Status Bits(const size_t /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status U32(const U32Enc /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status U64(const uint64_t default_value, + uint64_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status Bool(bool default_value, bool* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status F16(const float default_value, float* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + // Always visit conditional fields to ensure they are initialized. + Status Conditional(bool /*condition*/) override { return true; } + + Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT all_default) override { + // Just initialize this field and don't skip initializing others. + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return false; + } + + Status VisitNested(Fields* /*fields*/) override { + // Avoid re-initializing nested bundles (their ctors already called + // Bundle::Init for their fields). + return true; + } + + const char* VisitorName() override { return "InitVisitor"; } +}; + +// Similar to InitVisitor, but also initializes nested fields. 
+struct SetDefaultVisitor : public VisitorBase { + Status Bits(const size_t /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status U32(const U32Enc /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status U64(const uint64_t default_value, + uint64_t* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status Bool(bool default_value, bool* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + Status F16(const float default_value, float* JXL_RESTRICT value) override { + *value = default_value; + return true; + } + + // Always visit conditional fields to ensure they are initialized. + Status Conditional(bool /*condition*/) override { return true; } + + Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT all_default) override { + // Just initialize this field and don't skip initializing others. 
+ JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return false; + } + + const char* VisitorName() override { return "SetDefaultVisitor"; } +}; + +class AllDefaultVisitor : public VisitorBase { + public: + explicit AllDefaultVisitor(bool print_all_default) + : VisitorBase(print_all_default), print_all_default_(print_all_default) {} + + Status Bits(const size_t bits, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + if (print_all_default_) { + Trace(" u(%zu) = %u, default %u\n", bits, *value, default_value); + } + + all_default_ &= *value == default_value; + return true; + } + + Status U32(const U32Enc /*unused*/, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) override { + if (print_all_default_) { + Trace(" U32 = %u, default %u\n", *value, default_value); + } + + all_default_ &= *value == default_value; + return true; + } + + Status U64(const uint64_t default_value, + uint64_t* JXL_RESTRICT value) override { + if (print_all_default_) { + Trace(" U64 = %" PRIu64 ", default %" PRIu64 "\n", *value, + default_value); + } + + all_default_ &= *value == default_value; + return true; + } + + Status F16(const float default_value, float* JXL_RESTRICT value) override { + if (print_all_default_) { + Trace(" F16 = %.6f, default %.6f\n", static_cast(*value), + static_cast(default_value)); + } + all_default_ &= std::abs(*value - default_value) < 1E-6f; + return true; + } + + Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT /*all_default*/) override { + // Visit all fields so we can compute the actual all_default_ value. 
+ return false; + } + + bool AllDefault() const { return all_default_; } + + const char* VisitorName() override { return "AllDefaultVisitor"; } + + private: + const bool print_all_default_; + bool all_default_ = true; +}; + +class ReadVisitor : public VisitorBase { + public: + ReadVisitor(BitReader* reader, bool print_read) + : VisitorBase(print_read), print_read_(print_read), reader_(reader) {} + + Status Bits(const size_t bits, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + *value = BitsCoder::Read(bits, reader_); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + if (print_read_) Trace(" u(%zu) = %u\n", bits, *value); + return true; + } + + Status U32(const U32Enc dist, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + *value = U32Coder::Read(dist, reader_); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + if (print_read_) Trace(" U32 = %u\n", *value); + return true; + } + + Status U64(const uint64_t /*default_value*/, + uint64_t* JXL_RESTRICT value) override { + *value = U64Coder::Read(reader_); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + if (print_read_) Trace(" U64 = %" PRIu64 "\n", *value); + return true; + } + + Status F16(const float /*default_value*/, + float* JXL_RESTRICT value) override { + ok_ &= F16Coder::Read(reader_, value); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + if (print_read_) Trace(" F16 = %f\n", static_cast(*value)); + return true; + } + + void SetDefault(Fields* fields) override { Bundle::SetDefault(fields); } + + bool IsReading() const override { return true; } + + // This never fails because visitors are expected to keep reading until + // 
EndExtensions, see comment there. + Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override { + JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions)); + if (*extensions == 0) return true; + + // For each nonzero bit, i.e. extension that is present: + for (uint64_t remaining_extensions = *extensions; remaining_extensions != 0; + remaining_extensions &= remaining_extensions - 1) { + const size_t idx_extension = + Num0BitsBelowLS1Bit_Nonzero(remaining_extensions); + // Read additional U64 (one per extension) indicating the number of bits + // (allows skipping individual extensions). + JXL_RETURN_IF_ERROR(U64(0, &extension_bits_[idx_extension])); + if (!SafeAdd(total_extension_bits_, extension_bits_[idx_extension], + total_extension_bits_)) { + return JXL_FAILURE("Extension bits overflowed, invalid codestream"); + } + } + // Used by EndExtensions to skip past any _remaining_ extensions. + pos_after_ext_size_ = reader_->TotalBitsConsumed(); + JXL_ASSERT(pos_after_ext_size_ != 0); + return true; + } + + Status EndExtensions() override { + JXL_QUIET_RETURN_IF_ERROR(VisitorBase::EndExtensions()); + // Happens if extensions == 0: don't read size, done. + if (pos_after_ext_size_ == 0) return true; + + // Not enough bytes as set by BeginExtensions or earlier. Do not return + // this as an JXL_FAILURE or false (which can also propagate to error + // through e.g. JXL_RETURN_IF_ERROR), since this may be used while + // silently checking whether there are enough bytes. If this case must be + // treated as an error, reader_>Close() will do this, just like is already + // done for non-extension fields. + if (!enough_bytes_) return true; + + // Skip new fields this (old?) decoder didn't know about, if any. 
+ const size_t bits_read = reader_->TotalBitsConsumed(); + uint64_t end; + if (!SafeAdd(pos_after_ext_size_, total_extension_bits_, end)) { + return JXL_FAILURE("Invalid extension size, caused overflow"); + } + if (bits_read > end) { + return JXL_FAILURE("Read more extension bits than budgeted"); + } + const size_t remaining_bits = end - bits_read; + if (remaining_bits != 0) { + JXL_WARNING("Skipping %zu-bit extension(s)", remaining_bits); + reader_->SkipBits(remaining_bits); + if (!reader_->AllReadsWithinBounds()) { + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for header"); + } + } + return true; + } + + Status OK() const { return ok_; } + + const char* VisitorName() override { return "ReadVisitor"; } + + private: + const bool print_read_; + + // Whether any error other than not enough bytes occurred. + bool ok_ = true; + + // Whether there are enough input bytes to read from. + bool enough_bytes_ = true; + BitReader* const reader_; + // May be 0 even if the corresponding extension is present. + uint64_t extension_bits_[Bundle::kMaxExtensions] = {0}; + uint64_t total_extension_bits_ = 0; + size_t pos_after_ext_size_ = 0; // 0 iff extensions == 0. 
+}; + +class MaxBitsVisitor : public VisitorBase { + public: + Status Bits(const size_t bits, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT /*value*/) override { + max_bits_ += BitsCoder::MaxEncodedBits(bits); + return true; + } + + Status U32(const U32Enc enc, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT /*value*/) override { + max_bits_ += U32Coder::MaxEncodedBits(enc); + return true; + } + + Status U64(const uint64_t /*default_value*/, + uint64_t* JXL_RESTRICT /*value*/) override { + max_bits_ += U64Coder::MaxEncodedBits(); + return true; + } + + Status F16(const float /*default_value*/, + float* JXL_RESTRICT /*value*/) override { + max_bits_ += F16Coder::MaxEncodedBits(); + return true; + } + + Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT all_default) override { + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return false; // For max bits, assume nothing is default + } + + // Always visit conditional fields to get a (loose) upper bound. + Status Conditional(bool /*condition*/) override { return true; } + + Status BeginExtensions(uint64_t* JXL_RESTRICT /*extensions*/) override { + // Skip - extensions are not included in "MaxBits" because their length + // is potentially unbounded. 
+ return true; + } + + Status EndExtensions() override { return true; } + + size_t MaxBits() const { return max_bits_; } + + const char* VisitorName() override { return "MaxBitsVisitor"; } + + private: + size_t max_bits_ = 0; +}; + +class CanEncodeVisitor : public VisitorBase { + public: + explicit CanEncodeVisitor(bool print_sizes) + : VisitorBase(print_sizes), print_sizes_(print_sizes) {} + + Status Bits(const size_t bits, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + size_t encoded_bits = 0; + ok_ &= BitsCoder::CanEncode(bits, *value, &encoded_bits); + if (print_sizes_) Trace("u(%zu) = %u\n", bits, *value); + encoded_bits_ += encoded_bits; + return true; + } + + Status U32(const U32Enc enc, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + size_t encoded_bits = 0; + ok_ &= U32Coder::CanEncode(enc, *value, &encoded_bits); + if (print_sizes_) Trace("U32(%zu) = %u\n", encoded_bits, *value); + encoded_bits_ += encoded_bits; + return true; + } + + Status U64(const uint64_t /*default_value*/, + uint64_t* JXL_RESTRICT value) override { + size_t encoded_bits = 0; + ok_ &= U64Coder::CanEncode(*value, &encoded_bits); + if (print_sizes_) { + Trace("U64(%zu) = %" PRIu64 "\n", encoded_bits, *value); + } + encoded_bits_ += encoded_bits; + return true; + } + + Status F16(const float /*default_value*/, + float* JXL_RESTRICT value) override { + size_t encoded_bits = 0; + ok_ &= F16Coder::CanEncode(*value, &encoded_bits); + if (print_sizes_) { + Trace("F16(%zu) = %.6f\n", encoded_bits, static_cast(*value)); + } + encoded_bits_ += encoded_bits; + return true; + } + + Status AllDefault(const Fields& fields, + bool* JXL_RESTRICT all_default) override { + *all_default = Bundle::AllDefault(fields); + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return *all_default; + } + + Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override { + JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions)); + 
extensions_ = *extensions; + if (*extensions != 0) { + JXL_ASSERT(pos_after_ext_ == 0); + pos_after_ext_ = encoded_bits_; + JXL_ASSERT(pos_after_ext_ != 0); // visited "extensions" + } + return true; + } + // EndExtensions = default. + + Status GetSizes(size_t* JXL_RESTRICT extension_bits, + size_t* JXL_RESTRICT total_bits) { + JXL_RETURN_IF_ERROR(ok_); + *extension_bits = 0; + *total_bits = encoded_bits_; + // Only if extension field was nonzero will we encode their sizes. + if (pos_after_ext_ != 0) { + JXL_ASSERT(encoded_bits_ >= pos_after_ext_); + *extension_bits = encoded_bits_ - pos_after_ext_; + // Also need to encode *extension_bits and bill it to *total_bits. + size_t encoded_bits = 0; + ok_ &= U64Coder::CanEncode(*extension_bits, &encoded_bits); + *total_bits += encoded_bits; + + // TODO(janwas): support encoding individual extension sizes. We + // currently ascribe all bits to the first and send zeros for the + // others. + for (size_t i = 1; i < hwy::PopCount(extensions_); ++i) { + encoded_bits = 0; + ok_ &= U64Coder::CanEncode(0, &encoded_bits); + *total_bits += encoded_bits; + } + } + return true; + } + + const char* VisitorName() override { return "CanEncodeVisitor"; } + + private: + const bool print_sizes_; + bool ok_ = true; + size_t encoded_bits_ = 0; + uint64_t extensions_ = 0; + // Snapshot of encoded_bits_ after visiting the extension field, but NOT + // including the hidden extension sizes. 
+ uint64_t pos_after_ext_ = 0; +}; + +class WriteVisitor : public VisitorBase { + public: + WriteVisitor(const size_t extension_bits, BitWriter* JXL_RESTRICT writer) + : extension_bits_(extension_bits), writer_(writer) {} + + Status Bits(const size_t bits, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + ok_ &= BitsCoder::Write(bits, *value, writer_); + return true; + } + Status U32(const U32Enc enc, const uint32_t /*default_value*/, + uint32_t* JXL_RESTRICT value) override { + ok_ &= U32Coder::Write(enc, *value, writer_); + return true; + } + + Status U64(const uint64_t /*default_value*/, + uint64_t* JXL_RESTRICT value) override { + ok_ &= U64Coder::Write(*value, writer_); + return true; + } + + Status F16(const float /*default_value*/, + float* JXL_RESTRICT value) override { + ok_ &= F16Coder::Write(*value, writer_); + return true; + } + + Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override { + JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions)); + if (*extensions == 0) { + JXL_ASSERT(extension_bits_ == 0); + return true; + } + // TODO(janwas): extend API to pass in array of extension_bits, one per + // extension. We currently ascribe all bits to the first extension, but + // this is only an encoder limitation. NOTE: extension_bits_ can be zero + // if an extension does not require any additional fields. + ok_ &= U64Coder::Write(extension_bits_, writer_); + // For each nonzero bit except the lowest/first (already written): + for (uint64_t remaining_extensions = *extensions & (*extensions - 1); + remaining_extensions != 0; + remaining_extensions &= remaining_extensions - 1) { + ok_ &= U64Coder::Write(0, writer_); + } + return true; + } + // EndExtensions = default. 
+ + Status OK() const { return ok_; } + + const char* VisitorName() override { return "WriteVisitor"; } + + private: + const size_t extension_bits_; + BitWriter* JXL_RESTRICT writer_; + bool ok_ = true; +}; + +} // namespace + +void Bundle::Init(Fields* fields) { + InitVisitor visitor; + if (!visitor.Visit(fields, PrintVisitors() ? "-- Init\n" : "")) { + JXL_ABORT("Init should never fail"); + } +} +void Bundle::SetDefault(Fields* fields) { + SetDefaultVisitor visitor; + if (!visitor.Visit(fields, PrintVisitors() ? "-- SetDefault\n" : "")) { + JXL_ABORT("SetDefault should never fail"); + } +} +bool Bundle::AllDefault(const Fields& fields) { + AllDefaultVisitor visitor(/*print_all_default=*/PrintAllDefault()); + const char* name = + (PrintVisitors() || PrintAllDefault()) ? "[[AllDefault\n" : ""; + if (!visitor.VisitConst(fields, name)) { + JXL_ABORT("AllDefault should never fail"); + } + + if (PrintAllDefault()) printf(" %d]]\n", visitor.AllDefault()); + return visitor.AllDefault(); +} +size_t Bundle::MaxBits(const Fields& fields) { + MaxBitsVisitor visitor; +#if JXL_ENABLE_ASSERT + Status ret = +#else + (void) +#endif // JXL_ENABLE_ASSERT + visitor.VisitConst(fields, PrintVisitors() ? "-- MaxBits\n" : ""); + JXL_ASSERT(ret); + return visitor.MaxBits(); +} +Status Bundle::CanEncode(const Fields& fields, size_t* extension_bits, + size_t* total_bits) { + CanEncodeVisitor visitor(/*print_sizes=*/PrintSizes()); + const char* name = (PrintVisitors() || PrintSizes()) ? "[[CanEncode\n" : ""; + JXL_QUIET_RETURN_IF_ERROR(visitor.VisitConst(fields, name)); + JXL_QUIET_RETURN_IF_ERROR(visitor.GetSizes(extension_bits, total_bits)); + if (PrintSizes()) printf(" %zu]]\n", *total_bits); + return true; +} +Status Bundle::Read(BitReader* reader, Fields* fields) { + ReadVisitor visitor(reader, /*print_read=*/PrintRead()); + JXL_RETURN_IF_ERROR( + visitor.Visit(fields, PrintVisitors() ? 
"-- Read\n" : "")); + return visitor.OK(); +} +bool Bundle::CanRead(BitReader* reader, Fields* fields) { + ReadVisitor visitor(reader, /*print_read=*/PrintRead()); + Status status = visitor.Visit(fields, PrintVisitors() ? "-- Read\n" : ""); + // We are only checking here whether there are enough bytes. We still return + // true for other errors because it means there are enough bytes to determine + // there's an error. Use Read() to determine which error it is. + return status.code() != StatusCode::kNotEnoughBytes; +} +Status Bundle::Write(const Fields& fields, BitWriter* writer, size_t layer, + AuxOut* aux_out) { + size_t extension_bits, total_bits; + JXL_RETURN_IF_ERROR(CanEncode(fields, &extension_bits, &total_bits)); + + BitWriter::Allotment allotment(writer, total_bits); + WriteVisitor visitor(extension_bits, writer); + JXL_RETURN_IF_ERROR( + visitor.VisitConst(fields, PrintVisitors() ? "-- Write\n" : "")); + JXL_RETURN_IF_ERROR(visitor.OK()); + ReclaimAndCharge(writer, &allotment, layer, aux_out); + return true; +} + +size_t U32Coder::MaxEncodedBits(const U32Enc enc) { + size_t extra_bits = 0; + for (uint32_t selector = 0; selector < 4; ++selector) { + const U32Distr d = enc.GetDistr(selector); + if (d.IsDirect()) { + continue; + } else { + extra_bits = std::max(extra_bits, d.ExtraBits()); + } + } + return 2 + extra_bits; +} + +Status U32Coder::CanEncode(const U32Enc enc, const uint32_t value, + size_t* JXL_RESTRICT encoded_bits) { + uint32_t selector; + size_t total_bits; + const Status ok = ChooseSelector(enc, value, &selector, &total_bits); + *encoded_bits = ok ? total_bits : 0; + return ok; +} + +uint32_t U32Coder::Read(const U32Enc enc, BitReader* JXL_RESTRICT reader) { + const uint32_t selector = reader->ReadFixedBits<2>(); + const U32Distr d = enc.GetDistr(selector); + if (d.IsDirect()) { + return d.Direct(); + } else { + return reader->ReadBits(d.ExtraBits()) + d.Offset(); + } +} + +// Returns false if the value is too large to encode. 
+Status U32Coder::Write(const U32Enc enc, const uint32_t value, + BitWriter* JXL_RESTRICT writer) { + uint32_t selector; + size_t total_bits; + JXL_RETURN_IF_ERROR(ChooseSelector(enc, value, &selector, &total_bits)); + + writer->Write(2, selector); + + const U32Distr d = enc.GetDistr(selector); + if (!d.IsDirect()) { // Nothing more to write for direct encoding + const uint32_t offset = d.Offset(); + JXL_ASSERT(value >= offset); + writer->Write(total_bits - 2, value - offset); + } + + return true; +} + +Status U32Coder::ChooseSelector(const U32Enc enc, const uint32_t value, + uint32_t* JXL_RESTRICT selector, + size_t* JXL_RESTRICT total_bits) { +#if JXL_ENABLE_ASSERT + const size_t bits_required = 32 - Num0BitsAboveMS1Bit(value); +#endif // JXL_ENABLE_ASSERT + JXL_ASSERT(bits_required <= 32); + + *selector = 0; + *total_bits = 0; + + // It is difficult to verify whether Dist32Byte are sorted, so check all + // selectors and keep the one with the fewest total_bits. + *total_bits = 64; // more than any valid encoding + for (uint32_t s = 0; s < 4; ++s) { + const U32Distr d = enc.GetDistr(s); + if (d.IsDirect()) { + if (d.Direct() == value) { + *selector = s; + *total_bits = 2; + return true; // Done, direct is always the best possible. 
+ } + continue; + } + const size_t extra_bits = d.ExtraBits(); + const uint32_t offset = d.Offset(); + if (value < offset || value >= offset + (1ULL << extra_bits)) continue; + + // Better than prior encoding, remember it: + if (2 + extra_bits < *total_bits) { + *selector = s; + *total_bits = 2 + extra_bits; + } + } + + if (*total_bits == 64) { + return JXL_FAILURE("No feasible selector for %u", value); + } + + return true; +} + +uint64_t U64Coder::Read(BitReader* JXL_RESTRICT reader) { + uint64_t selector = reader->ReadFixedBits<2>(); + if (selector == 0) { + return 0; + } + if (selector == 1) { + return 1 + reader->ReadFixedBits<4>(); + } + if (selector == 2) { + return 17 + reader->ReadFixedBits<8>(); + } + + // selector 3, varint, groups have first 12, then 8, and last 4 bits. + uint64_t result = reader->ReadFixedBits<12>(); + + uint64_t shift = 12; + while (reader->ReadFixedBits<1>()) { + if (shift == 60) { + result |= static_cast(reader->ReadFixedBits<4>()) << shift; + break; + } + result |= static_cast(reader->ReadFixedBits<8>()) << shift; + shift += 8; + } + + return result; +} + +// Returns false if the value is too large to encode. +Status U64Coder::Write(uint64_t value, BitWriter* JXL_RESTRICT writer) { + if (value == 0) { + // Selector: use 0 bits, value 0 + writer->Write(2, 0); + } else if (value <= 16) { + // Selector: use 4 bits, value 1..16 + writer->Write(2, 1); + writer->Write(4, value - 1); + } else if (value <= 272) { + // Selector: use 8 bits, value 17..272 + writer->Write(2, 2); + writer->Write(8, value - 17); + } else { + // Selector: varint, first a 12-bit group, after that per 8-bit group. + writer->Write(2, 3); + writer->Write(12, value & 4095); + value >>= 12; + int shift = 12; + while (value > 0 && shift < 60) { + // Indicate varint not done + writer->Write(1, 1); + writer->Write(8, value & 255); + value >>= 8; + shift += 8; + } + if (value > 0) { + // This only could happen if shift == N - 4. 
+ writer->Write(1, 1); + writer->Write(4, value & 15); + // Implicitly closed sequence, no extra stop bit is required. + } else { + // Indicate end of varint + writer->Write(1, 0); + } + } + + return true; +} + +// Can always encode, but useful because it also returns bit size. +Status U64Coder::CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits) { + if (value == 0) { + *encoded_bits = 2; // 2 selector bits + } else if (value <= 16) { + *encoded_bits = 2 + 4; // 2 selector bits + 4 payload bits + } else if (value <= 272) { + *encoded_bits = 2 + 8; // 2 selector bits + 8 payload bits + } else { + *encoded_bits = 2 + 12; // 2 selector bits + 12 payload bits + value >>= 12; + int shift = 12; + while (value > 0 && shift < 60) { + *encoded_bits += 1 + 8; // 1 continuation bit + 8 payload bits + value >>= 8; + shift += 8; + } + if (value > 0) { + // This only could happen if shift == N - 4. + *encoded_bits += 1 + 4; // 1 continuation bit + 4 payload bits + } else { + *encoded_bits += 1; // 1 stop bit + } + } + + return true; +} + +Status F16Coder::Read(BitReader* JXL_RESTRICT reader, + float* JXL_RESTRICT value) { + const uint32_t bits16 = reader->ReadFixedBits<16>(); + const uint32_t sign = bits16 >> 15; + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + if (JXL_UNLIKELY(biased_exp == 31)) { + return JXL_FAILURE("F16 infinity or NaN are not supported"); + } + + // Subnormal or zero + if (JXL_UNLIKELY(biased_exp == 0)) { + *value = (1.0f / 16384) * (mantissa * (1.0f / 1024)); + if (sign) *value = -*value; + return true; + } + + // Normalized: convert the representation directly (faster than ldexp/tables). 
+ const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + memcpy(value, &bits32, sizeof(bits32)); + return true; +} + +Status F16Coder::Write(float value, BitWriter* JXL_RESTRICT writer) { + uint32_t bits32; + memcpy(&bits32, &value, sizeof(bits32)); + const uint32_t sign = bits32 >> 31; + const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF; + const uint32_t mantissa32 = bits32 & 0x7FFFFF; + + const int32_t exp = static_cast(biased_exp32) - 127; + if (JXL_UNLIKELY(exp > 15)) { + return JXL_FAILURE("Too big to encode, CanEncode should return false"); + } + + // Tiny or zero => zero. + if (exp < -24) { + writer->Write(16, 0); + return true; + } + + uint32_t biased_exp16, mantissa16; + + // exp = [-24, -15] => subnormal + if (JXL_UNLIKELY(exp < -14)) { + biased_exp16 = 0; + const uint32_t sub_exp = static_cast(-14 - exp); + JXL_ASSERT(1 <= sub_exp && sub_exp < 11); + mantissa16 = (1 << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp)); + } else { + // exp = [-14, 15] + biased_exp16 = static_cast(exp + 15); + JXL_ASSERT(1 <= biased_exp16 && biased_exp16 < 31); + mantissa16 = mantissa32 >> 13; + } + + JXL_ASSERT(mantissa16 < 1024); + const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; + JXL_ASSERT(bits16 < 0x10000); + writer->Write(16, bits16); + return true; +} + +Status F16Coder::CanEncode(float value, size_t* JXL_RESTRICT encoded_bits) { + *encoded_bits = MaxEncodedBits(); + if (std::isnan(value) || std::isinf(value)) { + return JXL_FAILURE("Should not attempt to store NaN and infinity"); + } + return std::abs(value) <= 65504.0f; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields.h new file mode 100644 index 0000000000..244b96ff73 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields.h @@ -0,0 +1,300 @@ +// 
Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_FIELDS_H_ +#define LIB_JXL_FIELDS_H_ + +// Forward/backward-compatible 'bundles' with auto-serialized 'fields'. + +#include +#include +#include +#include +#include + +#include +#include // abs +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +// Integer coders: BitsCoder (raw), U32Coder (table), U64Coder (varint). + +// Reads/writes a given (fixed) number of bits <= 32. +class BitsCoder { + public: + static size_t MaxEncodedBits(const size_t bits) { return bits; } + + static Status CanEncode(const size_t bits, const uint32_t value, + size_t* JXL_RESTRICT encoded_bits) { + *encoded_bits = bits; + if (value >= (1ULL << bits)) { + return JXL_FAILURE("Value %u too large for %zu bits", value, bits); + } + return true; + } + + static uint32_t Read(const size_t bits, BitReader* JXL_RESTRICT reader) { + return reader->ReadBits(bits); + } + + // Returns false if the value is too large to encode. + static Status Write(const size_t bits, const uint32_t value, + BitWriter* JXL_RESTRICT writer) { + if (value >= (1ULL << bits)) { + return JXL_FAILURE("Value %d too large to encode in %zu bits", value, + bits); + } + writer->Write(bits, value); + return true; + } +}; + +// Encodes u32 using a lookup table and/or extra bits, governed by a per-field +// encoding `enc` which consists of four distributions `d` chosen via a 2-bit +// selector (least significant = 0). 
Each d may have two modes: +// - direct: if d.IsDirect(), the value is d.Direct(); +// - offset: the value is derived from d.ExtraBits() extra bits plus d.Offset(); +// This encoding is denser than Exp-Golomb or Gamma codes when both small and +// large values occur. +// +// Examples: +// Direct: U32Enc(Val(8), Val(16), Val(32), Bits(6)), value 32 => 10b. +// Offset: U32Enc(Val(0), BitsOffset(1, 1), BitsOffset(2, 3), BitsOffset(8, 8)) +// defines the following prefix code: +// 00 -> 0 +// 01x -> 1..2 +// 10xx -> 3..7 +// 11xxxxxxxx -> 8..263 +class U32Coder { + public: + static size_t MaxEncodedBits(U32Enc enc); + static Status CanEncode(U32Enc enc, uint32_t value, + size_t* JXL_RESTRICT encoded_bits); + static uint32_t Read(U32Enc enc, BitReader* JXL_RESTRICT reader); + + // Returns false if the value is too large to encode. + static Status Write(U32Enc enc, uint32_t value, + BitWriter* JXL_RESTRICT writer); + + private: + static Status ChooseSelector(U32Enc enc, uint32_t value, + uint32_t* JXL_RESTRICT selector, + size_t* JXL_RESTRICT total_bits); +}; + +// Encodes 64-bit unsigned integers with a fixed distribution, taking 2 bits +// to encode 0, 6 bits to encode 1 to 16, 10 bits to encode 17 to 272, 15 bits +// to encode up to 4095, and on the order of log2(value) * 1.125 bits for +// larger values. +class U64Coder { + public: + static constexpr size_t MaxEncodedBits() { + return 2 + 12 + 6 * (8 + 1) + (4 + 1); + } + + static uint64_t Read(BitReader* JXL_RESTRICT reader); + + // Returns false if the value is too large to encode. + static Status Write(uint64_t value, BitWriter* JXL_RESTRICT writer); + + // Can always encode, but useful because it also returns bit size. + static Status CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits); +}; + +// IEEE 754 half-precision (binary16). Refuses to read/write NaN/Inf. 
+class F16Coder { + public: + static constexpr size_t MaxEncodedBits() { return 16; } + + // Returns false if the bit representation is NaN or infinity + static Status Read(BitReader* JXL_RESTRICT reader, float* JXL_RESTRICT value); + + // Returns false if the value is too large to encode. + static Status Write(float value, BitWriter* JXL_RESTRICT writer); + static Status CanEncode(float value, size_t* JXL_RESTRICT encoded_bits); +}; + +// A "bundle" is a forward- and backward compatible collection of fields. +// They are used for SizeHeader/FrameHeader/GroupHeader. Bundles can be +// extended by appending(!) fields. Optional fields may be omitted from the +// bitstream by conditionally visiting them. When reading new bitstreams with +// old code, we skip unknown fields at the end of the bundle. This requires +// storing the amount of extra appended bits, and that fields are visited in +// chronological order of being added to the format, because old decoders +// cannot skip some future fields and resume reading old fields. Similarly, +// new readers query bits in an "extensions" field to skip (groups of) fields +// not present in old bitstreams. Note that each bundle must include an +// "extensions" field prior to freezing the format, otherwise it cannot be +// extended. +// +// To ensure interoperability, there will be no opaque fields. +// +// HOWTO: +// - basic usage: define a struct with member variables ("fields") and a +// VisitFields(v) member function that calls v->U32/Bool etc. for each +// field, specifying their default values. The ctor must call +// Bundle::Init(this). +// +// - print a trace of visitors: ensure each bundle has a static Name() member +// function, and change Bundle::Print* to return true. +// +// - optional fields: in VisitFields, add if (v->Conditional(your_condition)) +// { v->Bool(default, &field); }. This prevents reading/writing field +// if !your_condition, which is typically computed from a prior field. 
+// WARNING: to ensure all fields are initialized, do not add an else branch; +// instead add another if (v->Conditional(!your_condition)). +// +// - repeated fields: for dynamic sizes, use e.g. std::vector and in +// VisitFields, if (v->IsReading()) field.resize(size) before accessing field. +// For static or bounded sizes, use an array or std::array. In all cases, +// simply visit each array element as if it were a normal field. +// +// - nested bundles: add a bundle as a normal field and in VisitFields call +// JXL_RETURN_IF_ERROR(v->VisitNested(&nested)); +// +// - allow future extensions: define a "uint64_t extensions" field and call +// v->BeginExtensions(&extensions) after visiting all non-extension fields, +// and `return v->EndExtensions();` after the last extension field. +// +// - encode an entire bundle in one bit if ALL its fields equal their default +// values: add a "mutable bool all_default" field and as the first visitor: +// if (v->AllDefault(*this, &all_default)) { +// // Overwrite all serialized fields, but not any nonserialized_*. +// v->SetDefault(this); +// return true; +// } +// Note: if extensions are present, AllDefault() == false. + +class Bundle { + public: + static constexpr size_t kMaxExtensions = 64; // bits in u64 + + // Print the type of each visitor called. + static constexpr bool PrintVisitors() { return false; } + // Print default value for each field and AllDefault result. + static constexpr bool PrintAllDefault() { return false; } + // Print values decoded for each field in Read. + static constexpr bool PrintRead() { return false; } + // Print size for each field and CanEncode total_bits. + static constexpr bool PrintSizes() { return false; } + + // Initializes fields to the default values. It is not recursive to nested + // fields, this function is intended to be called in the constructors so + // each nested field will already Init itself. 
+ static void Init(Fields* JXL_RESTRICT fields); + + // Similar to Init, but recursive to nested fields. + static void SetDefault(Fields* JXL_RESTRICT fields); + + // Returns whether ALL fields (including `extensions`, if present) are equal + // to their default value. + static bool AllDefault(const Fields& fields); + + // Returns max number of bits required to encode a T. + static size_t MaxBits(const Fields& fields); + + // Returns whether a header's fields can all be encoded, i.e. they have a + // valid representation. If so, "*total_bits" is the exact number of bits + // required. Called by Write. + static Status CanEncode(const Fields& fields, + size_t* JXL_RESTRICT extension_bits, + size_t* JXL_RESTRICT total_bits); + + static Status Read(BitReader* reader, Fields* JXL_RESTRICT fields); + + // Returns whether enough bits are available to fully read this bundle using + // Read. Also returns true in case of a codestream error (other than not being + // large enough): that means enough bits are available to determine there's an + // error, use Read to get such error status. + // NOTE: this advances the BitReader, a different one pointing back at the + // original bit position in the codestream must be created to use Read after + // this. + static bool CanRead(BitReader* reader, Fields* JXL_RESTRICT fields); + + static Status Write(const Fields& fields, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* aux_out); + + private: +}; + +// Different subclasses of Visitor are passed to implementations of Fields +// throughout their lifetime. Templates used to be used for this but dynamic +// polymorphism produces more compact executables than template reification did. 
+class Visitor { + public: + virtual ~Visitor() = default; + virtual Status Visit(Fields* fields, const char* visitor_name) = 0; + + virtual Status Bool(bool default_value, bool* JXL_RESTRICT value) = 0; + virtual Status U32(U32Enc, uint32_t, uint32_t*) = 0; + + // Helper to construct U32Enc from U32Distr. + Status U32(const U32Distr d0, const U32Distr d1, const U32Distr d2, + const U32Distr d3, const uint32_t default_value, + uint32_t* JXL_RESTRICT value) { + return U32(U32Enc(d0, d1, d2, d3), default_value, value); + } + + template + Status Enum(const EnumT default_value, EnumT* JXL_RESTRICT value) { + uint32_t u32 = static_cast(*value); + // 00 -> 0 + // 01 -> 1 + // 10xxxx -> 2..17 + // 11yyyyyy -> 18..81 + JXL_RETURN_IF_ERROR(U32(Val(0), Val(1), BitsOffset(4, 2), BitsOffset(6, 18), + static_cast(default_value), &u32)); + *value = static_cast(u32); + return EnumValid(*value); + } + + virtual Status Bits(size_t bits, uint32_t default_value, + uint32_t* JXL_RESTRICT value) = 0; + virtual Status U64(uint64_t default_value, uint64_t* JXL_RESTRICT value) = 0; + virtual Status F16(float default_value, float* JXL_RESTRICT value) = 0; + + // Returns whether VisitFields should visit some subsequent fields. + // "condition" is typically from prior fields, e.g. flags. + // Overridden by InitVisitor and MaxBitsVisitor. + virtual Status Conditional(bool condition) { return condition; } + + // Overridden by InitVisitor, AllDefaultVisitor and CanEncodeVisitor. + virtual Status AllDefault(const Fields& /*fields*/, + bool* JXL_RESTRICT all_default) { + JXL_RETURN_IF_ERROR(Bool(true, all_default)); + return *all_default; + } + + virtual void SetDefault(Fields* /*fields*/) { + // Do nothing by default, this is overridden by ReadVisitor. + } + + // Returns the result of visiting a nested Bundle. + // Overridden by InitVisitor. + virtual Status VisitNested(Fields* fields) { return Visit(fields, ""); } + + // Overridden by ReadVisitor. Enables dynamically-sized fields. 
+ virtual bool IsReading() const { return false; } + + virtual Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) = 0; + virtual Status EndExtensions() = 0; + + // For debugging + virtual const char* VisitorName() = 0; +}; + +} // namespace jxl + +#endif // LIB_JXL_FIELDS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields_test.cc new file mode 100644 index 0000000000..78d372dfb3 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/fields_test.cc @@ -0,0 +1,434 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/fields.h" + +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/common.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" + +namespace jxl { +namespace { + +// Ensures `value` round-trips and in exactly `expected_bits_written`. 
+void TestU32Coder(const uint32_t value, const size_t expected_bits_written) { + U32Coder coder; + const U32Enc enc(Val(0), Bits(4), Val(0x7FFFFFFF), Bits(32)); + + BitWriter writer; + BitWriter::Allotment allotment( + &writer, RoundUpBitsToByteMultiple(U32Coder::MaxEncodedBits(enc))); + + size_t precheck_pos; + EXPECT_TRUE(coder.CanEncode(enc, value, &precheck_pos)); + EXPECT_EQ(expected_bits_written, precheck_pos); + + EXPECT_TRUE(coder.Write(enc, value, &writer)); + EXPECT_EQ(expected_bits_written, writer.BitsWritten()); + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + + BitReader reader(writer.GetSpan()); + const uint32_t decoded_value = coder.Read(enc, &reader); + EXPECT_EQ(value, decoded_value); + EXPECT_TRUE(reader.Close()); +} + +TEST(FieldsTest, U32CoderTest) { + TestU32Coder(0, 2); + TestU32Coder(1, 6); + TestU32Coder(15, 6); + TestU32Coder(0x7FFFFFFF, 2); + TestU32Coder(128, 34); + TestU32Coder(0x7FFFFFFEu, 34); + TestU32Coder(0x80000000u, 34); + TestU32Coder(0xFFFFFFFFu, 34); +} + +void TestU64Coder(const uint64_t value, const size_t expected_bits_written) { + U64Coder coder; + + BitWriter writer; + BitWriter::Allotment allotment( + &writer, RoundUpBitsToByteMultiple(U64Coder::MaxEncodedBits())); + + size_t precheck_pos; + EXPECT_TRUE(coder.CanEncode(value, &precheck_pos)); + EXPECT_EQ(expected_bits_written, precheck_pos); + + EXPECT_TRUE(coder.Write(value, &writer)); + EXPECT_EQ(expected_bits_written, writer.BitsWritten()); + + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + + BitReader reader(writer.GetSpan()); + const uint64_t decoded_value = coder.Read(&reader); + EXPECT_EQ(value, decoded_value); + EXPECT_TRUE(reader.Close()); +} + +TEST(FieldsTest, U64CoderTest) { + // Values that should take 2 bits (selector 00): 0 + TestU64Coder(0, 2); + + // Values that should take 6 bits (2 for selector, 4 for value): 1..16 + TestU64Coder(1, 6); + TestU64Coder(2, 6); + TestU64Coder(8, 6); + 
TestU64Coder(15, 6); + TestU64Coder(16, 6); + + // Values that should take 10 bits (2 for selector, 8 for value): 17..272 + TestU64Coder(17, 10); + TestU64Coder(18, 10); + TestU64Coder(100, 10); + TestU64Coder(271, 10); + TestU64Coder(272, 10); + + // Values that should take 15 bits (2 for selector, 12 for value, 1 for varint + // end): (0)..273..4095 + TestU64Coder(273, 15); + TestU64Coder(274, 15); + TestU64Coder(1000, 15); + TestU64Coder(4094, 15); + TestU64Coder(4095, 15); + + // Take 24 bits (of which 20 actual value): (0)..4096..1048575 + TestU64Coder(4096, 24); + TestU64Coder(4097, 24); + TestU64Coder(10000, 24); + TestU64Coder(1048574, 24); + TestU64Coder(1048575, 24); + + // Take 33 bits (of which 28 actual value): (0)..1048576..268435455 + TestU64Coder(1048576, 33); + TestU64Coder(1048577, 33); + TestU64Coder(10000000, 33); + TestU64Coder(268435454, 33); + TestU64Coder(268435455, 33); + + // Take 42 bits (of which 36 actual value): (0)..268435456..68719476735 + TestU64Coder(268435456ull, 42); + TestU64Coder(268435457ull, 42); + TestU64Coder(1000000000ull, 42); + TestU64Coder(68719476734ull, 42); + TestU64Coder(68719476735ull, 42); + + // Take 51 bits (of which 44 actual value): (0)..68719476736..17592186044415 + TestU64Coder(68719476736ull, 51); + TestU64Coder(68719476737ull, 51); + TestU64Coder(1000000000000ull, 51); + TestU64Coder(17592186044414ull, 51); + TestU64Coder(17592186044415ull, 51); + + // Take 60 bits (of which 52 actual value): + // (0)..17592186044416..4503599627370495 + TestU64Coder(17592186044416ull, 60); + TestU64Coder(17592186044417ull, 60); + TestU64Coder(100000000000000ull, 60); + TestU64Coder(4503599627370494ull, 60); + TestU64Coder(4503599627370495ull, 60); + + // Take 69 bits (of which 60 actual value): + // (0)..4503599627370496..1152921504606846975 + TestU64Coder(4503599627370496ull, 69); + TestU64Coder(4503599627370497ull, 69); + TestU64Coder(10000000000000000ull, 69); + TestU64Coder(1152921504606846974ull, 69); + 
TestU64Coder(1152921504606846975ull, 69); + + // Take 73 bits (of which 64 actual value): + // (0)..1152921504606846976..18446744073709551615 + TestU64Coder(1152921504606846976ull, 73); + TestU64Coder(1152921504606846977ull, 73); + TestU64Coder(10000000000000000000ull, 73); + TestU64Coder(18446744073709551614ull, 73); + TestU64Coder(18446744073709551615ull, 73); +} + +Status TestF16Coder(const float value) { + F16Coder coder; + + size_t max_encoded_bits; + // It is not a fatal error if it can't be encoded. + if (!coder.CanEncode(value, &max_encoded_bits)) return false; + EXPECT_EQ(F16Coder::MaxEncodedBits(), max_encoded_bits); + + BitWriter writer; + BitWriter::Allotment allotment(&writer, + RoundUpBitsToByteMultiple(max_encoded_bits)); + + EXPECT_TRUE(coder.Write(value, &writer)); + EXPECT_EQ(F16Coder::MaxEncodedBits(), writer.BitsWritten()); + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, 0, nullptr); + + BitReader reader(writer.GetSpan()); + float decoded_value; + EXPECT_TRUE(coder.Read(&reader, &decoded_value)); + // All values we test can be represented exactly. + EXPECT_EQ(value, decoded_value); + EXPECT_TRUE(reader.Close()); + return true; +} + +TEST(FieldsTest, F16CoderTest) { + for (float sign : {-1.0f, 1.0f}) { + // (anything less than 1E-3 are subnormals) + for (float mag : {0.0f, 0.5f, 1.0f, 2.0f, 2.5f, 16.015625f, 1.0f / 4096, + 1.0f / 16384, 65504.0f}) { + EXPECT_TRUE(TestF16Coder(sign * mag)); + } + } + + // Out of range + EXPECT_FALSE(TestF16Coder(65504.01f)); + EXPECT_FALSE(TestF16Coder(-65505.0f)); +} + +// Ensures Read(Write()) returns the same fields. +TEST(FieldsTest, TestRoundtripSize) { + for (int i = 0; i < 8; i++) { + SizeHeader size; + ASSERT_TRUE(size.Set(123 + 77 * i, 7 + i)); + + size_t extension_bits = 999, total_bits = 999; // Initialize as garbage. 
+ ASSERT_TRUE(Bundle::CanEncode(size, &extension_bits, &total_bits)); + EXPECT_EQ(0, extension_bits); + + BitWriter writer; + ASSERT_TRUE(WriteSizeHeader(size, &writer, 0, nullptr)); + EXPECT_EQ(total_bits, writer.BitsWritten()); + writer.ZeroPadToByte(); + + SizeHeader size2; + BitReader reader(writer.GetSpan()); + ASSERT_TRUE(ReadSizeHeader(&reader, &size2)); + EXPECT_EQ(total_bits, reader.TotalBitsConsumed()); + EXPECT_TRUE(reader.Close()); + + EXPECT_EQ(size.xsize(), size2.xsize()); + EXPECT_EQ(size.ysize(), size2.ysize()); + } +} + +// Ensure all values can be reached by the encoding. +TEST(FieldsTest, TestCropRect) { + CodecMetadata metadata; + for (int32_t i = -1000; i < 19000; ++i) { + FrameHeader f(&metadata); + f.custom_size_or_origin = true; + f.frame_origin.x0 = i; + f.frame_origin.y0 = i; + f.frame_size.xsize = 1000 + i; + f.frame_size.ysize = 1000 + i; + size_t extension_bits = 0, total_bits = 0; + ASSERT_TRUE(Bundle::CanEncode(f, &extension_bits, &total_bits)); + EXPECT_EQ(0, extension_bits); + EXPECT_GE(total_bits, 9); + } +} +TEST(FieldsTest, TestPreview) { + // (div8 cannot represent 4360, but !div8 can go a little higher) + for (uint32_t i = 1; i < 4360; ++i) { + PreviewHeader p; + ASSERT_TRUE(p.Set(i, i)); + size_t extension_bits = 0, total_bits = 0; + ASSERT_TRUE(Bundle::CanEncode(p, &extension_bits, &total_bits)); + EXPECT_EQ(0, extension_bits); + EXPECT_GE(total_bits, 6); + } +} + +// Ensures Read(Write()) returns the same fields. +TEST(FieldsTest, TestRoundtripFrame) { + CodecMetadata metadata; + FrameHeader h(&metadata); + h.extensions = 0x800; + + size_t extension_bits = 999, total_bits = 999; // Initialize as garbage. 
+ ASSERT_TRUE(Bundle::CanEncode(h, &extension_bits, &total_bits)); + EXPECT_EQ(0, extension_bits); + BitWriter writer; + ASSERT_TRUE(WriteFrameHeader(h, &writer, nullptr)); + EXPECT_EQ(total_bits, writer.BitsWritten()); + writer.ZeroPadToByte(); + + FrameHeader h2(&metadata); + BitReader reader(writer.GetSpan()); + ASSERT_TRUE(ReadFrameHeader(&reader, &h2)); + EXPECT_EQ(total_bits, reader.TotalBitsConsumed()); + EXPECT_TRUE(reader.Close()); + + EXPECT_EQ(h.extensions, h2.extensions); + EXPECT_EQ(h.flags, h2.flags); +} + +#ifndef JXL_CRASH_ON_ERROR +// Ensure out-of-bounds values cause an error. +TEST(FieldsTest, TestOutOfRange) { + SizeHeader h; + ASSERT_TRUE(h.Set(0xFFFFFFFFull, 0xFFFFFFFFull)); + size_t extension_bits = 999, total_bits = 999; // Initialize as garbage. + ASSERT_FALSE(Bundle::CanEncode(h, &extension_bits, &total_bits)); +} +#endif + +// Bundle with three fields and an empty extensions section. It plays the role +// of the "old" format version in the compatibility tests below. +struct OldBundle : public Fields { + OldBundle() { Bundle::Init(this); } + const char* Name() const override { return "OldBundle"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Bits(2), Bits(3), Bits(4), 1, &old_small)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.125f, &old_f)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Bits(7), Bits(12), Bits(16), Bits(32), 0, &old_large)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + return visitor->EndExtensions(); + } + + uint32_t old_small; + float old_f; + uint32_t old_large; + uint64_t extensions; +}; + +// Visits the same three leading fields as OldBundle, then additional fields +// inside the extensions section, each group gated on one bit of `extensions`. +struct NewBundle : public Fields { + NewBundle() { Bundle::Init(this); } + const char* Name() const override { return "NewBundle"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Bits(2), Bits(3), Bits(4), 1, &old_small)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.125f, &old_f)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Bits(7), Bits(12), Bits(16), Bits(32), 0, &old_large)); + +
JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + if (visitor->Conditional(extensions & 1)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(2), Bits(2), Bits(3), Bits(4), 2, &new_small)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(-2.0f, &new_f)); + } + if (visitor->Conditional(extensions & 2)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Bits(9), Bits(12), Bits(16), Bits(32), 0, &new_large)); + } + return visitor->EndExtensions(); + } + + uint32_t old_small; + float old_f; + uint32_t old_large; + uint64_t extensions; + + // If extensions & 1 + uint32_t new_small = 2; + float new_f = -2.0f; + // If extensions & 2 + uint32_t new_large = 0; +}; + +// Writes an OldBundle followed by a 20-bit sentinel and reads it back as a +// NewBundle. The read must stop exactly at the sentinel, and the fields gated +// on extension bits must hold their default values. +TEST(FieldsTest, TestNewDecoderOldData) { + OldBundle old_bundle; + old_bundle.old_large = 123; + old_bundle.old_f = 3.75f; + old_bundle.extensions = 0; + + // Write to bit stream + const size_t kMaxOutBytes = 999; + BitWriter writer; + // Make sure values are initialized by code under test. + size_t extension_bits = 12345, total_bits = 12345; + ASSERT_TRUE(Bundle::CanEncode(old_bundle, &extension_bits, &total_bits)); + ASSERT_LE(total_bits, kMaxOutBytes * kBitsPerByte); + EXPECT_EQ(0, extension_bits); + AuxOut aux_out; + ASSERT_TRUE(Bundle::Write(old_bundle, &writer, kLayerHeader, &aux_out)); + + BitWriter::Allotment allotment(&writer, + kMaxOutBytes * kBitsPerByte - total_bits); + writer.Write(20, 0xA55A); // sentinel + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, kLayerHeader, nullptr); + + ASSERT_LE(writer.GetSpan().size(), kMaxOutBytes); + BitReader reader(writer.GetSpan()); + NewBundle new_bundle; + ASSERT_TRUE(Bundle::Read(&reader, &new_bundle)); + EXPECT_EQ(reader.TotalBitsConsumed(), + aux_out.layers[kLayerHeader].total_bits); + EXPECT_EQ(reader.ReadBits(20), 0xA55A); + EXPECT_TRUE(reader.Close()); + + // Old fields are the same in both + EXPECT_EQ(old_bundle.extensions, new_bundle.extensions); + EXPECT_EQ(old_bundle.old_small, new_bundle.old_small); +
EXPECT_EQ(old_bundle.old_f, new_bundle.old_f); + EXPECT_EQ(old_bundle.old_large, new_bundle.old_large); + // New fields match their defaults + EXPECT_EQ(2, new_bundle.new_small); + EXPECT_EQ(-2.0f, new_bundle.new_f); + EXPECT_EQ(0, new_bundle.new_large); +} + +// Writes a NewBundle with both extension bits set, followed by a 20-bit +// sentinel, and reads it back as an OldBundle. The read must consume exactly +// the bits that were written, so that the sentinel is the next value read. +TEST(FieldsTest, TestOldDecoderNewData) { + NewBundle new_bundle; + new_bundle.old_large = 123; + new_bundle.extensions = 3; + new_bundle.new_f = 999.0f; + new_bundle.new_large = 456; + + // Write to bit stream + constexpr size_t kMaxOutBytes = 999; + BitWriter writer; + // Make sure values are initialized by code under test. + size_t extension_bits = 12345, total_bits = 12345; + ASSERT_TRUE(Bundle::CanEncode(new_bundle, &extension_bits, &total_bits)); + EXPECT_NE(0, extension_bits); + AuxOut aux_out; + ASSERT_TRUE(Bundle::Write(new_bundle, &writer, kLayerHeader, &aux_out)); + ASSERT_LE(aux_out.layers[kLayerHeader].total_bits, + kMaxOutBytes * kBitsPerByte); + + BitWriter::Allotment allotment( + &writer, + kMaxOutBytes * kBitsPerByte - aux_out.layers[kLayerHeader].total_bits); + // Ensure Read skips the additional fields + writer.Write(20, 0xA55A); // sentinel + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, kLayerHeader, nullptr); + + BitReader reader(writer.GetSpan()); + OldBundle old_bundle; + ASSERT_TRUE(Bundle::Read(&reader, &old_bundle)); + EXPECT_EQ(reader.TotalBitsConsumed(), + aux_out.layers[kLayerHeader].total_bits); + EXPECT_EQ(reader.ReadBits(20), 0xA55A); + EXPECT_TRUE(reader.Close()); + + // Old fields are the same in both + EXPECT_EQ(new_bundle.extensions, old_bundle.extensions); + EXPECT_EQ(new_bundle.old_small, old_bundle.old_small); + EXPECT_EQ(new_bundle.old_f, old_bundle.old_f); + EXPECT_EQ(new_bundle.old_large, old_bundle.old_large); + // (Can't check new fields because old decoder doesn't know about them) +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc new
file mode 100644 index 0000000000..9cb62c1e94 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters.cc @@ -0,0 +1,112 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/filters.h" + +#include + +#include "lib/jxl/base/profiler.h" + +namespace jxl { + +// Allocates the sigma image (block dimensions plus kSigmaPadding on every +// side) when lf.epf_iters > 0, and computes the Gaborish weights when lf.gab +// is set. Propagates the error from GaborishWeights(). +Status FilterWeights::Init(const LoopFilter& lf, + const FrameDimensions& frame_dim) { + if (lf.epf_iters > 0) { + sigma = ImageF(frame_dim.xsize_blocks + 2 * kSigmaPadding, + frame_dim.ysize_blocks + 2 * kSigmaPadding); + } + if (lf.gab) { + JXL_RETURN_IF_ERROR(GaborishWeights(lf)); + } + return true; +} + +// Fills gab_weights with three weights per channel (a center weight of 1 and +// the two weights taken from `lf`), then scales each triple by +// 1 / (w0 + 4 * (w1 + w2)). Fails when the absolute value of that divisor is +// below kZeroEpsilon. +Status FilterWeights::GaborishWeights(const LoopFilter& lf) { + const float kZeroEpsilon = 1e-6; + + gab_weights[0] = 1; + gab_weights[1] = lf.gab_x_weight1; + gab_weights[2] = lf.gab_x_weight2; + gab_weights[3] = 1; + gab_weights[4] = lf.gab_y_weight1; + gab_weights[5] = lf.gab_y_weight2; + gab_weights[6] = 1; + gab_weights[7] = lf.gab_b_weight1; + gab_weights[8] = lf.gab_b_weight2; + // Normalize + for (size_t c = 0; c < 3; c++) { + const float div = gab_weights[3 * c] + + 4 * (gab_weights[3 * c + 1] + gab_weights[3 * c + 2]); + if (std::abs(div) < kZeroEpsilon) { + return JXL_FAILURE("Gaborish weights lead to near 0 unnormalized kernel"); + } + const float mul = 1.0f / div; + gab_weights[3 * c] *= mul; + gab_weights[3 * c + 1] *= mul; + gab_weights[3 * c + 2] *= mul; + } + return true; +} + +// Runs the filters of the chain, in order, for the row `y`. Before each filter +// `y` is reduced by that filter's border; the function returns as soon as the +// resulting row is below the rows available so far (rows_needed). +void FilterPipeline::ApplyFiltersRow(const LoopFilter& lf, + const FilterWeights& filter_weights, + ssize_t y) { + PROFILER_ZONE("Gaborish+EPF"); + JXL_DASSERT(num_filters != 0); // Must be initialized. + + JXL_ASSERT(y < static_cast(image_rect.ysize() + lf.Padding())); + + // The minimum value of the center row "y" needed to process the current + // filter.
+ ssize_t rows_needed = -static_cast(lf.Padding()); + + // We pass `image_rect.x0() - image_rect.x0() % kBlockDim` as the x0 for + // the row_sigma, so to go from an `x` value in the filter to the + // corresponding value in row_sigma we use the fact that we mapped + // image_rect.x0() in the original image to MaxLeftPadding(image_rect.x0()) in + // the input/output rows seen by the filters: + // x_in_sigma_row = + // ((x - (image_rect.x0() % kPaddingXRound) + image_rect.x0()) - + // (image_rect.x0() - image_rect.x0() % kBlockDim))) / kBlockDim + // x_in_sigma_row = + // x - image_rect.x0() % kPaddingXRound + image_rect.x0() % kBlockDim + // NOTE(review): this is an unsigned subtraction, so the value wraps around + // whenever the second remainder is larger than the first; presumably the + // filter implementations rely on modular arithmetic here -- confirm. + const size_t sigma_x_offset = + image_rect.x0() % kBlockDim - + image_rect.x0() % GroupBorderAssigner::kPaddingXRound; + + for (size_t i = 0; i < num_filters; i++) { + const FilterStep& filter = filters[i]; + + rows_needed += filter.filter_def.border; + + // After this "y" points to the rect row for the center of the filter. + y -= filter.filter_def.border; + if (y < rows_needed) return; + + // Apply filter to the given region. + FilterRows rows(filter.filter_def.border); + filter.set_input_rows(filter, &rows, y); + filter.set_output_rows(filter, &rows, y); + + // The "y" coordinate used for the sigma image in EPF1. Sigma is padded + // with kMaxFilterPadding (or kMaxFilterPadding/kBlockDim rows in sigma) + // above and below. + const size_t sigma_y = kMaxFilterPadding + image_rect.y0() + y; + // The offset to subtract to a "x" value in the filter to obtain the + // corresponding x in the sigma row.
+ if (compute_sigma) { + rows.SetSigma(filter_weights.sigma, sigma_y, + image_rect.x0() - image_rect.x0() % kBlockDim); + } + + filter.filter_def.apply(rows, lf, filter_weights, filter.filter_x0, + filter.filter_x1, sigma_x_offset, + sigma_y % kBlockDim); + } + + JXL_DASSERT(rows_needed == 0); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters.h new file mode 100644 index 0000000000..1dad66fc42 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters.h @@ -0,0 +1,348 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_FILTERS_H_ +#define LIB_JXL_FILTERS_H_ + +#include + +#include "lib/jxl/common.h" +#include "lib/jxl/dec_group_border.h" +#include "lib/jxl/filters_internal.h" +#include "lib/jxl/image.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/sanitizers.h" + +namespace jxl { + +struct FilterWeights { + // Initialize the FilterWeights for the passed LoopFilter and FrameDimensions. + // Returns an error if the weights are invalid. + Status Init(const LoopFilter& lf, const FrameDimensions& frame_dim); + + // Normalized weights for gaborish, in XYB order, each weight for Manhattan + // distance of 0, 1 and 2 respectively. + float gab_weights[9]; + + // Sigma values for EPF, if enabled. + // Note that, for speed reasons, this is actually kInvSigmaNum / sigma. + ImageF sigma; + + private: + // Fills and normalizes gab_weights from `lf`; returns an error when the + // normalization divisor is too close to zero. + Status GaborishWeights(const LoopFilter& lf); +}; + +static constexpr size_t kMaxFinalizeRectPadding = 9; + +// Line-based EPF only needs to keep in cache 21 lines of the image, so 256 is +// sufficient for everything to fit in the L2 cache. We add +// 2*RoundUpTo(kMaxFinalizeRectPadding, kBlockDim) pixels as we might have up to +// two extra borders on each side.
+constexpr size_t kApplyImageFeaturesTileDim = + 256 + 2 * RoundUpToBlockDim(kMaxFinalizeRectPadding); + +// The maximum row storage needed by the filtering pipeline. This is the sum of +// the number of input rows needed by each step. +constexpr size_t kTotalStorageRows = 7 + 5 + 3; // max is EPF0 + EPF1 + EPF2. + +// The maximum sum of all the borders in a chain of filters. +constexpr size_t kMaxFilterBorder = 1 * kBlockDim; + +// The maximum horizontal filter padding ever needed to apply a chain of +// filters. Intermediate storage must have at least as much padding on each +// left and right sides. This value must be a multiple of kBlockDim. +constexpr size_t kMaxFilterPadding = kMaxFilterBorder + kBlockDim; +static_assert(kMaxFilterPadding % kBlockDim == 0, + "kMaxFilterPadding must be a multiple of block size."); + +// Same as FilterBorder and FilterPadding but for Sigma. +constexpr size_t kSigmaBorder = kMaxFilterBorder / kBlockDim; +constexpr size_t kSigmaPadding = kMaxFilterPadding / kBlockDim; + +// Utility struct to define input/output rows of row-based loop filters. +constexpr size_t kMaxBorderSize = 3; +struct FilterRows { + // `border_size` is the number of rows above and below the center row that + // the filter reads; it must not exceed kMaxBorderSize. + explicit FilterRows(int border_size) : border_size_(border_size) { + JXL_DASSERT(border_size <= static_cast(kMaxBorderSize)); + } + + // Returns the input row of plane `c` that is `row` rows away from the center + // row (negative values are rows above it). + JXL_INLINE const float* GetInputRow(int row, size_t c) const { + // Check that row is within range.
+ JXL_DASSERT(-border_size_ <= row && row <= border_size_); + return rows_in_[c] + offsets_in_[kMaxBorderSize + row]; + } + + float* GetOutputRow(size_t c) const { return rows_out_[c]; } + + const float* GetSigmaRow() const { + JXL_DASSERT(row_sigma_ != nullptr); + return row_sigma_; + } + + // Stores the row-0 pointer of each plane of `in` and, for every i in + // [-border_size_, border_size_], the element offset of column x0 in row + // row_map(y0 + i) + y_offset, relative to that pointer. row_map is a RowMap + // constructed from (full_image_y_offset, image_ysize). + template + void SetInput(const Image3F& in, size_t y_offset, ssize_t y0, ssize_t x0, + ssize_t full_image_y_offset = 0, ssize_t image_ysize = 0) { + RowMap row_map(full_image_y_offset, image_ysize); + for (size_t c = 0; c < 3; c++) { + rows_in_[c] = in.ConstPlaneRow(c, 0); + } + for (int32_t i = -border_size_; i <= border_size_; i++) { + size_t y = row_map(y0 + i); + offsets_in_[i + kMaxBorderSize] = + static_cast((y + y_offset) * in.PixelsPerRow()) + x0; + } + } + + // Points the output row of each plane at column x0 of row + // RowMap()(y0) + y_offset of `out`. + template + void SetOutput(Image3F* out, size_t y_offset, ssize_t y0, ssize_t x0) { + size_t y = RowMap()(y0); + for (size_t c = 0; c < 3; c++) { + rows_out_[c] = out->PlaneRow(c, y + y_offset) + x0; + } + } + + // Sets the sigma row for the given y0, x0 input image position. Sigma images + // have one pixel per input image block, although they are padded with two + // blocks (pixels in sigma) on each one of the four sides. The (x0, y0) values + // should include this padding. + void SetSigma(const ImageF& sigma, size_t y0, size_t x0) { + JXL_DASSERT(x0 % kBlockDim == 0); + row_sigma_ = sigma.ConstRow(y0 / kBlockDim) + x0 / kBlockDim; + } + + private: + // Base pointer to each one of the planes. + const float* JXL_RESTRICT rows_in_[3]; + + // Offset to the pixel x0 at the different rows. offsets_in_[kMaxBorderSize] + // references the center row, regardless of the border_size_. Only the center + // row, border_size_ before and border_size_ after are initialized. The offset + // is relative to the base pointer in rows_in_. + ssize_t offsets_in_[2 * kMaxBorderSize + 1]; + + // Output row pointer of each plane, set by SetOutput(). + float* JXL_RESTRICT rows_out_[3]; + + // Sigma row set by SetSigma(); nullptr until then. + const float* JXL_RESTRICT row_sigma_{nullptr}; + + const int border_size_; +}; + +// Definition of a filter.
This specifies the function to be used to apply the +// filter and its row and column padding requirements. +struct FilterDefinition { + // Function to apply the filter to a given row. The filter constant parameters + // are passed in LoopFilter lf and filter_weights. `sigma_x_offset` is needed + // to offset the `x0` value so that it will cause correct accesses to + // rows.GetSigmaRow(): there is just one sigma value per 8 pixels, and if the + // image rectangle is not aligned to multiples of 8 pixels, we need to + // compensate for the difference between x0 and the image position modulo 8. + void (*apply)(const FilterRows& rows, const LoopFilter& lf, + const FilterWeights& filter_weights, size_t x0, size_t x1, + size_t sigma_x_offset, size_t image_y_mod_8); + + // Number of source image rows and cols before and after an input pixel needed + // to compute the output of the filter. For a 3x3 convolution this border will + // be only 1. + size_t border; +}; + +// A chain of filters to be applied to a source image. This instance must be +// initialized by the FilterPipelineInit() function before it can be used. +class FilterPipeline { + public: + // Sizes the intermediate storage for rects up to kApplyImageFeaturesTileDim + // pixels wide. + FilterPipeline() : FilterPipeline(kApplyImageFeaturesTileDim) {} + // Allocates `storage` with max_rect_xsize + 2 * kMaxFilterPadding + + // GroupBorderAssigner::kPaddingXRound columns and kTotalStorageRows rows. + explicit FilterPipeline(size_t max_rect_xsize) + : storage{max_rect_xsize + 2 * kMaxFilterPadding + + GroupBorderAssigner::kPaddingXRound, + kTotalStorageRows} { +#if MEMORY_SANITIZER + // The padding of the storage may be used uninitialized since we process + // multiple SIMD lanes at a time, aligned to a multiple of lanes. + // For example, in a hypothetical 3-step filter process where all filters + // use 1 pixel border the first filter needs to process 2 pixels more on + // each side than the requested rect.x0(), rect.xsize(), while the second + // filter needs to process 1 more pixel on each side, however for + // performance reasons both will process Lanes(df) more pixels on each + // side assuming this Lanes(df) value is more than one.
In that case the + // second filter will be using one pixel of uninitialized data to generate + // an output pixel that won't affect the final output but may cause msan + // failures. For this reason we initialize the padding region. + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < storage.ysize(); y++) { + float* row = storage.PlaneRow(c, y); + std::fill(row, row + kMaxFilterPadding, msan::kSanitizerSentinel); + std::fill(row + storage.xsize() - kMaxFilterPadding, + row + storage.xsize(), msan::kSanitizerSentinel); + } + } +#endif // MEMORY_SANITIZER + } + + // Copying is disabled; moving is allowed. + FilterPipeline(const FilterPipeline&) = delete; + FilterPipeline(FilterPipeline&&) = default; + + // Apply the filter chain to a given row. To apply the filter chain to a whole + // image this must be called for `image_rect.ysize() + 2 * total_border` + // values of `y`, in increasing order, starting from `y = -total_border`. + // `image_rect` is the value passed to FilterPipelineInit(). + void ApplyFiltersRow(const LoopFilter& lf, + const FilterWeights& filter_weights, ssize_t y); + + struct FilterStep { + // We don't map self.input_rect.x0() directly to kMaxFilterPadding in + // input/output row since they might have a different alignment, instead we + // keep the alignment modulo kPaddingXRound. + static size_t MaxLeftPadding(size_t image_rect_x0) { + return kMaxFilterPadding + + image_rect_x0 % GroupBorderAssigner::kPaddingXRound; + } + + // Sets the input of the filter step as an image region.
+ void SetInput(const Image3F* im_input, const Rect& input_rect, + const Rect& image_rect, size_t image_ysize) { + input = im_input; + this->input_rect = input_rect; + this->image_rect = image_rect; + this->image_ysize = image_ysize; + JXL_DASSERT(SameSize(input_rect, image_rect)); + set_input_rows = [](const FilterStep& self, FilterRows* rows, + ssize_t y0) { + // Offset from a row index of the input image to the matching row of the + // full frame; the mirroring row map needs it to mirror at the frame + // borders. + ssize_t full_image_y_offset = + static_cast<ssize_t>(self.image_rect.y0()) - + static_cast<ssize_t>(self.input_rect.y0()); + // input_rect.x0() is placed at column MaxLeftPadding(input_rect.x0()) of + // the rows seen by the filter. A single SetInput() call is enough: it + // sets every base pointer and every row offset of `rows`. + rows->SetInput<RowMapMirror>( + *(self.input), 0, self.input_rect.y0() + y0, + self.input_rect.x0() - MaxLeftPadding(self.input_rect.x0()), + full_image_y_offset, self.image_ysize); + }; + } + + // Sets the input of the filter step as the temporary cyclic storage with + // num_rows rows. The value image_rect.x0() during application will be + // mapped to "kMaxFilterPadding + alignment" regardless of the rect being + // processed. + template <size_t num_rows> + void SetInputCyclicStorage(const Image3F* storage, size_t offset_rows) { + input = storage; + input_y_offset = offset_rows; + set_input_rows = [](const FilterStep& self, FilterRows* rows, + ssize_t y0) { + // Rows wrap around modulo num_rows inside the band that starts at + // input_y_offset. + rows->SetInput<RowMapMod<num_rows>>(*(self.input), self.input_y_offset, + y0, 0); + }; + } + + // Sets the output of the filter step as the temporary cyclic storage with + // num_rows rows. The value image_rect.x0() during application will be + // mapped to "kMaxFilterPadding + alignment" regardless of the rect being + // processed. + template <size_t num_rows> + void SetOutputCyclicStorage(Image3F* storage, size_t offset_rows) { + output = storage; + output_y_offset = offset_rows; + set_output_rows = [](const FilterStep& self, FilterRows* rows, + ssize_t y0) { + // Rows wrap around modulo num_rows inside the band that starts at + // output_y_offset. + rows->SetOutput<RowMapMod<num_rows>>(self.output, self.output_y_offset, + y0, 0); + }; + } + + // Set the output of the filter step as the output image.
The value + // rect.x0() will be mapped to the same value in the output image. + void SetOutput(Image3F* im_output, const Rect& output_rect) { + output = im_output; + this->output_rect = output_rect; + set_output_rows = [](const FilterStep& self, FilterRows* rows, + ssize_t y0) { + rows->SetOutput(self.output, 0, self.output_rect.y0() + y0, + static_cast(self.output_rect.x0()) - + MaxLeftPadding(self.output_rect.x0())); + }; + } + + // The input and output image buffers for the current filter step. Note that + // the rows used from these images depend on the row mapping applied by the + // set_input_rows and set_output_rows functions. + const Image3F* input; + size_t input_y_offset = 0; + Image3F* output; + size_t output_y_offset = 0; + + // Input/output rect for the first/last steps of the filter. + Rect input_rect; + Rect output_rect; + + // Information to properly do RowMapMirror(). + Rect image_rect; + size_t image_ysize; + + // Functions that compute the list of rows needed to process a region for + // the given row and starting column. + void (*set_input_rows)(const FilterStep&, FilterRows* rows, ssize_t y0); + void (*set_output_rows)(const FilterStep&, FilterRows* rows, ssize_t y0); + + // Actual filter descriptor. + FilterDefinition filter_def; + + // Range of output pixels of the step. The filter [x0, x1) range is always + // a multiple of Lanes(df) and is large enough to contain the input and + // border needed by the next stages, but values outside that range may be + // undefined values. Coordinates are relative to the FilterRows pointers. + size_t filter_x0, filter_x1; + + // Number of extra horizontal pixels needed on each side of the output of + // this filter to produce the requested rect at the end of the chain. This + // value is always 0 for the last filter of the chain but it depends on the + // actual filter chain used in other cases.
+ size_t output_col_border; + }; + + // Appends `filter_def` as the next step of the chain. For every step after + // the first, a band of 2 * border + 1 rows of `storage` is reserved and set + // as the output of the previous step and the input of the new one. + // (`border` is presumably the template parameter of AddStep -- confirm.) + template + void AddStep(const FilterDefinition& filter_def) { + JXL_DASSERT(num_filters < kMaxFilters); + filters[num_filters].filter_def = filter_def; + + if (num_filters > 0) { + // If it is not the first step we need to set the previous step output to + // a portion of the cyclic storage. We only need as many rows as the + // input of the current stage. + constexpr size_t num_rows = 2 * border + 1; + filters[num_filters - 1].SetOutputCyclicStorage( + &storage, storage_rows_used); + filters[num_filters].SetInputCyclicStorage(&storage, + storage_rows_used); + storage_rows_used += num_rows; + JXL_DASSERT(storage_rows_used <= kTotalStorageRows); + } + num_filters++; + } + + // Tile storage for ApplyImageFeatures steps. Different groups of rows of this + // image are used for the intermediate steps. + Image3F storage; + size_t storage_rows_used = 0; + + static const size_t kMaxFilters = 4; + FilterStep filters[kMaxFilters]; + size_t num_filters = 0; + + // Whether we need to compute the sigma_row_ during application. + bool compute_sigma = false; + + // Rect to be processed in the image coordinates. This doesn't include any + // padding needed to produce the output. + Rect image_rect; + + // The total border needed to process this pipeline. + size_t total_border = 0; +}; + +} // namespace jxl + +#endif // LIB_JXL_FILTERS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters_internal.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters_internal.h new file mode 100644 index 0000000000..4ad90faaf2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters_internal.h @@ -0,0 +1,55 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +#ifndef LIB_JXL_FILTERS_INTERNAL_H_ +#define LIB_JXL_FILTERS_INTERNAL_H_ + +#include <stddef.h> + +#include "lib/jxl/base/status.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +// Maps a row to the range [0, image_ysize) mirroring it when outside the [0, +// image_ysize) range. The input row is offset by `full_image_y_offset`, i.e. +// row `y` corresponds to row `y + full_image_y_offset` in the full frame. +// The returned row is expressed in the same (offset) coordinates as the input. +struct RowMapMirror { + RowMapMirror(ssize_t full_image_y_offset, size_t image_ysize) + : full_image_y_offset_(full_image_y_offset), image_ysize_(image_ysize) {} + size_t operator()(ssize_t y) { + return Mirror(y + full_image_y_offset_, image_ysize_) - + full_image_y_offset_; + } + ssize_t full_image_y_offset_; + size_t image_ysize_; +}; + +// Maps a row in the range [-16, \inf) to a row number in the range [0, m) using +// the modulo operation. The two-argument constructor ignores its arguments; it +// lets RowMapMod be constructed with the same arguments as RowMapMirror. +template <size_t m> +struct RowMapMod { + RowMapMod() = default; + RowMapMod(ssize_t /*full_image_y_offset*/, size_t /*image_ysize*/) {} + size_t operator()(ssize_t y) { + JXL_DASSERT(y >= -16); + // The `m > 16 ? m : 16 * m` is evaluated at compile time and is a multiple + // of m of at least 16. This is to make sure that the left operand is + // non-negative for every y >= -16. + return static_cast<size_t>(y + (m > 16 ? m : 16 * m)) % m; + } +}; + +// Identity mapping. Maps a row in the range [0, ysize) to the same value. +struct RowMapId { + size_t operator()(ssize_t y) { + JXL_DASSERT(y >= 0); + return y; + } +}; + +} // namespace jxl + +#endif // LIB_JXL_FILTERS_INTERNAL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters_internal_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters_internal_test.cc new file mode 100644 index 0000000000..c47269d194 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/filters_internal_test.cc @@ -0,0 +1,50 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +#include "lib/jxl/filters_internal.h" + +#include "gtest/gtest.h" + +namespace jxl { + +class FiltersInternalTest : public ::testing::Test {}; + +// Test the mapping of rows using RowMapMod. +TEST(FiltersInternalTest, RowMapModTest) { + RowMapMod<5> m; + // Identity part: + EXPECT_EQ(0, m(0)); + EXPECT_EQ(4, m(4)); + + // Rows at or above the modulus wrap around. + EXPECT_EQ(0, m(5)); + EXPECT_EQ(1, m(11)); + + // Negative rows wrap around as well. + EXPECT_EQ(4, m(-1)); + EXPECT_EQ(2, m(-8)); +} + +// Test the implementation for mirroring of rows. +TEST(FiltersInternalTest, RowMapMirrorTest) { + RowMapMirror m(0, 10); // Image size of 10 rows. + + EXPECT_EQ(2, m(-3)); + EXPECT_EQ(1, m(-2)); + EXPECT_EQ(0, m(-1)); + + EXPECT_EQ(0, m(0)); + EXPECT_EQ(9, m(9)); + + EXPECT_EQ(9, m(10)); + EXPECT_EQ(8, m(11)); + EXPECT_EQ(7, m(12)); + + // It mirrors the rows to infinity. + EXPECT_EQ(1, m(21)); + EXPECT_EQ(1, m(41)); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc new file mode 100644 index 0000000000..bee1070350 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.cc @@ -0,0 +1,376 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +#include "lib/jxl/frame_header.h" + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/fields.h" + +namespace jxl { + +constexpr uint8_t YCbCrChromaSubsampling::kHShift[]; +constexpr uint8_t YCbCrChromaSubsampling::kVShift[]; + +// Visits `*blend_mode` as a U32 with `default_value` as the default. Returns +// an error if the resulting value is greater than 4. +static Status VisitBlendMode(Visitor* JXL_RESTRICT visitor, + BlendMode default_value, BlendMode* blend_mode) { + uint32_t encoded = static_cast(*blend_mode); + + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(static_cast(BlendMode::kReplace)), + Val(static_cast(BlendMode::kAdd)), + Val(static_cast(BlendMode::kBlend)), BitsOffset(2, 3), + static_cast(default_value), &encoded)); + if (encoded > 4) { + return JXL_FAILURE("Invalid blend_mode"); + } + *blend_mode = static_cast(encoded); + return true; +} + +// Visits `*frame_type` as a U32 whose four selectable values are the four +// FrameType enumerators listed below, with `default_value` as the default. +static Status VisitFrameType(Visitor* JXL_RESTRICT visitor, + FrameType default_value, FrameType* frame_type) { + uint32_t encoded = static_cast(*frame_type); + + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(static_cast(FrameType::kRegularFrame)), + Val(static_cast(FrameType::kDCFrame)), + Val(static_cast(FrameType::kReferenceOnly)), + Val(static_cast(FrameType::kSkipProgressive)), + static_cast(default_value), &encoded)); + *frame_type = static_cast(encoded); + return true; +} + +BlendingInfo::BlendingInfo() { Bundle::Init(this); } + +Status BlendingInfo::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR( + VisitBlendMode(visitor, BlendMode::kReplace, &mode)); + if (visitor->Conditional(nonserialized_num_extra_channels > 0 && + (mode == BlendMode::kBlend || + mode == BlendMode::kAlphaWeightedAdd))) { + // Up to 11 alpha channels for blending.
+ JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(0), Val(1), Val(2), BitsOffset(3, 3), 0, &alpha_channel)); + if (visitor->IsReading() && + alpha_channel >= nonserialized_num_extra_channels) { + return JXL_FAILURE("Invalid alpha channel for blending"); + } + } + if (visitor->Conditional((nonserialized_num_extra_channels > 0 && + (mode == BlendMode::kBlend || + mode == BlendMode::kAlphaWeightedAdd)) || + mode == BlendMode::kMul)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &clamp)); + } + // 'old' frame for blending. Only necessary if this is not a full frame, or + // blending is not kReplace. + if (visitor->Conditional(mode != BlendMode::kReplace || + nonserialized_is_partial_frame)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), Val(2), Val(3), 0, &source)); + } + return true; +} + +AnimationFrame::AnimationFrame(const CodecMetadata* metadata) + : nonserialized_metadata(metadata) { + Bundle::Init(this); +} +// `duration` and `timecode` are visited only when the metadata is present and +// reports have_animation / have_timecodes respectively. +Status AnimationFrame::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->Conditional(nonserialized_metadata != nullptr && + nonserialized_metadata->m.have_animation)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), Bits(8), Bits(32), 0, &duration)); + } + + if (visitor->Conditional( + nonserialized_metadata != nullptr && + nonserialized_metadata->m.animation.have_timecodes)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(32, 0, &timecode)); + } + return true; +} + +YCbCrChromaSubsampling::YCbCrChromaSubsampling() { Bundle::Init(this); } +Passes::Passes() { Bundle::Init(this); } +Status Passes::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), BitsOffset(3, 4), 1, &num_passes)); + JXL_ASSERT(num_passes <= kMaxNumPasses); // Cannot happen when reading + + if (visitor->Conditional(num_passes != 1)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(0), Val(1), Val(2), BitsOffset(1, 3), 0, &num_downsample)); + JXL_ASSERT(num_downsample <= 4); // 1,2,4,8 + if
(num_downsample > num_passes) { + return JXL_FAILURE("num_downsample %u > num_passes %u", num_downsample, + num_passes); + } + + for (uint32_t i = 0; i < num_passes - 1; i++) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 0, &shift[i])); + } + shift[num_passes - 1] = 0; + + for (uint32_t i = 0; i < num_downsample; ++i) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &downsample[i])); + } + for (uint32_t i = 0; i < num_downsample; ++i) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), Val(2), Bits(3), 0, &last_pass[i])); + if (last_pass[i] >= num_passes) { + return JXL_FAILURE("last_pass %u >= num_passes %u", last_pass[i], + num_passes); + } + } + } + + return true; +} +FrameHeader::FrameHeader(const CodecMetadata* metadata) + : animation_frame(metadata), nonserialized_metadata(metadata) { + Bundle::Init(this); +} + +// Reads `frame` from `reader` via Bundle::Read. +Status ReadFrameHeader(BitReader* JXL_RESTRICT reader, + FrameHeader* JXL_RESTRICT frame) { + return Bundle::Read(reader, frame); +} + +// Writes `frame` to `writer` via Bundle::Write, charged to kLayerHeader. +Status WriteFrameHeader(const FrameHeader& frame, + BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) { + return Bundle::Write(frame, writer, kLayerHeader, aux_out); +} + +Status FrameHeader::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + JXL_QUIET_RETURN_IF_ERROR( + VisitFrameType(visitor, FrameType::kRegularFrame, &frame_type)); + if (visitor->IsReading() && nonserialized_is_preview && + frame_type != kRegularFrame) { + return JXL_FAILURE("Only regular frame could be a preview"); + } + + // FrameEncoding. + bool is_modular = (encoding == FrameEncoding::kModular); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &is_modular)); + encoding = (is_modular ?
FrameEncoding::kModular : FrameEncoding::kVarDCT); + + // Flags + JXL_QUIET_RETURN_IF_ERROR(visitor->U64(0, &flags)); + + // Color transform + bool xyb_encoded = nonserialized_metadata == nullptr || + nonserialized_metadata->m.xyb_encoded; + + bool fp = nonserialized_metadata != nullptr && + nonserialized_metadata->m.bit_depth.floating_point_sample; + + if (xyb_encoded) { + if (is_modular && fp) { + return JXL_FAILURE( + "Floating point samples is not supported with XYB color encoding"); + } + color_transform = ColorTransform::kXYB; + } else { + // Alternate if kYCbCr. + bool alternate = color_transform == ColorTransform::kYCbCr; + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &alternate)); + color_transform = + (alternate ? ColorTransform::kYCbCr : ColorTransform::kNone); + } + + // Chroma subsampling for YCbCr, if no DC frame is used. + if (visitor->Conditional(color_transform == ColorTransform::kYCbCr && + ((flags & kUseDcFrame) == 0))) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&chroma_subsampling)); + } + + size_t num_extra_channels = + nonserialized_metadata != nullptr + ?
nonserialized_metadata->m.extra_channel_info.size() + : 0; + + // Upsampling + if (visitor->Conditional((flags & kUseDcFrame) == 0)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &upsampling)); + if (nonserialized_metadata != nullptr && + visitor->Conditional(num_extra_channels != 0)) { + const std::vector& extra_channels = + nonserialized_metadata->m.extra_channel_info; + extra_channel_upsampling.resize(extra_channels.size(), 1); + for (size_t i = 0; i < extra_channels.size(); ++i) { + uint32_t dim_shift = + nonserialized_metadata->m.extra_channel_info[i].dim_shift; + uint32_t& ec_upsampling = extra_channel_upsampling[i]; + // The value is visited with dim_shift removed and has it re-applied + // right after the visit. + ec_upsampling >>= dim_shift; + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &ec_upsampling)); + ec_upsampling <<= dim_shift; + if (ec_upsampling < upsampling) { + return JXL_FAILURE( + "EC upsampling (%u) < color upsampling (%u), which is invalid.", + ec_upsampling, upsampling); + } + if (ec_upsampling > 8) { + return JXL_FAILURE("EC upsampling too large (%u)", ec_upsampling); + } + } + } else { + extra_channel_upsampling.clear(); + } + } + + // Modular- or VarDCT-specific data. + if (visitor->Conditional(encoding == FrameEncoding::kModular)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 1, &group_size_shift)); + } + if (visitor->Conditional(encoding == FrameEncoding::kVarDCT && + color_transform == ColorTransform::kXYB)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 3, &x_qm_scale)); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 2, &b_qm_scale)); + } else { + x_qm_scale = b_qm_scale = 2; // noop + } + + // Not useful for kPatchSource + if (visitor->Conditional(frame_type != FrameType::kReferenceOnly)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&passes)); + } + + if (visitor->Conditional(frame_type == FrameType::kDCFrame)) { + // Up to 4 pyramid levels - for up to 16384x downsampling.
+ JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), Val(4), 1, &dc_level)); + } + if (frame_type != FrameType::kDCFrame) { + dc_level = 0; + } + + bool is_partial_frame = false; + if (visitor->Conditional(frame_type != FrameType::kDCFrame)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &custom_size_or_origin)); + if (visitor->Conditional(custom_size_or_origin)) { + const U32Enc enc(Bits(8), BitsOffset(11, 256), BitsOffset(14, 2304), + BitsOffset(30, 18688)); + // Frame offset, only if kRegularFrame or kSkipProgressive. + if (visitor->Conditional(frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive)) { + uint32_t ux0 = PackSigned(frame_origin.x0); + uint32_t uy0 = PackSigned(frame_origin.y0); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &ux0)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &uy0)); + frame_origin.x0 = UnpackSigned(ux0); + frame_origin.y0 = UnpackSigned(uy0); + } + // Frame size + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &frame_size.xsize)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &frame_size.ysize)); + int32_t image_xsize = default_xsize(); + int32_t image_ysize = default_ysize(); + // The frame is partial when its origin is positive or when it ends + // before the right/bottom edge of the default image size. + if (frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive) { + is_partial_frame |= frame_origin.x0 > 0; + is_partial_frame |= frame_origin.y0 > 0; + is_partial_frame |= (static_cast(frame_size.xsize) + + frame_origin.x0) < image_xsize; + is_partial_frame |= (static_cast(frame_size.ysize) + + frame_origin.y0) < image_ysize; + } + } + } + + // Blending info, animation info and whether this is the last frame or not.
+ if (visitor->Conditional(frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive)) { + blending_info.nonserialized_num_extra_channels = num_extra_channels; + blending_info.nonserialized_is_partial_frame = is_partial_frame; + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blending_info)); + bool replace_all = (blending_info.mode == BlendMode::kReplace); + extra_channel_blending_info.resize(num_extra_channels); + for (size_t i = 0; i < num_extra_channels; i++) { + auto& ec_blending_info = extra_channel_blending_info[i]; + ec_blending_info.nonserialized_is_partial_frame = is_partial_frame; + ec_blending_info.nonserialized_num_extra_channels = num_extra_channels; + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&ec_blending_info)); + replace_all &= (ec_blending_info.mode == BlendMode::kReplace); + } + if (visitor->IsReading() && nonserialized_is_preview) { + if (!replace_all || custom_size_or_origin) { + return JXL_FAILURE("Preview is not compatible with blending"); + } + } + if (visitor->Conditional(nonserialized_metadata != nullptr && + nonserialized_metadata->m.have_animation)) { + animation_frame.nonserialized_metadata = nonserialized_metadata; + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation_frame)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &is_last)); + } + if (frame_type != FrameType::kRegularFrame) { + is_last = false; + } + + // ID that can be used to refer to this frame. 0 for a non-zero-duration + // frame means that it will not be referenced. Not necessary for the last + // frame. + if (visitor->Conditional(frame_type != kDCFrame && !is_last)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), Val(2), Val(3), 0, &save_as_reference)); + } + + // If this frame is not blended on another frame post-color-transform, it may + // be stored for being referenced either before or after the color transform. + // If it is blended post-color-transform, it must be blended after.
It must + // also be blended after if this is a kRegular frame that does not cover the + // full frame, as samples outside the partial region are from a + // post-color-transform frame. + if (frame_type != FrameType::kDCFrame) { + if (visitor->Conditional(CanBeReferenced() && + blending_info.mode == BlendMode::kReplace && + !is_partial_frame && + (frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive))) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bool(false, &save_before_color_transform)); + } else if (visitor->Conditional(frame_type == FrameType::kReferenceOnly)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bool(true, &save_before_color_transform)); + // NOTE(review): nonserialized_metadata is dereferenced below without the + // nullptr check applied to it earlier in this function -- confirm it + // cannot be null on this path. + if (!save_before_color_transform && + (frame_size.xsize < nonserialized_metadata->xsize() || + frame_size.ysize < nonserialized_metadata->ysize() || + frame_origin.x0 != 0 || frame_origin.y0 != 0)) { + return JXL_FAILURE( + "non-patch reference frame with invalid crop: %zux%zu%+d%+d", + static_cast(frame_size.xsize), + static_cast(frame_size.ysize), + static_cast(frame_origin.x0), + static_cast(frame_origin.y0)); + } + } + } else { + save_before_color_transform = true; + } + + JXL_QUIET_RETURN_IF_ERROR(VisitNameString(visitor, &name)); + + loop_filter.nonserialized_is_modular = is_modular; + JXL_RETURN_IF_ERROR(visitor->VisitNested(&loop_filter)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + // Extensions: in chronological order of being added to the format. + return visitor->EndExtensions(); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.h new file mode 100644 index 0000000000..dab0267adf --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/frame_header.h @@ -0,0 +1,492 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+ +#ifndef LIB_JXL_FRAME_HEADER_H_ +#define LIB_JXL_FRAME_HEADER_H_ + +// Frame header with backward and forward-compatible extension capability and +// compressed integer fields. + +#include +#include + +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/coeff_order_fwd.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/loop_filter.h" + +namespace jxl { + +// Also used by extra channel names. +static inline Status VisitNameString(Visitor* JXL_RESTRICT visitor, + std::string* name) { + uint32_t name_length = static_cast(name->length()); + // Allows layer name lengths up to 1071 bytes + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Bits(4), BitsOffset(5, 16), + BitsOffset(10, 48), 0, &name_length)); + if (visitor->IsReading()) { + name->resize(name_length); + } + for (size_t i = 0; i < name_length; i++) { + uint32_t c = (*name)[i]; + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(8, 0, &c)); + (*name)[i] = static_cast(c); + } + return true; +} + +enum class FrameEncoding : uint32_t { + kVarDCT, + kModular, +}; + +enum class ColorTransform : uint32_t { + kXYB, // Values are encoded with XYB. May only be used if + // ImageBundle::xyb_encoded. + kNone, // Values are encoded according to the attached color profile. May + // only be used if !ImageBundle::xyb_encoded. + kYCbCr, // Values are encoded according to the attached color profile, but + // transformed to YCbCr. May only be used if + // !ImageBundle::xyb_encoded. 
+}; + +inline std::array JpegOrder(ColorTransform ct, bool is_gray) { + if (is_gray) { + return {0, 0, 0}; + } + JXL_ASSERT(ct != ColorTransform::kXYB); + if (ct == ColorTransform::kYCbCr) { + return {1, 0, 2}; + } else { + return {0, 1, 2}; + } +} + +struct YCbCrChromaSubsampling : public Fields { + YCbCrChromaSubsampling(); + const char* Name() const override { return "YCbCrChromaSubsampling"; } + size_t HShift(size_t c) const { return maxhs_ - kHShift[channel_mode_[c]]; } + size_t VShift(size_t c) const { return maxvs_ - kVShift[channel_mode_[c]]; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override { + // TODO(veluca): consider allowing 4x downsamples + for (size_t i = 0; i < 3; i++) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 0, &channel_mode_[i])); + } + Recompute(); + return true; + } + + uint8_t MaxHShift() const { return maxhs_; } + uint8_t MaxVShift() const { return maxvs_; } + + uint8_t RawHShift(size_t c) { return kHShift[channel_mode_[c]]; } + uint8_t RawVShift(size_t c) { return kVShift[channel_mode_[c]]; } + + // Uses JPEG channel order (Y, Cb, Cr). + Status Set(const uint8_t* hsample, const uint8_t* vsample) { + for (size_t c = 0; c < 3; c++) { + size_t cjpeg = c < 2 ? 
c ^ 1 : c; + size_t i = 0; + for (; i < 4; i++) { + if (1 << kHShift[i] == hsample[cjpeg] && + 1 << kVShift[i] == vsample[cjpeg]) { + channel_mode_[c] = i; + break; + } + } + if (i == 4) { + return JXL_FAILURE("Invalid subsample mode"); + } + } + Recompute(); + return true; + } + + bool Is444() const { + for (size_t c : {0, 2}) { + if (channel_mode_[c] != channel_mode_[1]) { + return false; + } + } + return true; + } + + bool Is420() const { + return channel_mode_[0] == 1 && channel_mode_[1] == 0 && + channel_mode_[2] == 1; + } + + bool Is422() const { + for (size_t c : {0, 2}) { + if (kHShift[channel_mode_[c]] == kHShift[channel_mode_[1]] + 1 && + kVShift[channel_mode_[c]] == kVShift[channel_mode_[1]]) { + return false; + } + } + return true; + } + + bool Is440() const { + for (size_t c : {0, 2}) { + if (kHShift[channel_mode_[c]] == kHShift[channel_mode_[1]] && + kVShift[channel_mode_[c]] == kVShift[channel_mode_[1]] + 1) { + return false; + } + } + return true; + } + + private: + void Recompute() { + maxhs_ = 0; + maxvs_ = 0; + for (size_t i = 0; i < 3; i++) { + maxhs_ = std::max(maxhs_, kHShift[channel_mode_[i]]); + maxvs_ = std::max(maxvs_, kVShift[channel_mode_[i]]); + } + } + static constexpr uint8_t kHShift[4] = {0, 1, 1, 0}; + static constexpr uint8_t kVShift[4] = {0, 1, 0, 1}; + uint32_t channel_mode_[3]; + uint8_t maxhs_; + uint8_t maxvs_; +}; + +// Indicates how to combine the current frame with a previously-saved one. Can +// be independently controlled for color and extra channels. Formulas are +// indicative and treat alpha as if it is in range 0.0-1.0. In descriptions +// below, alpha channel is the extra channel of type alpha used for blending +// according to the blend_channel, or fully opaque if there is no alpha channel. 
+// The blending specified here is used for performing blending *after* color +// transforms - in linear sRGB if blending a XYB-encoded frame on another +// XYB-encoded frame, in sRGB if blending a frame with kColorSpace == kSRGB, or +// in the original colorspace otherwise. Blending in XYB or YCbCr is done by +// using patches. +enum class BlendMode { + // The new values (in the crop) replace the old ones: sample = new + kReplace = 0, + // The new values (in the crop) get added to the old ones: sample = old + new + kAdd = 1, + // The new values (in the crop) replace the old ones if alpha>0: + // For the alpha channel that is used as source: + // alpha = old + new * (1 - old) + // For other channels if !alpha_associated: + // sample = ((1 - new_alpha) * old * old_alpha + new_alpha * new) / alpha + // For other channels if alpha_associated: + // sample = (1 - new_alpha) * old + new + // The alpha formula applies to the alpha used for the division in the other + // channels formula, and applies to the alpha channel itself if its + // blend_channel value matches itself. + kBlend = 2, + // The new values (in the crop) are added to the old ones if alpha>0: + // For the alpha channel that is used as source: + // sample = old + new * (1 - old) + // For other channels: sample = old + alpha * new + kAlphaWeightedAdd = 3, + // The new values (in the crop) get multiplied by the old ones: + // sample = old * new + // The range of the new value matters for multiplication purposes, and its + // nominal range of 0..1 is computed the same way as this is done for the + // alpha values in kBlend and kAlphaWeightedAdd. + // If using kMul as a blend mode for color channels, no color transform is + // performed on the current frame.
+ kMul = 4, +}; + +struct BlendingInfo : public Fields { + BlendingInfo(); + const char* Name() const override { return "BlendingInfo"; } + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + BlendMode mode; + // Which extra channel to use as alpha channel for blending, only encoded + // for blend modes that involve alpha and if there are more than 1 extra + // channels. + uint32_t alpha_channel; + // Clamp alpha or channel values to 0-1 range. + bool clamp; + // Frame ID to copy from (0-3). Only encoded if blend_mode is not kReplace. + uint32_t source; + + size_t nonserialized_num_extra_channels = 0; + bool nonserialized_is_partial_frame = false; +}; + +// Origin of the current frame. Only serialized for kRegularFrame and +// kSkipProgressive frames. +struct FrameOrigin { + int32_t x0, y0; // can be negative. +}; + +// Size of the current frame. +struct FrameSize { + uint32_t xsize, ysize; +}; + +// AnimationFrame defines duration of animation frames. +struct AnimationFrame : public Fields { + explicit AnimationFrame(const CodecMetadata* metadata); + const char* Name() const override { return "AnimationFrame"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // How long to wait [in ticks, see Animation{}] after rendering. + // May be 0 if the current frame serves as a foundation for another frame. + uint32_t duration; + + uint32_t timecode; // 0xHHMMSSFF + + // Must be set to the one ImageMetadata acting as the full codestream header, + // with correct xyb_encoded, list of extra channels, etc... + const CodecMetadata* nonserialized_metadata = nullptr; +}; + +// For decoding to lower resolutions. Only used for kRegular frames.
+struct Passes : public Fields { + Passes(); + const char* Name() const override { return "Passes"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + void GetDownsamplingBracket(size_t pass, int& minShift, int& maxShift) const { + maxShift = 2; + minShift = 0; + for (size_t i = 0;; i++) { + for (uint32_t j = 0; j < num_downsample; ++j) { + if (i <= last_pass[j]) { + if (downsample[j] == 8) minShift = 3; + if (downsample[j] == 4) minShift = 2; + if (downsample[j] == 2) minShift = 1; + if (downsample[j] == 1) minShift = 0; + } + } + if (i == num_passes - 1) minShift = 0; + if (i == pass) return; + maxShift = minShift - 1; + minShift = 0; + } + } + + uint32_t num_passes; // <= kMaxNumPasses + uint32_t num_downsample; // <= num_passes + + // Array of num_downsample pairs. downsample=1/last_pass=num_passes-1 and + // downsample=8/last_pass=0 need not be specified; they are implicit. + uint32_t downsample[kMaxNumPasses]; + uint32_t last_pass[kMaxNumPasses]; + // Array of shift values for each pass. It is implicitly assumed to be 0 for + // the last pass. + uint32_t shift[kMaxNumPasses]; +}; + +enum FrameType { + // A "regular" frame: might be a crop, and will be blended on a previous + // frame, if any, and displayed or blended in future frames. + kRegularFrame = 0, + // A DC frame: this frame is downsampled and will be *only* used as the DC of + // a future frame and, possibly, for previews. Cannot be cropped, blended, or + // referenced by patches or blending modes. Frames that *use* a DC frame + // cannot have non-default sizes either. + kDCFrame = 1, + // A PatchesSource frame: this frame will be only used as a source frame for + // taking patches. Can be cropped, but cannot have non-(0, 0) x0 and y0. + kReferenceOnly = 2, + // Same as kRegularFrame, but not used for progressive rendering. This also + // implies no early display of DC. + kSkipProgressive = 3, +}; + +// Image/frame := one or more of these, where the last has is_last = true.
+// Starts at a byte-aligned address "a"; the next pass starts at "a + size". +struct FrameHeader : public Fields { + // Optional postprocessing steps. These flags are the source of truth; + // Override must set/clear them rather than change their meaning. Values + // chosen such that typical flags == 0 (encoded in only two bits). + enum Flags { + // Often but not always off => low bit value: + + // Inject noise into decoded output. + kNoise = 1, + + // Overlay patches. + kPatches = 2, + + // 4, 8 = reserved for future sometimes-off + + // Overlay splines. + kSplines = 16, + + kUseDcFrame = 32, // Implies kSkipAdaptiveDCSmoothing. + + // 64 = reserved for future often-off + + // Almost always on => negated: + + kSkipAdaptiveDCSmoothing = 128, + }; + + explicit FrameHeader(const CodecMetadata* metadata); + const char* Name() const override { return "FrameHeader"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Sets/clears `flag` based upon `condition`. + void UpdateFlag(const bool condition, const uint64_t flag) { + if (condition) { + flags |= flag; + } else { + flags &= ~flag; + } + } + + // Returns true if this frame is supposed to be saved for future usage by + // other frames. + bool CanBeReferenced() const { + // DC frames cannot be referenced. The last frame cannot be referenced. A + // duration 0 frame makes little sense if it is not referenced. A + // non-duration 0 frame may or may not be referenced. + return !is_last && frame_type != FrameType::kDCFrame && + (animation_frame.duration == 0 || save_as_reference != 0); + } + + mutable bool all_default; + + // Always present + FrameEncoding encoding; + // Some versions of UBSAN complain in VisitFrameType if not initialized.
+ FrameType frame_type = FrameType::kRegularFrame; + + uint64_t flags; + + ColorTransform color_transform; + YCbCrChromaSubsampling chroma_subsampling; + + uint32_t group_size_shift; // only if encoding == kModular; + + uint32_t x_qm_scale; // only if VarDCT and color_transform == kXYB + uint32_t b_qm_scale; // only if VarDCT and color_transform == kXYB + + std::string name; + + // Skipped for kReferenceOnly. + Passes passes; + + // Skipped for kDCFrame + bool custom_size_or_origin; + FrameSize frame_size; + + // upsampling factors for color and extra channels. + // Upsampling is always performed before applying any inverse color transform. + // Skipped (1) if the kUseDcFrame flag is set + uint32_t upsampling; + std::vector extra_channel_upsampling; + + // Only for kRegularFrame and kSkipProgressive frames. + FrameOrigin frame_origin; + + BlendingInfo blending_info; + std::vector extra_channel_blending_info; + + // Animation info for this frame. + AnimationFrame animation_frame; + + // This is the last frame. + bool is_last; + + // ID to refer to this frame with. 0-3, not present if kDCFrame or is_last. + // 0 has a special meaning for kRegular frames of nonzero duration: it defines + // a frame that will not be referenced in the future. + uint32_t save_as_reference; + + // Whether to save this frame before or after the color transform. A frame + // that is saved before the color transform can only be used for blending + // through patches. On the contrary, a frame that is saved after the color + // transform can only be used for blending through blending modes. + // Irrelevant for extra channel blending. Can only be true if + // blending_info.mode == kReplace and this is not a partial kRegularFrame; if + // this is a DC frame, it is always true. + bool save_before_color_transform; + + uint32_t dc_level; // 1-4 if kDCFrame (0 otherwise). + + // Must be set to the one ImageMetadata acting as the full codestream header, + // with correct xyb_encoded, list of extra channels, etc...
+ const CodecMetadata* nonserialized_metadata = nullptr; + + // NOTE: This is ignored by AllDefault. + LoopFilter loop_filter; + + bool nonserialized_is_preview = false; + + size_t default_xsize() const { + if (!nonserialized_metadata) return 0; + if (nonserialized_is_preview) { + return nonserialized_metadata->m.preview_size.xsize(); + } + return nonserialized_metadata->xsize(); + } + + size_t default_ysize() const { + if (!nonserialized_metadata) return 0; + if (nonserialized_is_preview) { + return nonserialized_metadata->m.preview_size.ysize(); + } + return nonserialized_metadata->ysize(); + } + + FrameDimensions ToFrameDimensions() const { + size_t xsize = default_xsize(); + size_t ysize = default_ysize(); + + xsize = frame_size.xsize ? frame_size.xsize : xsize; + ysize = frame_size.ysize ? frame_size.ysize : ysize; + + if (dc_level != 0) { + xsize = DivCeil(xsize, 1 << (3 * dc_level)); + ysize = DivCeil(ysize, 1 << (3 * dc_level)); + } + + FrameDimensions frame_dim; + frame_dim.Set(xsize, ysize, group_size_shift, + chroma_subsampling.MaxHShift(), + chroma_subsampling.MaxVShift(), + encoding == FrameEncoding::kModular, upsampling); + return frame_dim; + } + + // True if a color transform should be applied to this frame. + bool needs_color_transform() const { + return !save_before_color_transform || + frame_type == FrameType::kRegularFrame || + frame_type == FrameType::kSkipProgressive; + } + + uint64_t extensions; +}; + +Status ReadFrameHeader(BitReader* JXL_RESTRICT reader, + FrameHeader* JXL_RESTRICT frame); + +Status WriteFrameHeader(const FrameHeader& frame, + BitWriter* JXL_RESTRICT writer, AuxOut* aux_out); + +// Shared by enc/dec. 5F and 13 are by far the most common for d1/2/4/8, 0 +// ensures low overhead for small images.
+static constexpr U32Enc kOrderEnc = + U32Enc(Val(0x5F), Val(0x13), Val(0), Bits(kNumOrders)); + +} // namespace jxl + +#endif // LIB_JXL_FRAME_HEADER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc new file mode 100644 index 0000000000..6a187c46eb --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.cc @@ -0,0 +1,70 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/gaborish.h" + +#include + +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +void GaborishInverse(Image3F* in_out, float mul, ThreadPool* pool) { + JXL_ASSERT(mul >= 0.0f); + + // Only an approximation. One or even two 3x3, and rank-1 (separable) 5x5 + // are insufficient. + constexpr float kGaborish[5] = { + -0.092359145662814029f, -0.039253623634014627f, 0.016176494530216929f, + 0.00083458437774987476f, 0.004512465323949319f, + }; + /* + better would be: + 1.0 - mul * (4 * (kGaborish[0] + kGaborish[1] + + kGaborish[2] + kGaborish[4]) + + 8 * (kGaborish[3])); + */ + WeightsSymmetric5 weights = {{HWY_REP4(1.0f)}, + {HWY_REP4(mul * kGaborish[0])}, + {HWY_REP4(mul * kGaborish[2])}, + {HWY_REP4(mul * kGaborish[1])}, + {HWY_REP4(mul * kGaborish[4])}, + {HWY_REP4(mul * kGaborish[3])}}; + double sum = static_cast(weights.c[0]); + sum += 4 * weights.r[0]; + sum += 4 * weights.R[0]; + sum += 4 * weights.d[0]; + sum += 4 * weights.D[0]; + sum += 8 * weights.L[0]; + const float normalize = static_cast(1.0 / sum); + for (size_t i = 0; i < 4; ++i) { + weights.c[i] *= normalize; + weights.r[i] *= normalize; + weights.R[i] *= normalize; + weights.d[i] *= normalize; + weights.D[i] *= normalize; + weights.L[i] *= normalize; + } + + // Reduce memory footprint by only allocating a single plane 
and swapping it + // into the output Image3F. Better still would be tiling. + // Note that we cannot *allocate* a plane, as doing so might cause Image3F to + // have planes of different stride. Instead, we copy one plane in a temporary + // image and reuse the existing planes of the in/out image. + ImageF temp = CopyImage(in_out->Plane(2)); + Symmetric5(in_out->Plane(0), Rect(*in_out), weights, pool, &in_out->Plane(2)); + Symmetric5(in_out->Plane(1), Rect(*in_out), weights, pool, &in_out->Plane(0)); + Symmetric5(temp, Rect(*in_out), weights, pool, &in_out->Plane(1)); + // Now planes are 1, 2, 0. + in_out->Plane(0).Swap(in_out->Plane(1)); + // 2 1 0 + in_out->Plane(0).Swap(in_out->Plane(2)); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.h new file mode 100644 index 0000000000..e43411dd9c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish.h @@ -0,0 +1,26 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_GABORISH_H_ +#define LIB_JXL_GABORISH_H_ + +// Linear smoothing (3x3 convolution) for deblocking without too much blur. + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// Used in encoder to reduce the impact of the decoder's smoothing. +// This is not exact. Works in-place to reduce memory use. +// The input is typically in XYB space. 
+void GaborishInverse(Image3F* in_out, float mul, ThreadPool* pool); + +} // namespace jxl + +#endif // LIB_JXL_GABORISH_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish_test.cc new file mode 100644 index 0000000000..55b17a060a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gaborish_test.cc @@ -0,0 +1,71 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/gaborish.h" + +#include + +#include "gtest/gtest.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" + +namespace jxl { +namespace { + +// weight1,2 need not be normalized. +WeightsSymmetric3 GaborishKernel(float weight1, float weight2) { + constexpr float weight0 = 1.0f; + + // Normalize + const float mul = 1.0f / (weight0 + 4 * (weight1 + weight2)); + const float w0 = weight0 * mul; + const float w1 = weight1 * mul; + const float w2 = weight2 * mul; + + const WeightsSymmetric3 w = {{HWY_REP4(w0)}, {HWY_REP4(w1)}, {HWY_REP4(w2)}}; + return w; +} + +void ConvolveGaborish(const ImageF& in, float weight1, float weight2, + ThreadPool* pool, ImageF* JXL_RESTRICT out) { + JXL_CHECK(SameSize(in, *out)); + Symmetric3(in, Rect(in), GaborishKernel(weight1, weight2), pool, out); +} + +void TestRoundTrip(const Image3F& in, float max_l1) { + Image3F fwd(in.xsize(), in.ysize()); + ThreadPool* null_pool = nullptr; + ConvolveGaborish(in.Plane(0), 0, 0, null_pool, &fwd.Plane(0)); + ConvolveGaborish(in.Plane(1), 0, 0, null_pool, &fwd.Plane(1)); + ConvolveGaborish(in.Plane(2), 0, 0, null_pool, &fwd.Plane(2)); + GaborishInverse(&fwd, 0.92718927264540152f, null_pool); + VerifyRelativeError(in, fwd, max_l1, 1E-4f); +} + +TEST(GaborishTest, TestZero) { + Image3F in(20, 20); + ZeroFillImage(&in); + TestRoundTrip(in, 0.0f); +} + +// Disabled: 
large difference. +#if 0 +TEST(GaborishTest, TestDirac) { + Image3F in(20, 20); + ZeroFillImage(&in); + in.PlaneRow(1, 10)[10] = 10.0f; + TestRoundTrip(in, 0.26f); +} +#endif + +TEST(GaborishTest, TestFlat) { + Image3F in(20, 20); + FillImage(1.0f, &in); + TestRoundTrip(in, 1E-5f); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gamma_correct_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gamma_correct_test.cc new file mode 100644 index 0000000000..d17ce899ba --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gamma_correct_test.cc @@ -0,0 +1,37 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include + +#include "gtest/gtest.h" +#include "lib/jxl/enc_gamma_correct.h" + +namespace jxl { +namespace { + +TEST(GammaCorrectTest, TestLinearToSrgbEdgeCases) { + EXPECT_EQ(0, LinearToSrgb8Direct(0.0)); + EXPECT_NEAR(0, LinearToSrgb8Direct(1E-6f), 2E-5); + EXPECT_EQ(0, LinearToSrgb8Direct(-1E-6f)); + EXPECT_EQ(0, LinearToSrgb8Direct(-1E6)); + EXPECT_NEAR(1, LinearToSrgb8Direct(1 - 1E-6f), 1E-5); + EXPECT_EQ(1, LinearToSrgb8Direct(1 + 1E-6f)); + EXPECT_EQ(1, LinearToSrgb8Direct(1E6)); +} + +TEST(GammaCorrectTest, TestRoundTrip) { + // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) + for (double linear = 0.0; linear <= 1.0; linear += 1E-7) { + const double srgb = LinearToSrgb8Direct(linear); + const double linear2 = Srgb8ToLinearDirect(srgb); + ASSERT_LT(std::abs(linear - linear2), 2E-13) + << "linear = " << linear << ", linear2 = " << linear2; + } +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc new file mode 100644 index 0000000000..b6550819ee --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.cc @@ -0,0 +1,616 
@@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/gauss_blur.h" + +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/gauss_blur.cc" +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/linalg.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Broadcast; +#if HWY_TARGET != HWY_SCALAR +using hwy::HWY_NAMESPACE::ShiftLeftLanes; +#endif +using hwy::HWY_NAMESPACE::Vec; + +void FastGaussian1D(const hwy::AlignedUniquePtr& rg, + const float* JXL_RESTRICT in, intptr_t width, + float* JXL_RESTRICT out) { + // Although the current output depends on the previous output, we can unroll + // up to 4x by precomputing up to fourth powers of the constants. Beyond that, + // numerical precision might become a problem. Macro because this is tested + // in #if alongside HWY_TARGET. 
+#define JXL_GAUSS_MAX_LANES 4 + using D = HWY_CAPPED(float, JXL_GAUSS_MAX_LANES); + using V = Vec; + const D d; + const V mul_in_1 = Load(d, rg->mul_in + 0 * 4); + const V mul_in_3 = Load(d, rg->mul_in + 1 * 4); + const V mul_in_5 = Load(d, rg->mul_in + 2 * 4); + const V mul_prev_1 = Load(d, rg->mul_prev + 0 * 4); + const V mul_prev_3 = Load(d, rg->mul_prev + 1 * 4); + const V mul_prev_5 = Load(d, rg->mul_prev + 2 * 4); + const V mul_prev2_1 = Load(d, rg->mul_prev2 + 0 * 4); + const V mul_prev2_3 = Load(d, rg->mul_prev2 + 1 * 4); + const V mul_prev2_5 = Load(d, rg->mul_prev2 + 2 * 4); + V prev_1 = Zero(d); + V prev_3 = Zero(d); + V prev_5 = Zero(d); + V prev2_1 = Zero(d); + V prev2_3 = Zero(d); + V prev2_5 = Zero(d); + + const intptr_t N = rg->radius; + + intptr_t n = -N + 1; + // Left side with bounds checks and only write output after n >= 0. + const intptr_t first_aligned = RoundUpTo(N + 1, Lanes(d)); + for (; n < std::min(first_aligned, width); ++n) { + const intptr_t left = n - N - 1; + const intptr_t right = n + N - 1; + const float left_val = left >= 0 ? in[left] : 0.0f; + const float right_val = right < width ? in[right] : 0.0f; + const V sum = Set(d, left_val + right_val); + + // (Only processing a single lane here, no need to broadcast) + V out_1 = sum * mul_in_1; + V out_3 = sum * mul_in_3; + V out_5 = sum * mul_in_5; + + out_1 = MulAdd(mul_prev2_1, prev2_1, out_1); + out_3 = MulAdd(mul_prev2_3, prev2_3, out_3); + out_5 = MulAdd(mul_prev2_5, prev2_5, out_5); + prev2_1 = prev_1; + prev2_3 = prev_3; + prev2_5 = prev_5; + + out_1 = MulAdd(mul_prev_1, prev_1, out_1); + out_3 = MulAdd(mul_prev_3, prev_3, out_3); + out_5 = MulAdd(mul_prev_5, prev_5, out_5); + prev_1 = out_1; + prev_3 = out_3; + prev_5 = out_5; + + if (n >= 0) { + out[n] = GetLane(out_1 + out_3 + out_5); + } + } + + // The above loop is effectively scalar but it is convenient to use the same + // prev/prev2 variables, so broadcast to each lane before the unrolled loop. 
+#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES > 1 + prev2_1 = Broadcast<0>(prev2_1); + prev2_3 = Broadcast<0>(prev2_3); + prev2_5 = Broadcast<0>(prev2_5); + prev_1 = Broadcast<0>(prev_1); + prev_3 = Broadcast<0>(prev_3); + prev_5 = Broadcast<0>(prev_5); +#endif + + // Unrolled, no bounds checking needed. + for (; n < width - N + 1 - (JXL_GAUSS_MAX_LANES - 1); n += Lanes(d)) { + const V sum = LoadU(d, in + n - N - 1) + LoadU(d, in + n + N - 1); + + // To get a vector of output(s), we multiply broadcasted vectors (of each + // input plus the two previous outputs) and add them all together. + // Incremental broadcasting and shifting is expected to be cheaper than + // horizontal adds or transposing 4x4 values because they run on a different + // port, concurrently with the FMA. + const V in0 = Broadcast<0>(sum); + V out_1 = in0 * mul_in_1; + V out_3 = in0 * mul_in_3; + V out_5 = in0 * mul_in_5; + +#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES >= 2 + const V in1 = Broadcast<1>(sum); + out_1 = MulAdd(ShiftLeftLanes<1>(mul_in_1), in1, out_1); + out_3 = MulAdd(ShiftLeftLanes<1>(mul_in_3), in1, out_3); + out_5 = MulAdd(ShiftLeftLanes<1>(mul_in_5), in1, out_5); + +#if JXL_GAUSS_MAX_LANES >= 4 + const V in2 = Broadcast<2>(sum); + out_1 = MulAdd(ShiftLeftLanes<2>(mul_in_1), in2, out_1); + out_3 = MulAdd(ShiftLeftLanes<2>(mul_in_3), in2, out_3); + out_5 = MulAdd(ShiftLeftLanes<2>(mul_in_5), in2, out_5); + + const V in3 = Broadcast<3>(sum); + out_1 = MulAdd(ShiftLeftLanes<3>(mul_in_1), in3, out_1); + out_3 = MulAdd(ShiftLeftLanes<3>(mul_in_3), in3, out_3); + out_5 = MulAdd(ShiftLeftLanes<3>(mul_in_5), in3, out_5); +#endif +#endif + + out_1 = MulAdd(mul_prev2_1, prev2_1, out_1); + out_3 = MulAdd(mul_prev2_3, prev2_3, out_3); + out_5 = MulAdd(mul_prev2_5, prev2_5, out_5); + + out_1 = MulAdd(mul_prev_1, prev_1, out_1); + out_3 = MulAdd(mul_prev_3, prev_3, out_3); + out_5 = MulAdd(mul_prev_5, prev_5, out_5); +#if HWY_TARGET == HWY_SCALAR || JXL_GAUSS_MAX_LANES == 1 + 
prev2_1 = prev_1; + prev2_3 = prev_3; + prev2_5 = prev_5; + prev_1 = out_1; + prev_3 = out_3; + prev_5 = out_5; +#else + prev2_1 = Broadcast(out_1); + prev2_3 = Broadcast(out_3); + prev2_5 = Broadcast(out_5); + prev_1 = Broadcast(out_1); + prev_3 = Broadcast(out_3); + prev_5 = Broadcast(out_5); +#endif + + Store(out_1 + out_3 + out_5, d, out + n); + } + + // Remainder handling with bounds checks + for (; n < width; ++n) { + const intptr_t left = n - N - 1; + const intptr_t right = n + N - 1; + const float left_val = left >= 0 ? in[left] : 0.0f; + const float right_val = right < width ? in[right] : 0.0f; + const V sum = Set(d, left_val + right_val); + + // (Only processing a single lane here, no need to broadcast) + V out_1 = sum * mul_in_1; + V out_3 = sum * mul_in_3; + V out_5 = sum * mul_in_5; + + out_1 = MulAdd(mul_prev2_1, prev2_1, out_1); + out_3 = MulAdd(mul_prev2_3, prev2_3, out_3); + out_5 = MulAdd(mul_prev2_5, prev2_5, out_5); + prev2_1 = prev_1; + prev2_3 = prev_3; + prev2_5 = prev_5; + + out_1 = MulAdd(mul_prev_1, prev_1, out_1); + out_3 = MulAdd(mul_prev_3, prev_3, out_3); + out_5 = MulAdd(mul_prev_5, prev_5, out_5); + prev_1 = out_1; + prev_3 = out_3; + prev_5 = out_5; + + out[n] = GetLane(out_1 + out_3 + out_5); + } +} + +// Ring buffer is for n, n-1, n-2; round up to 4 for faster modulo. +constexpr size_t kMod = 4; + +// Avoids an unnecessary store during warmup. +struct OutputNone { + template + void operator()(const V& /*unused*/, float* JXL_RESTRICT /*pos*/, + ptrdiff_t /*offset*/) const {} +}; + +// Common case: write output vectors in all VerticalBlock except warmup. +struct OutputStore { + template + void operator()(const V& out, float* JXL_RESTRICT pos, + ptrdiff_t offset) const { + // Stream helps for large images but is slower for images that fit in cache. + Store(out, HWY_FULL(float)(), pos + offset); + } +}; + +// At top/bottom borders, we don't have two inputs to load, so avoid addition. 
+// pos may even point to all zeros if the row is outside the input image. +class SingleInput { + public: + explicit SingleInput(const float* pos) : pos_(pos) {} + Vec operator()(const size_t offset) const { + return Load(HWY_FULL(float)(), pos_ + offset); + } + const float* pos_; +}; + +// In the middle of the image, we need to load from a row above and below, and +// return the sum. +class TwoInputs { + public: + TwoInputs(const float* pos1, const float* pos2) : pos1_(pos1), pos2_(pos2) {} + Vec operator()(const size_t offset) const { + const auto in1 = Load(HWY_FULL(float)(), pos1_ + offset); + const auto in2 = Load(HWY_FULL(float)(), pos2_ + offset); + return in1 + in2; + } + + private: + const float* pos1_; + const float* pos2_; +}; + +// Block := kVectors consecutive full vectors (one cache line except on the +// right boundary, where we can only rely on having one vector). Unrolling to +// the cache line size improves cache utilization. +template +void VerticalBlock(const V& d1_1, const V& d1_3, const V& d1_5, const V& n2_1, + const V& n2_3, const V& n2_5, const Input& input, + size_t& ctr, float* ring_buffer, const Output output, + float* JXL_RESTRICT out_pos) { + const HWY_FULL(float) d; + constexpr size_t kVN = 1;//MaxLanes(d); + // More cache-friendly to process an entirely cache line at a time + constexpr size_t kLanes = kVectors * kVN; + + float* JXL_RESTRICT y_1 = ring_buffer + 0 * kLanes * kMod; + float* JXL_RESTRICT y_3 = ring_buffer + 1 * kLanes * kMod; + float* JXL_RESTRICT y_5 = ring_buffer + 2 * kLanes * kMod; + + const size_t n_0 = (++ctr) % kMod; + const size_t n_1 = (ctr - 1) % kMod; + const size_t n_2 = (ctr - 2) % kMod; + + for (size_t idx_vec = 0; idx_vec < kVectors; ++idx_vec) { + const V sum = input(idx_vec * kVN); + + const V y_n1_1 = Load(d, y_1 + kLanes * n_1 + idx_vec * kVN); + const V y_n1_3 = Load(d, y_3 + kLanes * n_1 + idx_vec * kVN); + const V y_n1_5 = Load(d, y_5 + kLanes * n_1 + idx_vec * kVN); + const V y_n2_1 = Load(d, y_1 + 
kLanes * n_2 + idx_vec * kVN); + const V y_n2_3 = Load(d, y_3 + kLanes * n_2 + idx_vec * kVN); + const V y_n2_5 = Load(d, y_5 + kLanes * n_2 + idx_vec * kVN); + // (35) + const V y1 = MulAdd(n2_1, sum, NegMulSub(d1_1, y_n1_1, y_n2_1)); + const V y3 = MulAdd(n2_3, sum, NegMulSub(d1_3, y_n1_3, y_n2_3)); + const V y5 = MulAdd(n2_5, sum, NegMulSub(d1_5, y_n1_5, y_n2_5)); + Store(y1, d, y_1 + kLanes * n_0 + idx_vec * kVN); + Store(y3, d, y_3 + kLanes * n_0 + idx_vec * kVN); + Store(y5, d, y_5 + kLanes * n_0 + idx_vec * kVN); + output(y1 + y3 + y5, out_pos, idx_vec * kVN); + } + // NOTE: flushing cache line out_pos hurts performance - less so with + // clflushopt than clflush but still a significant slowdown. +} + +// Reads/writes one block (kVectors full vectors) in each row. +template +void VerticalStrip(const hwy::AlignedUniquePtr& rg, + const ImageF& in, const size_t x, ImageF* JXL_RESTRICT out) { + // We're iterating vertically, so use multiple full-length vectors (each lane + // is one column of row n). 
+ using D = HWY_FULL(float); + using V = Vec; + const D d; + constexpr size_t kVN = 1;//MaxLanes(d); + // More cache-friendly to process an entirely cache line at a time + constexpr size_t kLanes = kVectors * kVN; +#if HWY_TARGET == HWY_SCALAR + const V d1_1 = Set(d, rg->d1[0 * 4]); + const V d1_3 = Set(d, rg->d1[1 * 4]); + const V d1_5 = Set(d, rg->d1[2 * 4]); + const V n2_1 = Set(d, rg->n2[0 * 4]); + const V n2_3 = Set(d, rg->n2[1 * 4]); + const V n2_5 = Set(d, rg->n2[2 * 4]); +#else + const V d1_1 = LoadDup128(d, rg->d1 + 0 * 4); + const V d1_3 = LoadDup128(d, rg->d1 + 1 * 4); + const V d1_5 = LoadDup128(d, rg->d1 + 2 * 4); + const V n2_1 = LoadDup128(d, rg->n2 + 0 * 4); + const V n2_3 = LoadDup128(d, rg->n2 + 1 * 4); + const V n2_5 = LoadDup128(d, rg->n2 + 2 * 4); +#endif + + const size_t N = rg->radius; + const size_t ysize = in.ysize(); + + size_t ctr = 0; + HWY_ALIGN float ring_buffer[3 * kLanes * kMod] = {0}; + HWY_ALIGN static constexpr float zero[kLanes] = {0}; + + // Warmup: top is out of bounds (zero padded), bottom is usually in-bounds. + ssize_t n = -static_cast(N) + 1; + for (; n < 0; ++n) { + // bottom is always non-negative since n is initialized in -N + 1. + const size_t bottom = n + N - 1; + VerticalBlock( + d1_1, d1_3, d1_5, n2_1, n2_3, n2_5, + SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr, + ring_buffer, OutputNone(), nullptr); + } + JXL_DASSERT(n >= 0); + + // Start producing output; top is still out of bounds. + for (; static_cast(n) < std::min(N + 1, ysize); ++n) { + const size_t bottom = n + N - 1; + VerticalBlock( + d1_1, d1_3, d1_5, n2_1, n2_3, n2_5, + SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr, + ring_buffer, OutputStore(), out->Row(n) + x); + } + + // Interior outputs with prefetching and without bounds checks. 
+ constexpr size_t kPrefetchRows = 8; + for (; n < static_cast(ysize - N + 1 - kPrefetchRows); ++n) { + const size_t top = n - N - 1; + const size_t bottom = n + N - 1; + VerticalBlock( + d1_1, d1_3, d1_5, n2_1, n2_3, n2_5, + TwoInputs(in.ConstRow(top) + x, in.ConstRow(bottom) + x), ctr, + ring_buffer, OutputStore(), out->Row(n) + x); + hwy::Prefetch(in.ConstRow(top + kPrefetchRows) + x); + hwy::Prefetch(in.ConstRow(bottom + kPrefetchRows) + x); + } + + // Bottom border without prefetching and with bounds checks. + for (; static_cast(n) < ysize; ++n) { + const size_t top = n - N - 1; + const size_t bottom = n + N - 1; + VerticalBlock( + d1_1, d1_3, d1_5, n2_1, n2_3, n2_5, + TwoInputs(in.ConstRow(top) + x, + bottom < ysize ? in.ConstRow(bottom) + x : zero), + ctr, ring_buffer, OutputStore(), out->Row(n) + x); + } +} + +// Apply 1D vertical scan to multiple columns (one per vector lane). +// Not yet parallelized. +void FastGaussianVertical(const hwy::AlignedUniquePtr& rg, + const ImageF& in, ThreadPool* /*pool*/, + ImageF* JXL_RESTRICT out) { + PROFILER_FUNC; + JXL_CHECK(SameSize(in, *out)); + + constexpr size_t kCacheLineLanes = 64 / sizeof(float); + constexpr size_t kVN = MaxLanes(HWY_FULL(float)()); + constexpr size_t kCacheLineVectors = kCacheLineLanes / kVN; + + size_t x = 0; + for (; x + kCacheLineLanes <= in.xsize(); x += kCacheLineLanes) { + VerticalStrip(rg, in, x, out); + } + for (; x < in.xsize(); x += kVN) { + VerticalStrip<1>(rg, in, x, out); + } +} + +// TODO(veluca): consider replacing with FastGaussian. 
+ImageF ConvolveXSampleAndTranspose(const ImageF& in, + const std::vector& kernel, + const size_t res) { + JXL_ASSERT(kernel.size() % 2 == 1); + JXL_ASSERT(in.xsize() % res == 0); + const size_t offset = res / 2; + const size_t out_xsize = in.xsize() / res; + ImageF out(in.ysize(), out_xsize); + const int r = kernel.size() / 2; + HWY_FULL(float) df; + std::vector row_tmp(in.xsize() + 2 * r + Lanes(df)); + float* const JXL_RESTRICT rowp = &row_tmp[r]; + std::vector padded_k = kernel; + padded_k.resize(padded_k.size() + Lanes(df)); + const float* const kernelp = &padded_k[r]; + for (size_t y = 0; y < in.ysize(); ++y) { + ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r); + size_t x = offset, ox = 0; + for (; x < static_cast(r) && x < in.xsize(); x += res, ++ox) { + float sum = 0.0f; + for (int i = -r; i <= r; ++i) { + sum += rowp[std::max( + 0, std::min(static_cast(x) + i, in.xsize()))] * + kernelp[i]; + } + out.Row(ox)[y] = sum; + } + for (; x + r < in.xsize(); x += res, ++ox) { + auto sum = Zero(df); + for (int i = -r; i <= r; i += Lanes(df)) { + sum = MulAdd(LoadU(df, rowp + x + i), LoadU(df, kernelp + i), sum); + } + out.Row(ox)[y] = GetLane(SumOfLanes(sum)); + } + for (; x < in.xsize(); x += res, ++ox) { + float sum = 0.0f; + for (int i = -r; i <= r; ++i) { + sum += rowp[std::max( + 0, std::min(static_cast(x) + i, in.xsize()))] * + kernelp[i]; + } + out.Row(ox)[y] = sum; + } + } + return out; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(FastGaussian1D); +HWY_EXPORT(ConvolveXSampleAndTranspose); +void FastGaussian1D(const hwy::AlignedUniquePtr& rg, + const float* JXL_RESTRICT in, intptr_t width, + float* JXL_RESTRICT out) { + return HWY_DYNAMIC_DISPATCH(FastGaussian1D)(rg, in, width, out); +} + +HWY_EXPORT(FastGaussianVertical); // Local function. 
+
+// Copies `xsize` samples from row_in to row_out and fills `radius` samples on
+// each side of row_out (indices -radius..-1 and xsize..xsize+radius-1) by
+// mirroring row_in about its first/last sample; mirrored indices are clamped
+// to [0, xsize - 1].
+void ExtrapolateBorders(const float* const JXL_RESTRICT row_in,
+                        float* const JXL_RESTRICT row_out, const int xsize,
+                        const int radius) {
+  const int lastcol = xsize - 1;
+  for (int x = 1; x <= radius; ++x) {
+    row_out[-x] = row_in[std::min(x, xsize - 1)];
+  }
+  memcpy(row_out, row_in, xsize * sizeof(row_out[0]));
+  for (int x = 1; x <= radius; ++x) {
+    row_out[lastcol + x] = row_in[std::max(0, lastcol - x)];
+  }
+}
+
+// Dispatches to the SIMD implementation selected at runtime.
+ImageF ConvolveXSampleAndTranspose(const ImageF& in,
+                                   const std::vector<float>& kernel,
+                                   const size_t res) {
+  return HWY_DYNAMIC_DISPATCH(ConvolveXSampleAndTranspose)(in, kernel, res);
+}
+
+// Three-plane overload: processes each plane independently.
+Image3F ConvolveXSampleAndTranspose(const Image3F& in,
+                                    const std::vector<float>& kernel,
+                                    const size_t res) {
+  return Image3F(ConvolveXSampleAndTranspose(in.Plane(0), kernel, res),
+                 ConvolveXSampleAndTranspose(in.Plane(1), kernel, res),
+                 ConvolveXSampleAndTranspose(in.Plane(2), kernel, res));
+}
+
+// Two transposing passes: the second pass filters the other axis and
+// transposes back.
+ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
+                         const size_t res) {
+  ImageF tmp = ConvolveXSampleAndTranspose(in, kernel, res);
+  return ConvolveXSampleAndTranspose(tmp, kernel, res);
+}
+
+// Implements "Recursive Implementation of the Gaussian Filter Using Truncated
+// Cosine Functions" by Charalampidis [2016].
+// Returns an aligned RecursiveGaussian filled with the filter coefficients for
+// `sigma`. Equation numbers in the comments below refer to that paper.
+hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma) {
+  PROFILER_FUNC;
+  auto rg = hwy::MakeUniqueAligned<RecursiveGaussian>();
+  constexpr double kPi = 3.141592653589793238;
+
+  const double radius = roundf(3.2795 * sigma + 0.2546);  // (57), "N"
+
+  // Table I, first row
+  const double pi_div_2r = kPi / (2.0 * radius);
+  const double omega[3] = {pi_div_2r, 3.0 * pi_div_2r, 5.0 * pi_div_2r};
+
+  // (37), k={1,3,5}
+  const double p_1 = +1.0 / std::tan(0.5 * omega[0]);
+  const double p_3 = -1.0 / std::tan(0.5 * omega[1]);
+  const double p_5 = +1.0 / std::tan(0.5 * omega[2]);
+
+  // (44), k={1,3,5}
+  const double r_1 = +p_1 * p_1 / std::sin(omega[0]);
+  const double r_3 = -p_3 * p_3 / std::sin(omega[1]);
+  const double r_5 = +p_5 * p_5 / std::sin(omega[2]);
+
+  // (50), k={1,3,5}
+  const double neg_half_sigma2 = -0.5 * sigma * sigma;
+  const double recip_radius = 1.0 / radius;
+  double rho[3];
+  for (size_t i = 0; i < 3; ++i) {
+    rho[i] = std::exp(neg_half_sigma2 * omega[i] * omega[i]) * recip_radius;
+  }
+
+  // second part of (52), k1,k2 = 1,3; 3,5; 5,1
+  const double D_13 = p_1 * r_3 - r_1 * p_3;
+  const double D_35 = p_3 * r_5 - r_3 * p_5;
+  const double D_51 = p_5 * r_1 - r_5 * p_1;
+
+  // (52), k=5
+  const double recip_d13 = 1.0 / D_13;
+  const double zeta_15 = D_35 * recip_d13;
+  const double zeta_35 = D_51 * recip_d13;
+
+  double A[9] = {p_1,     p_3,     p_5,  //
+                 r_1,     r_3,     r_5,  //  (56)
+                 zeta_15, zeta_35, 1};
+  JXL_CHECK(Inv3x3Matrix(A));
+  const double gamma[3] = {1, radius * radius - sigma * sigma,  // (55)
+                           zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2]};
+  double beta[3];
+  MatMul(A, gamma, 3, 3, 1, beta);  // (53)
+
+  // Sanity check: correctly solved for beta (IIR filter weights are normalized)
+  const double sum = beta[0] * p_1 + beta[1] * p_3 + beta[2] * p_5;  // (39)
+  JXL_ASSERT(std::abs(sum - 1) < 1E-12);
+  (void)sum;
+
+  // NOTE(review): cast type reconstructed; confirm against upstream.
+  rg->radius = static_cast<int>(radius);
+
+  double n2[3];
+  double d1[3];
+  for (size_t i = 0; i < 3; ++i) {
+    n2[i] = -beta[i] * std::cos(omega[i] * (radius + 1.0));  // (33)
+    d1[i] = -2.0 * std::cos(omega[i]);                       // (33)
+
+    // Each coefficient is replicated into four consecutive floats.
+    for (size_t lane = 0; lane < 4; ++lane) {
+      rg->n2[4 * i + lane] = static_cast<float>(n2[i]);
+      rg->d1[4 * i + lane] = static_cast<float>(d1[i]);
+    }
+
+    const double d_2 = d1[i] * d1[i];
+
+    // Obtained by expanding (35) for four consecutive outputs via sympy:
+    // n, d, p, pp = symbols('n d p pp')
+    // i0, i1, i2, i3 = symbols('i0 i1 i2 i3')
+    // o0, o1, o2, o3 = symbols('o0 o1 o2 o3')
+    // o0 = n*i0 - d*p - pp
+    // o1 = n*i1 - d*o0 - p
+    // o2 = n*i2 - d*o1 - o0
+    // o3 = n*i3 - d*o2 - o1
+    // Then expand(o3) and gather terms for p(prev), pp(prev2) etc.
+    rg->mul_prev[4 * i + 0] = -d1[i];
+    rg->mul_prev[4 * i + 1] = d_2 - 1.0;
+    rg->mul_prev[4 * i + 2] = -d_2 * d1[i] + 2.0 * d1[i];
+    rg->mul_prev[4 * i + 3] = d_2 * d_2 - 3.0 * d_2 + 1.0;
+    rg->mul_prev2[4 * i + 0] = -1.0;
+    rg->mul_prev2[4 * i + 1] = d1[i];
+    rg->mul_prev2[4 * i + 2] = -d_2 + 1.0;
+    rg->mul_prev2[4 * i + 3] = d_2 * d1[i] - 2.0 * d1[i];
+    rg->mul_in[4 * i + 0] = n2[i];
+    rg->mul_in[4 * i + 1] = -d1[i] * n2[i];
+    rg->mul_in[4 * i + 2] = d_2 * n2[i] - n2[i];
+    rg->mul_in[4 * i + 3] = -d_2 * d1[i] * n2[i] + 2.0 * d1[i] * n2[i];
+  }
+  return rg;
+}
+
+namespace {
+
+// Apply 1D horizontal scan to each row.
+void FastGaussianHorizontal(const hwy::AlignedUniquePtr& rg, + const ImageF& in, ThreadPool* pool, + ImageF* JXL_RESTRICT out) { + PROFILER_FUNC; + JXL_CHECK(SameSize(in, *out)); + + const intptr_t xsize = in.xsize(); + RunOnPool( + pool, 0, in.ysize(), ThreadPool::SkipInit(), + [&](const int task, const int /*thread*/) { + const size_t y = task; + const float* row_in = in.ConstRow(y); + float* JXL_RESTRICT row_out = out->Row(y); + FastGaussian1D(rg, row_in, xsize, row_out); + }, + "FastGaussianHorizontal"); +} + +} // namespace + +void FastGaussian(const hwy::AlignedUniquePtr& rg, + const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp, + ImageF* JXL_RESTRICT out) { + FastGaussianHorizontal(rg, in, pool, temp); + HWY_DYNAMIC_DISPATCH(FastGaussianVertical)(rg, *temp, pool, out); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.h new file mode 100644 index 0000000000..fb4741f03a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur.h @@ -0,0 +1,94 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_GAUSS_BLUR_H_ +#define LIB_JXL_GAUSS_BLUR_H_ + +#include + +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +std::vector GaussianKernel(int radius, T sigma) { + JXL_ASSERT(sigma > 0.0); + std::vector kernel(2 * radius + 1); + const T scaler = -1.0 / (2 * sigma * sigma); + double sum = 0.0; + for (int i = -radius; i <= radius; ++i) { + const T val = std::exp(scaler * i * i); + kernel[i + radius] = val; + sum += val; + } + for (size_t i = 0; i < kernel.size(); ++i) { + kernel[i] /= sum; + } + return kernel; +} + +// All convolution functions below apply mirroring of the input on the borders +// in the following way: +// +// input: [a0 a1 a2 ... aN] +// mirrored input: [aR ... a1 | a0 a1 a2 .... aN | aN-1 ... aN-R] +// +// where R is the radius of the kernel (i.e. kernel size is 2*R+1). + +// REQUIRES: in.xsize() and in.ysize() are integer multiples of res. +ImageF ConvolveAndSample(const ImageF& in, const std::vector& kernel, + const size_t res); + +// Private, used by test. +void ExtrapolateBorders(const float* const JXL_RESTRICT row_in, + float* const JXL_RESTRICT row_out, const int xsize, + const int radius); + +// Only for use by CreateRecursiveGaussian and FastGaussian*. +#pragma pack(push, 1) +struct RecursiveGaussian { + // For k={1,3,5} in that order, each broadcasted 4x for LoadDup128. Used only + // for vertical passes. + float n2[3 * 4]; + float d1[3 * 4]; + + // We unroll horizontal passes 4x - one output per lane. These are each lane's + // multiplier for the previous output (relative to the first of the four + // outputs). Indexing: 4 * 0..2 (for {1,3,5}) + 0..3 for the lane index. + float mul_prev[3 * 4]; + // Ditto for the second to last output. + float mul_prev2[3 * 4]; + + // We multiply a vector of inputs 0..3 by a vector shifted from this array. 
+ // in=0 uses all 4 (nonzero) terms; for in=3, the lower three lanes are 0. + float mul_in[3 * 4]; + + size_t radius; +}; +#pragma pack(pop) + +// Precomputation for FastGaussian*; users may use the same pointer/storage in +// subsequent calls to FastGaussian* with the same sigma. +hwy::AlignedUniquePtr CreateRecursiveGaussian(double sigma); + +// 1D Gaussian with zero-pad boundary handling and runtime independent of sigma. +void FastGaussian1D(const hwy::AlignedUniquePtr& rg, + const float* JXL_RESTRICT in, intptr_t width, + float* JXL_RESTRICT out); + +// 2D Gaussian with zero-pad boundary handling and runtime independent of sigma. +void FastGaussian(const hwy::AlignedUniquePtr& rg, + const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp, + ImageF* JXL_RESTRICT out); + +} // namespace jxl + +#endif // LIB_JXL_GAUSS_BLUR_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur_test.cc new file mode 100644 index 0000000000..cdde77e1ff --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gauss_blur_test.cc @@ -0,0 +1,610 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/gauss_blur.h" + +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/extras/time.h" +#include "lib/jxl/base/robust_statistics.h" +#include "lib/jxl/convolve.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" + +namespace jxl { + +bool NearEdge(const int64_t width, const int64_t peak) { + // When around 3*sigma from the edge, there is negligible truncation. + return peak < 10 || peak > width - 10; +} + +// Follow the curve downwards by scanning right from `peak` and verifying +// identical values at the same offset to the left. 
+void VerifySymmetric(const int64_t width, const int64_t peak, + const float* out) { + const double tolerance = NearEdge(width, peak) ? 0.015 : 6E-7; + for (int64_t i = 1;; ++i) { + // Stop if we passed either end of the array + if (peak - i < 0 || peak + i >= width) break; + EXPECT_GT(out[peak + i - 1] + tolerance, out[peak + i]); // descending + EXPECT_NEAR(out[peak - i], out[peak + i], tolerance); // symmetric + } +} + +void TestImpulseResponse(size_t width, size_t peak) { + const auto rg3 = CreateRecursiveGaussian(3.0); + const auto rg4 = CreateRecursiveGaussian(4.0); + const auto rg5 = CreateRecursiveGaussian(5.0); + + // Extra padding for 4x unrolling + auto in = hwy::AllocateAligned(width + 3); + memset(in.get(), 0, sizeof(float) * (width + 3)); + in[peak] = 1.0f; + + auto out3 = hwy::AllocateAligned(width + 3); + auto out4 = hwy::AllocateAligned(width + 3); + auto out5 = hwy::AllocateAligned(width + 3); + FastGaussian1D(rg3, in.get(), width, out3.get()); + FastGaussian1D(rg4, out3.get(), width, out4.get()); + FastGaussian1D(rg5, in.get(), width, out5.get()); + + VerifySymmetric(width, peak, out3.get()); + VerifySymmetric(width, peak, out4.get()); + VerifySymmetric(width, peak, out5.get()); + + // Wider kernel has flatter peak + EXPECT_LT(out5[peak] + 0.05, out3[peak]); + + // Gauss3 o Gauss4 ~= Gauss5 + const double tolerance = NearEdge(width, peak) ? 
0.04 : 0.01; + for (size_t i = 0; i < width; ++i) { + EXPECT_NEAR(out4[i], out5[i], tolerance); + } +} + +void TestImpulseResponseForWidth(size_t width) { + for (size_t i = 0; i < width; ++i) { + TestImpulseResponse(width, i); + } +} + +TEST(GaussBlurTest, ImpulseResponse) { + TestImpulseResponseForWidth(10); // tiny even + TestImpulseResponseForWidth(15); // small odd + TestImpulseResponseForWidth(32); // power of two + TestImpulseResponseForWidth(31); // power of two - 1 + TestImpulseResponseForWidth(33); // power of two + 1 +} + +ImageF Convolve(const ImageF& in, const std::vector& kernel) { + return ConvolveAndSample(in, kernel, 1); +} + +// Higher-precision version for accuracy test. +ImageF ConvolveAndTransposeF64(const ImageF& in, + const std::vector& kernel) { + JXL_ASSERT(kernel.size() % 2 == 1); + ImageF out(in.ysize(), in.xsize()); + const int r = kernel.size() / 2; + std::vector row_tmp(in.xsize() + 2 * r); + float* const JXL_RESTRICT rowp = &row_tmp[r]; + const double* const kernelp = &kernel[r]; + for (size_t y = 0; y < in.ysize(); ++y) { + ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r); + for (size_t x = 0, ox = 0; x < in.xsize(); ++x, ++ox) { + double sum = 0.0; + for (int i = -r; i <= r; ++i) { + sum += rowp[std::max( + 0, std::min(static_cast(x) + i, in.xsize()))] * + kernelp[i]; + } + out.Row(ox)[y] = static_cast(sum); + } + } + return out; +} + +ImageF ConvolveF64(const ImageF& in, const std::vector& kernel) { + ImageF tmp = ConvolveAndTransposeF64(in, kernel); + return ConvolveAndTransposeF64(tmp, kernel); +} + +void TestDirac2D(size_t xsize, size_t ysize, double sigma) { + ImageF in(xsize, ysize); + ZeroFillImage(&in); + // We anyway ignore the border below, so might as well choose the middle. 
+ in.Row(ysize / 2)[xsize / 2] = 1.0f; + + ImageF temp(xsize, ysize); + ImageF out(xsize, ysize); + const auto rg = CreateRecursiveGaussian(sigma); + ThreadPool* null_pool = nullptr; + FastGaussian(rg, in, null_pool, &temp, &out); + + const std::vector kernel = + GaussianKernel(static_cast(4 * sigma), static_cast(sigma)); + const ImageF expected = Convolve(in, kernel); + + const double max_l1 = sigma < 1.5 ? 5E-3 : 6E-4; + const size_t border = 2 * sigma; + VerifyRelativeError(expected, out, max_l1, 1E-8, border); +} + +TEST(GaussBlurTest, Test2D) { + const std::vector dimensions{6, 15, 17, 64, 50, 49}; + for (int xsize : dimensions) { + for (int ysize : dimensions) { + for (double sigma : {1.0, 2.5, 3.6, 7.0}) { + TestDirac2D(static_cast(xsize), static_cast(ysize), + sigma); + } + } + } +} + +// Slow (44 sec). To run, remove the disabled prefix. +TEST(GaussBlurTest, DISABLED_SlowTestDirac1D) { + const double sigma = 7.0; + const auto rg = CreateRecursiveGaussian(sigma); + + // IPOL accuracy test uses 10^-15 tolerance, this is 2*10^-11. 
+ const size_t radius = static_cast(7 * sigma); + const std::vector kernel = GaussianKernel(radius, sigma); + + const size_t length = 16384; + ImageF inputs(length, 1); + ZeroFillImage(&inputs); + + auto outputs = hwy::AllocateAligned(length); + + // One per center position + auto sum_abs_err = hwy::AllocateAligned(length); + std::fill(sum_abs_err.get(), sum_abs_err.get() + length, 0.0); + + for (size_t center = radius; center < length - radius; ++center) { + inputs.Row(0)[center - 1] = 0.0f; // reset last peak, entire array now 0 + inputs.Row(0)[center] = 1.0f; + FastGaussian1D(rg, inputs.Row(0), length, outputs.get()); + + const ImageF outputs_fir = ConvolveF64(inputs, kernel); + + for (size_t i = 0; i < length; ++i) { + const float abs_err = std::abs(outputs[i] - outputs_fir.Row(0)[i]); + sum_abs_err[i] += static_cast(abs_err); + } + } + + const double max_abs_err = + *std::max_element(sum_abs_err.get(), sum_abs_err.get() + length); + printf("Max abs err: %.8e\n", max_abs_err); +} + +void TestRandom(size_t xsize, size_t ysize, float min, float max, double sigma, + double max_l1, double max_rel) { + printf("%4zu x %4zu %4.1f %4.1f sigma %.1f\n", xsize, ysize, min, max, sigma); + ImageF in(xsize, ysize); + RandomFillImage(&in, min, max, 65537 + xsize * 129 + ysize); + // FastGaussian/Convolve handle borders differently, so keep those pixels 0. 
+ const size_t border = 4 * sigma; + SetBorder(border, 0.0f, &in); + + ImageF temp(xsize, ysize); + ImageF out(xsize, ysize); + const auto rg = CreateRecursiveGaussian(sigma); + ThreadPool* null_pool = nullptr; + FastGaussian(rg, in, null_pool, &temp, &out); + + const std::vector kernel = + GaussianKernel(static_cast(4 * sigma), static_cast(sigma)); + const ImageF expected = Convolve(in, kernel); + + VerifyRelativeError(expected, out, max_l1, max_rel, border); +} + +void TestRandomForSizes(float min, float max, double sigma) { + double max_l1 = 5E-3; + double max_rel = 3E-3; + TestRandom(128, 1, min, max, sigma, max_l1, max_rel); + TestRandom(1, 128, min, max, sigma, max_l1, max_rel); + TestRandom(30, 201, min, max, sigma, max_l1 * 1.6, max_rel * 1.2); + TestRandom(201, 30, min, max, sigma, max_l1 * 1.6, max_rel * 1.2); + TestRandom(201, 201, min, max, sigma, max_l1 * 2.0, max_rel * 1.2); +} + +TEST(GaussBlurTest, TestRandom) { + // small non-negative + TestRandomForSizes(0.0f, 10.0f, 3.0f); + TestRandomForSizes(0.0f, 10.0f, 7.0f); + + // small negative + TestRandomForSizes(-4.0f, -1.0f, 3.0f); + TestRandomForSizes(-4.0f, -1.0f, 7.0f); + + // mixed positive/negative + TestRandomForSizes(-6.0f, 6.0f, 3.0f); + TestRandomForSizes(-6.0f, 6.0f, 7.0f); +} + +TEST(GaussBlurTest, TestSign) { + const size_t xsize = 500; + const size_t ysize = 606; + ImageF in(xsize, ysize); + + ZeroFillImage(&in); + const float center[33 * 33] = { + -0.128445f, -0.098473f, -0.121883f, -0.093601f, 0.095665f, -0.271332f, + -0.705475f, -1.324005f, -2.020741f, -1.329464f, 1.834064f, 4.787300f, + 5.834560f, 5.272720f, 3.967960f, 3.547935f, 3.432732f, 3.383015f, + 3.239326f, 3.290806f, 3.298954f, 3.397808f, 3.359730f, 3.533844f, + 3.511856f, 3.436787f, 3.428310f, 3.460209f, 3.550011f, 3.590942f, + 3.593109f, 3.560005f, 3.443165f, 0.089741f, 0.179230f, -0.032997f, + -0.182610f, 0.005669f, -0.244759f, -0.395123f, -0.514961f, -1.003529f, + -1.798656f, -2.377975f, 0.222191f, 3.957664f, 5.946804f, 
5.543129f, + 4.290096f, 3.621010f, 3.407257f, 3.392494f, 3.345367f, 3.391903f, + 3.441605f, 3.429260f, 3.444969f, 3.507130f, 3.518612f, 3.443111f, + 3.475948f, 3.536148f, 3.470333f, 3.628311f, 3.600243f, 3.292892f, + -0.226730f, -0.573616f, -0.762165f, -0.398739f, -0.189842f, -0.275921f, + -0.446739f, -0.550037f, -0.461033f, -0.724792f, -1.448349f, -1.814064f, + -0.491032f, 2.817703f, 5.213242f, 5.675629f, 4.864548f, 3.876324f, + 3.535587f, 3.530312f, 3.413765f, 3.386261f, 3.404854f, 3.383472f, + 3.420830f, 3.326496f, 3.257877f, 3.362152f, 3.489609f, 3.619587f, + 3.555805f, 3.423164f, 3.309708f, -0.483940f, -0.502926f, -0.592983f, + -0.492527f, -0.413616f, -0.482555f, -0.475506f, -0.447990f, -0.338120f, + -0.189072f, -0.376427f, -0.910828f, -1.878044f, -1.937927f, 1.423218f, + 4.871609f, 5.767548f, 5.103741f, 3.983868f, 3.633003f, 3.458263f, + 3.507309f, 3.247021f, 3.220612f, 3.326061f, 3.352814f, 3.291061f, + 3.322739f, 3.444302f, 3.506207f, 3.556839f, 3.529575f, 3.457024f, + -0.408161f, -0.431343f, -0.454369f, -0.356419f, -0.380924f, -0.399452f, + -0.439476f, -0.412189f, -0.306816f, -0.008213f, -0.325813f, -0.537842f, + -0.984100f, -1.805332f, -2.028198f, 0.773205f, 4.423046f, 5.604839f, + 5.231617f, 4.080299f, 3.603008f, 3.498741f, 3.517010f, 3.333897f, + 3.381336f, 3.342617f, 3.369686f, 3.434155f, 3.490452f, 3.607029f, + 3.555298f, 3.702297f, 3.618679f, -0.503609f, -0.578564f, -0.419014f, + -0.239883f, 0.269836f, 0.022984f, -0.455067f, -0.621777f, -0.304176f, + -0.163792f, -0.490250f, -0.466637f, -0.391792f, -0.657940f, -1.498035f, + -1.895836f, 0.036537f, 3.462456f, 5.586445f, 5.658791f, 4.434784f, + 3.423435f, 3.318848f, 3.202328f, 3.532764f, 3.436687f, 3.354881f, + 3.356941f, 3.382645f, 3.503902f, 3.512867f, 3.632366f, 3.537312f, + -0.274734f, -0.658829f, -0.726532f, -0.281254f, 0.053196f, -0.064991f, + -0.608517f, -0.720966f, -0.070602f, -0.111320f, -0.440956f, -0.492180f, + -0.488762f, -0.569283f, -1.012741f, -1.582779f, -2.101479f, -1.392380f, + 
2.451153f, 5.555855f, 6.096313f, 5.230045f, 4.068172f, 3.404274f, + 3.392586f, 3.326065f, 3.156670f, 3.284828f, 3.347012f, 3.319252f, + 3.352310f, 3.610790f, 3.499847f, -0.150600f, -0.314445f, -0.093575f, + -0.057384f, 0.053688f, -0.189255f, -0.263515f, -0.318653f, 0.053246f, + 0.080627f, -0.119553f, -0.152454f, -0.305420f, -0.404869f, -0.385944f, + -0.689949f, -1.204914f, -1.985748f, -1.711361f, 1.260658f, 4.626896f, + 5.888351f, 5.450989f, 4.070587f, 3.539200f, 3.383492f, 3.296318f, + 3.267334f, 3.436028f, 3.463005f, 3.502625f, 3.522282f, 3.403763f, + -0.348049f, -0.302303f, -0.137016f, -0.041737f, -0.164001f, -0.358849f, + -0.469627f, -0.428291f, -0.375797f, -0.246346f, -0.118950f, -0.084229f, + -0.205681f, -0.241199f, -0.391796f, -0.323151f, -0.241211f, -0.834137f, + -1.684219f, -1.972137f, 0.448399f, 4.019985f, 5.648144f, 5.647846f, + 4.295094f, 3.641884f, 3.374790f, 3.197342f, 3.425545f, 3.507481f, + 3.478065f, 3.430889f, 3.341900f, -1.016304f, -0.959221f, -0.909466f, + -0.810715f, -0.590729f, -0.594467f, -0.646721f, -0.629364f, -0.528561f, + -0.551819f, -0.301086f, -0.149101f, -0.060146f, -0.162220f, -0.326210f, + -0.156548f, -0.036293f, -0.426098f, -1.145470f, -1.628998f, -2.003052f, + -1.142891f, 2.885162f, 5.652863f, 5.718426f, 4.911140f, 3.234222f, + 3.473373f, 3.577183f, 3.271603f, 3.410435f, 3.505489f, 3.434032f, + -0.508911f, -0.438797f, -0.437450f, -0.627426f, -0.511745f, -0.304874f, + -0.274246f, -0.261841f, -0.228466f, -0.342491f, -0.528206f, -0.490082f, + -0.516350f, -0.361694f, -0.398514f, -0.276020f, -0.210369f, -0.355938f, + -0.402622f, -0.538864f, -1.249573f, -2.100105f, -0.996178f, 1.886410f, + 4.929745f, 5.630871f, 5.444199f, 4.042740f, 3.739189f, 3.691399f, + 3.391956f, 3.469696f, 3.431232f, 0.204849f, 0.205433f, -0.131927f, + -0.367908f, -0.374378f, -0.126820f, -0.186951f, -0.228565f, -0.081776f, + -0.143143f, -0.379230f, -0.598701f, -0.458019f, -0.295586f, -0.407730f, + -0.245853f, -0.043140f, 0.024242f, -0.038998f, -0.044151f, 
-0.425991f, + -1.240753f, -1.943146f, -2.174755f, 0.523415f, 4.376751f, 5.956558f, + 5.850082f, 4.403152f, 3.517399f, 3.560753f, 3.554836f, 3.471985f, + -0.508503f, -0.109783f, 0.057747f, 0.190079f, -0.257153f, -0.591980f, + -0.666771f, -0.525391f, -0.293060f, -0.489731f, -0.304855f, -0.259644f, + -0.367825f, -0.346977f, -0.292889f, -0.215652f, -0.120705f, -0.176010f, + -0.422905f, -0.114647f, -0.289749f, -0.374203f, -0.606754f, -1.127949f, + -1.994583f, -0.588058f, 3.415840f, 5.603470f, 5.811581f, 4.959423f, + 3.721760f, 3.710499f, 3.785461f, -0.554588f, -0.565517f, -0.434578f, + -0.012482f, -0.284660f, -0.699795f, -0.957535f, -0.755135f, -0.382034f, + -0.321552f, -0.287571f, -0.279537f, -0.314972f, -0.256287f, -0.372818f, + -0.316017f, -0.287975f, -0.365639f, -0.512589f, -0.420692f, -0.436485f, + -0.295353f, -0.451958f, -0.755459f, -1.272358f, -2.301353f, -1.776161f, + 1.572483f, 4.826286f, 5.741898f, 5.162853f, 4.028049f, 3.686325f, + -0.495590f, -0.664413f, -0.760044f, -0.152634f, -0.286480f, -0.340462f, + 0.076477f, 0.187706f, -0.068787f, -0.293491f, -0.361145f, -0.292515f, + -0.140671f, -0.190723f, -0.333302f, -0.368168f, -0.192581f, -0.154499f, + -0.236544f, -0.124405f, -0.208321f, -0.465607f, -0.883080f, -1.104813f, + -1.210567f, -1.415665f, -1.924683f, -1.634758f, 0.601017f, 4.276672f, + 5.501350f, 5.331257f, 3.809288f, -0.727722f, -0.533619f, -0.511524f, + -0.470688f, -0.610710f, -0.575130f, -0.311115f, -0.090420f, -0.297676f, + -0.646118f, -0.742805f, -0.485050f, -0.330910f, -0.275417f, -0.357037f, + -0.425598f, -0.481876f, -0.488941f, -0.393551f, -0.051105f, -0.090755f, + -0.328674f, -0.536369f, -0.533684f, -0.336960f, -0.689194f, -1.187195f, + -1.860954f, -2.290253f, -0.424774f, 3.050060f, 5.083332f, 5.291920f, + -0.343605f, -0.190975f, -0.303692f, -0.456512f, -0.681820f, -0.690693f, + -0.416729f, -0.286446f, -0.442055f, -0.709148f, -0.569160f, -0.382423f, + -0.402321f, -0.383362f, -0.366413f, -0.290718f, -0.110069f, -0.220280f, + -0.279018f, 
-0.255424f, -0.262081f, -0.487556f, -0.444492f, -0.250500f, + -0.119583f, -0.291557f, -0.537781f, -1.104073f, -1.737091f, -1.697441f, + -0.323456f, 2.042049f, 4.605103f, -0.310631f, -0.279568f, -0.012695f, + -0.160130f, -0.358746f, -0.421101f, -0.559677f, -0.474136f, -0.416565f, + -0.561817f, -0.534672f, -0.519157f, -0.767197f, -0.605831f, -0.186523f, + 0.219872f, 0.264984f, -0.193432f, -0.363182f, -0.467472f, -0.462009f, + -0.571053f, -0.522476f, -0.315903f, -0.237427f, -0.147320f, -0.100201f, + -0.237568f, -0.763435f, -1.242043f, -2.135159f, -1.409485f, 1.236370f, + -0.474247f, -0.517906f, -0.410217f, -0.542244f, -0.795986f, -0.590004f, + -0.388863f, -0.462921f, -0.810627f, -0.778637f, -0.512486f, -0.718025f, + -0.710854f, -0.482513f, -0.318233f, -0.194962f, -0.220116f, -0.421673f, + -0.534233f, -0.403339f, -0.389332f, -0.407303f, -0.437355f, -0.469730f, + -0.359600f, -0.352745f, -0.466755f, -0.414585f, -0.430756f, -0.656822f, + -1.237038f, -2.046097f, -1.574898f, -0.593815f, -0.582165f, -0.336098f, + -0.372612f, -0.554386f, -0.410603f, -0.428276f, -0.647644f, -0.640720f, + -0.582207f, -0.414112f, -0.435547f, -0.435505f, -0.332561f, -0.248116f, + -0.340221f, -0.277855f, -0.352699f, -0.377319f, -0.230850f, -0.313267f, + -0.446270f, -0.346237f, -0.420422f, -0.530781f, -0.400341f, -0.463661f, + -0.209091f, -0.056705f, -0.011772f, -0.169388f, -0.736275f, -1.463017f, + -0.752701f, -0.668865f, -0.329765f, -0.299347f, -0.245667f, -0.286999f, + -0.520420f, -0.675438f, -0.255753f, 0.141357f, -0.079639f, -0.419476f, + -0.374069f, -0.046253f, 0.116116f, -0.145847f, -0.380371f, -0.563412f, + -0.638634f, -0.310116f, -0.260914f, -0.508404f, -0.465508f, -0.527824f, + -0.370979f, -0.305595f, -0.244694f, -0.254490f, 0.009968f, -0.050201f, + -0.331219f, -0.614960f, -0.788208f, -0.483242f, -0.367516f, -0.186951f, + -0.180031f, 0.129711f, -0.127811f, -0.384750f, -0.499542f, -0.418613f, + -0.121635f, 0.203197f, -0.167290f, -0.397270f, -0.355461f, -0.218746f, + -0.376785f, -0.521698f, 
-0.721581f, -0.845741f, -0.535439f, -0.220882f, + -0.309067f, -0.555248f, -0.690342f, -0.664948f, -0.390102f, 0.020355f, + -0.130447f, -0.173252f, -0.170059f, -0.633663f, -0.956001f, -0.621696f, + -0.388302f, -0.342262f, -0.244370f, -0.386948f, -0.401421f, -0.172979f, + -0.206163f, -0.450058f, -0.525789f, -0.549274f, -0.349251f, -0.474613f, + -0.667976f, -0.435600f, -0.175369f, -0.196877f, -0.202976f, -0.242481f, + -0.258369f, -0.189133f, -0.395397f, -0.765499f, -0.944016f, -0.850967f, + -0.631561f, -0.152493f, -0.046432f, -0.262066f, -0.195919f, 0.048218f, + 0.084972f, 0.039902f, 0.000618f, -0.404430f, -0.447456f, -0.418076f, + -0.631935f, -0.717415f, -0.502888f, -0.530514f, -0.747826f, -0.704041f, + -0.674969f, -0.516853f, -0.418446f, -0.327740f, -0.308815f, -0.481636f, + -0.440083f, -0.481720f, -0.341053f, -0.283897f, -0.324368f, -0.352829f, + -0.434349f, -0.545589f, -0.533104f, -0.472755f, -0.570496f, -0.557735f, + -0.708176f, -0.493332f, -0.194416f, -0.186249f, -0.256710f, -0.271835f, + -0.304752f, -0.431267f, -0.422398f, -0.646725f, -0.680801f, -0.249031f, + -0.058567f, -0.213890f, -0.383949f, -0.540291f, -0.549877f, -0.225567f, + -0.037174f, -0.499874f, -0.641010f, -0.628044f, -0.390549f, -0.311497f, + -0.542313f, -0.569565f, -0.473408f, -0.331245f, -0.357197f, -0.285599f, + -0.200157f, -0.201866f, -0.124428f, -0.346016f, -0.392311f, -0.264496f, + -0.285370f, -0.436974f, -0.523483f, -0.410461f, -0.267925f, -0.055016f, + -0.382458f, -0.319771f, -0.049927f, 0.124329f, 0.266102f, -0.106606f, + -0.773647f, -0.973053f, -0.708206f, -0.486137f, -0.319923f, -0.493900f, + -0.490860f, -0.324986f, -0.147346f, -0.146088f, -0.161758f, -0.084396f, + -0.379494f, 0.041626f, -0.113361f, -0.277767f, 0.083366f, 0.126476f, + 0.139057f, 0.038040f, 0.038162f, -0.242126f, -0.411736f, -0.370049f, + -0.455357f, -0.039257f, 0.264442f, -0.271492f, -0.425346f, -0.514847f, + -0.448650f, -0.580399f, -0.652603f, -0.774803f, -0.692524f, -0.579578f, + -0.465206f, -0.386265f, -0.458012f, 
-0.446594f, -0.284893f, -0.345448f, + -0.350876f, -0.440350f, -0.360378f, -0.270428f, 0.237213f, -0.063602f, + -0.364529f, -0.179867f, 0.078197f, 0.117947f, -0.093410f, -0.359119f, + -0.480961f, -0.540638f, -0.436287f, -0.598576f, -0.253735f, -0.060093f, + -0.549145f, -0.808327f, -0.698593f, -0.595764f, -0.582508f, -0.497353f, + -0.480892f, -0.584240f, -0.665791f, -0.690903f, -0.743446f, -0.796677f, + -0.782391f, -0.649010f, -0.628139f, -0.880848f, -0.829361f, -0.373272f, + -0.223667f, 0.174572f, -0.348743f, -0.798901f, -0.692307f, -0.607609f, + -0.401455f, -0.480919f, -0.450798f, -0.435413f, -0.322338f, -0.228382f, + -0.450466f, -0.504440f, -0.477402f, -0.662224f, -0.583397f, -0.217445f, + -0.157459f, -0.079584f, -0.226168f, -0.488720f, -0.669624f, -0.666878f, + -0.565311f, -0.549625f, -0.364601f, -0.497627f, -0.736897f, -0.763023f, + -0.741020f, -0.404503f, 0.184814f, -0.075315f, -0.281513f, -0.532906f, + -0.405800f, -0.313438f, -0.536652f, -0.403381f, 0.011967f, 0.103310f, + -0.269848f, -0.508656f, -0.445923f, -0.644859f, -0.617870f, -0.500927f, + -0.371559f, -0.125580f, 0.028625f, -0.154713f, -0.442024f, -0.492764f, + -0.199371f, 0.236305f, 0.225925f, 0.075577f, -0.285812f, -0.437145f, + -0.374260f, -0.156693f, -0.129635f, -0.243206f, -0.123058f, 0.162148f, + -0.313152f, -0.337982f, -0.358421f, 0.040070f, 0.038925f, -0.333313f, + -0.351662f, 0.023014f, 0.091362f, -0.282890f, -0.373253f, -0.389050f, + -0.532707f, -0.423347f, -0.349968f, -0.287045f, -0.202442f, -0.308430f, + -0.222801f, -0.106323f, -0.056358f, 0.027222f, 0.390732f, 0.033558f, + -0.160088f, -0.382217f, -0.535282f, -0.515900f, -0.022736f, 0.165665f, + -0.111408f, -0.233784f, -0.312357f, -0.541885f, -0.480022f, -0.482513f, + -0.246254f, 0.132244f, 0.090134f, 0.234634f, -0.089249f, -0.460854f, + -0.515457f, -0.450874f, -0.311031f, -0.387680f, -0.360554f, -0.179241f, + -0.283817f, -0.475815f, -0.246399f, -0.388958f, -0.551140f, -0.496239f, + -0.559879f, -0.379761f, -0.254288f, -0.395111f, -0.613018f, 
-0.459427f, + -0.263580f, -0.268929f, 0.080826f, 0.115616f, -0.097324f, -0.325310f, + -0.480450f, -0.313286f, -0.310371f, -0.517361f, -0.288288f, -0.112679f, + -0.173241f, -0.221664f, -0.039452f, -0.107578f, -0.089630f, -0.483768f, + -0.571087f, -0.497108f, -0.321533f, -0.375492f, -0.540363f, -0.406815f, + -0.388512f, -0.514561f, -0.540192f, -0.402412f, -0.232246f, -0.304749f, + -0.383724f, -0.679596f, -0.685463f, -0.694538f, -0.642937f, -0.425789f, + 0.103271f, -0.194862f, -0.487999f, -0.717281f, -0.681850f, -0.709286f, + -0.615398f, -0.554245f, -0.254681f, -0.049950f, -0.002914f, -0.095383f, + -0.370911f, -0.564224f, -0.242714f}; + const size_t xtest = xsize / 2; + const size_t ytest = ysize / 2; + + for (intptr_t dy = -16; dy <= 16; ++dy) { + float* row = in.Row(ytest + dy); + for (intptr_t dx = -16; dx <= 16; ++dx) + row[xtest + dx] = center[(dy + 16) * 33 + (dx + 16)]; + } + + const double sigma = 7.155933; + + ImageF temp(xsize, ysize); + ImageF out_rg(xsize, ysize); + const auto rg = CreateRecursiveGaussian(sigma); + ThreadPool* null_pool = nullptr; + FastGaussian(rg, in, null_pool, &temp, &out_rg); + + ImageF out_old; + { + const std::vector kernel = + GaussianKernel(static_cast(4 * sigma), static_cast(sigma)); + printf("old kernel size %zu\n", kernel.size()); + out_old = Convolve(in, kernel); + } + + printf("rg %.4f old %.4f\n", out_rg.Row(ytest)[xtest], + out_old.Row(ytest)[xtest]); +} + +// Returns megapixels/sec. "div" is a divisor for the number of repetitions, +// used to reduce benchmark duration. Func returns elapsed time. 
+template +double Measure(const size_t xsize, const size_t ysize, int div, + const Func& func) { +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) + int reps = 10 / div; +#else + int reps = 2000 / div; +#endif + if (reps < 2) reps = 2; + std::vector elapsed; + for (int i = 0; i < reps; ++i) { + elapsed.push_back(func(xsize, ysize)); + } + + double mean_elapsed; + // Potential loss of precision, and also enough samples for mode. + if (reps > 50) { + std::sort(elapsed.begin(), elapsed.end()); + mean_elapsed = jxl::HalfSampleMode()(elapsed.data(), elapsed.size()); + } else { + // Skip first(noisier) + mean_elapsed = Geomean(elapsed.data() + 1, elapsed.size() - 1); + } + return (xsize * ysize * 1E-6) / mean_elapsed; +} + +void Benchmark1D() { + // Uncomment to disable SIMD and force and scalar implementation + // hwy::DisableTargets(~HWY_SCALAR); + + const size_t length = 16384; // (same value used for running IPOL benchmark) + const double sigma = 7.0; // (from Butteraugli application) + // NOTE: MSVC and clang disagree on the required captures, so use =. 
+ const double mps_rg1 = + Measure(length, 1, 1, [=](size_t /*xsize*/, size_t /*ysize*/) { + ImageF in(length, 1); + const float expected = length; + FillImage(expected, &in); + + ImageF temp(length, 1); + ImageF out(length, 1); + const auto rg = CreateRecursiveGaussian(sigma); + const double t0 = Now(); + FastGaussian1D(rg, in.Row(0), length, out.Row(0)); + const double t1 = Now(); + // Prevent optimizing out + const float actual = out.ConstRow(0)[length / 2]; + const float rel_err = std::abs(actual - expected) / expected; + EXPECT_LT(rel_err, 9E-5); + return t1 - t0; + }); + // Report milliseconds for comparison with IPOL benchmark + const double milliseconds = (1E-6 * length) / mps_rg1 * 1E3; + printf("%5zu @%.1f: rg 1D %e\n", length, sigma, milliseconds); +} + +void Benchmark(size_t xsize, size_t ysize, double sigma) { + // Uncomment to run AVX2 + // hwy::DisableTargets(HWY_AVX3); + + const double mps_rg = + Measure(xsize, ysize, 1, [sigma](size_t xsize, size_t ysize) { + ImageF in(xsize, ysize); + const float expected = xsize + ysize; + FillImage(expected, &in); + + ImageF temp(xsize, ysize); + ImageF out(xsize, ysize); + const auto rg = CreateRecursiveGaussian(sigma); + ThreadPool* null_pool = nullptr; + const double t0 = Now(); + FastGaussian(rg, in, null_pool, &temp, &out); + const double t1 = Now(); + // Prevent optimizing out + const float actual = out.ConstRow(ysize / 2)[xsize / 2]; + const float rel_err = std::abs(actual - expected) / expected; + EXPECT_LT(rel_err, 9E-5); + return t1 - t0; + }); + + const double mps_fir = + Measure(xsize, ysize, 100, [sigma](size_t xsize, size_t ysize) { + ImageF in(xsize, ysize); + const float expected = xsize + ysize; + FillImage(expected, &in); + const std::vector kernel = GaussianKernel( + static_cast(4 * sigma), static_cast(sigma)); + const double t0 = Now(); + const ImageF out = Convolve(in, kernel); + const double t1 = Now(); + + // Prevent optimizing out + const float actual = out.ConstRow(ysize / 2)[xsize / 2]; 
+ const float rel_err = std::abs(actual - expected) / expected; + EXPECT_LT(rel_err, 5E-6); + return t1 - t0; + }); + + const double mps_simd7 = + Measure(xsize, ysize, 10, [](size_t xsize, size_t ysize) { + ImageF in(xsize, ysize); + const float expected = xsize + ysize; + FillImage(expected, &in); + ImageF out(xsize, ysize); + // Gaussian with sigma 1 + const WeightsSeparable7 weights = { + {HWY_REP4(0.383103f), HWY_REP4(0.241843f), HWY_REP4(0.060626f), + HWY_REP4(0.00598f)}, + {HWY_REP4(0.383103f), HWY_REP4(0.241843f), HWY_REP4(0.060626f), + HWY_REP4(0.00598f)}}; + ThreadPool* null_pool = nullptr; + const double t0 = Now(); + Separable7(in, Rect(in), weights, null_pool, &out); + const double t1 = Now(); + + // Prevent optimizing out + const float actual = out.ConstRow(ysize / 2)[xsize / 2]; + const float rel_err = std::abs(actual - expected) / expected; + EXPECT_LT(rel_err, 5E-6); + return t1 - t0; + }); + + printf("%zu,%zu,%.1f,%.1f,%.1f\n", xsize, ysize, mps_fir, mps_simd7, mps_rg); +} + +TEST(GaussBlurTest, BenchmarkTest) { + Benchmark1D(); + Benchmark(77, 177, 7); +} + +TEST(GaussBlurTest, DISABLED_SlowBenchmark) { + Benchmark1D(); + + // Euler's gamma as a nothing-up-my-sleeve number, so sizes are unlikely to + // interact with cache properties + const float g = 0.57721566; + const size_t d0 = 128; + const size_t d1 = static_cast(d0 / g); + const size_t d2 = static_cast(d1 / g); + const size_t d3 = static_cast(d2 / g); + Benchmark(d0, d0, 7); + Benchmark(d0, d1, 7); + Benchmark(d1, d0, 7); + Benchmark(d1, d1, 7); + Benchmark(d1, d2, 7); + Benchmark(d2, d1, 7); + Benchmark(d2, d2, 7); + Benchmark(d2, d3, 7); + Benchmark(d3, d2, 7); + Benchmark(d3, d3, 7); + + Benchmark(1920, 1080, 7); + + PROFILER_PRINT_RESULTS(); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gradient_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gradient_test.cc new file mode 100644 index 0000000000..332684ae4c --- /dev/null +++ 
b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/gradient_test.cc
@@ -0,0 +1,205 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+// NOTE(review): the six system #include directives above carry no header name
+// in this copy; they must be restored before this file can compile.
+
+#include "gtest/gtest.h"
+#include "lib/jxl/aux_out.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_file.h"
+#include "lib/jxl/dec_params.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+namespace {
+
+// Returns distance of point p to line p0..p1, the result is signed and is not
+// normalized.
+// (x0, y0) and (x1, y1) are two points on the line, (x, y) is the query point.
+// The value is the numerator of the usual point-line distance formula: it is
+// not divided by the length of p0..p1, its sign flips between the two sides of
+// the line, and it is 0 for a point on the line.
+double PointLineDist(double x0, double y0, double x1, double y1, double x,
+                     double y) {
+  return (y1 - y0) * x - (x1 - x0) * y + x1 * y0 - y1 * x0;
+}
+
+// Generates a test image with a gradient from one color to another.
+// Angle in degrees, colors can be given in hex as 0xRRGGBB. The angle is the
+// angle in which the change direction happens.
+Image3F GenerateTestGradient(uint32_t color0, uint32_t color1, double angle,
+                             size_t xsize, size_t ysize) {
+  // Reference line: through the image centre (integer division, then widened
+  // to double) and a second point one unit away in the direction of `angle`.
+  const double x0 = xsize / 2;
+  const double y0 = ysize / 2;
+  const double radians = angle / 360.0 * 2.0 * kPi;
+  const double x1 = x0 + std::sin(radians);
+  const double y1 = y0 + std::cos(radians);
+
+  // Normalisation constant: the larger unsigned line distance of the two top
+  // corners (0, 0) and (xsize, 0).
+  const double dist_top_left = fabs(PointLineDist(x0, y0, x1, y1, 0, 0));
+  const double dist_top_right = fabs(PointLineDist(x0, y0, x1, y1, xsize, 0));
+  const double maxdist = std::max(dist_top_left, dist_top_right);
+
+  Image3F image(xsize, ysize);
+  for (size_t c = 0; c < 3; ++c) {
+    // Channel 0 reads bits 16..23 (RR), channel 1 bits 8..15, channel 2 bits
+    // 0..7 of the 0xRRGGBB colors.
+    const size_t shift = 8 * (2 - c);
+    const float c0 = ((color0 >> shift) & 255);
+    const float c1 = ((color1 >> shift) & 255);
+    for (size_t y = 0; y < ysize; ++y) {
+      float* row = image.PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        const double dist = PointLineDist(x0, y0, x1, y1, x, y);
+        // Map the signed distance to a blend weight (0.5 on the line itself).
+        const double v = ((dist / maxdist) + 1.0) / 2.0;
+        row[x] = c0 * (1.0 - v) + c1 * v;
+      }
+    }
+  }
+
+  return image;
+}
+
+// Computes the max of the horizontal and vertical second derivative for each
+// pixel, where second derivative means absolute value of difference of left
+// delta and right delta (top/bottom for vertical direction).
+// The radius over which the derivative is computed is only 1 pixel and it only
+// checks two angles (hor and ver), but this approximation works well enough.
+static ImageF Gradient2(const ImageF& image) {
+  size_t xsize = image.xsize();
+  size_t ysize = image.ysize();
+  ImageF image2(image.xsize(), image.ysize());
+  // Interior pixels: needs one neighbour on every side, so rows and columns
+  // 0 and size-1 are skipped here and filled in below.
+  for (size_t y = 1; y + 1 < ysize; y++) {
+    const auto* JXL_RESTRICT row0 = image.Row(y - 1);
+    const auto* JXL_RESTRICT row1 = image.Row(y);
+    const auto* JXL_RESTRICT row2 = image.Row(y + 1);
+    auto* row_out = image2.Row(y);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      // Difference between the left and right (resp. top and bottom) deltas.
+      float ddx = (row1[x] - row1[x - 1]) - (row1[x + 1] - row1[x]);
+      float ddy = (row1[x] - row0[x]) - (row2[x] - row1[x]);
+      row_out[x] = std::max(fabsf(ddx), fabsf(ddy));
+    }
+  }
+  // Copy to the borders
+  if (ysize > 2) {
+    // Top and bottom rows take the values of their nearest interior row
+    // (columns 1..xsize-2 only; the outer columns are handled further down).
+    auto* JXL_RESTRICT row0 = image2.Row(0);
+    const auto* JXL_RESTRICT row1 = image2.Row(1);
+    const auto* JXL_RESTRICT row2 = image2.Row(ysize - 2);
+    auto* JXL_RESTRICT row3 = image2.Row(ysize - 1);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      row0[x] = row1[x];
+      row3[x] = row2[x];
+    }
+  } else {
+    const auto* row0_in = image.Row(0);
+    const auto* row1_in = image.Row(ysize - 1);
+    auto* row0_out = image2.Row(0);
+    auto* row1_out = image2.Row(ysize - 1);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      // Image too short (at most 2 rows), take the first derivative between
+      // the first and last row instead
+      row0_out[x] = row1_out[x] = fabsf(row0_in[x] - row1_in[x]);
+    }
+  }
+  if (xsize > 2) {
+    // Left/right columns copy their neighbour; this runs after the row fill
+    // above, so the four corners pick up the already-filled border values.
+    for (size_t y = 0; y < ysize; y++) {
+      auto* row = image2.Row(y);
+      row[0] = row[1];
+      row[xsize - 1] = row[xsize - 2];
+    }
+  } else {
+    for (size_t y = 0; y < ysize; y++) {
+      const auto* JXL_RESTRICT row_in = image.Row(y);
+      auto* row_out = image2.Row(y);
+      // Image too narrow, take first derivative instead
+      row_out[0] = row_out[xsize - 1] = fabsf(row_in[0] - row_in[xsize - 1]);
+    }
+  }
+  return image2;
+}
+
+// Three-plane overload: applies the single-plane Gradient2 to each plane.
+static Image3F Gradient2(const Image3F& image) {
+  return Image3F(Gradient2(image.Plane(0)), Gradient2(image.Plane(1)),
+                 Gradient2(image.Plane(2)));
+}
+
+/*
+Tests if roundtrip with jxl on a gradient image doesn't cause banding.
+Only tests if use_gradient is true.
Set to false for debugging to see the +distance values. +Angle in degrees, colors can be given in hex as 0xRRGGBB. +*/ +void TestGradient(ThreadPool* pool, uint32_t color0, uint32_t color1, + size_t xsize, size_t ysize, float angle, bool fast_mode, + float butteraugli_distance, bool use_gradient = true) { + CompressParams cparams; + cparams.butteraugli_distance = butteraugli_distance; + if (fast_mode) { + cparams.speed_tier = SpeedTier::kSquirrel; + } + DecompressParams dparams; + + Image3F gradient = GenerateTestGradient(color0, color1, angle, xsize, ysize); + + CodecInOut io; + io.metadata.m.SetUintSamples(8); + io.metadata.m.color_encoding = ColorEncoding::SRGB(); + io.SetFromImage(std::move(gradient), io.metadata.m.color_encoding); + + CodecInOut io2; + + PaddedBytes compressed; + AuxOut* aux_out = nullptr; + PassesEncoderState enc_state; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + EXPECT_TRUE(io2.Main().TransformTo(io2.metadata.m.color_encoding, pool)); + + if (use_gradient) { + // Test that the gradient map worked. For that, we take a second derivative + // of the image with Gradient2 to measure how linear the change is in x and + // y direction. For a well handled gradient, we expect max values around + // 0.1, while if there is noticeable banding, which means the gradient map + // failed, the values are around 0.5-1.0 (regardless of + // butteraugli_distance). + Image3F gradient2 = Gradient2(*io2.Main().color()); + + std::array image_max; + Image3Max(gradient2, &image_max); + + // TODO(jyrki): These values used to work with 0.2, 0.2, 0.2. 
+ EXPECT_LE(image_max[0], 3.15); + EXPECT_LE(image_max[1], 1.72); + EXPECT_LE(image_max[2], 5.05); + } +} + +static constexpr bool fast_mode = true; + +TEST(GradientTest, SteepGradient) { + ThreadPoolInternal pool(8); + // Relatively steep gradients, colors from the sky of stp.png + TestGradient(&pool, 0xd99d58, 0x889ab1, 512, 512, 90, fast_mode, 3.0); +} + +TEST(GradientTest, SubtleGradient) { + ThreadPoolInternal pool(8); + // Very subtle gradient + TestGradient(&pool, 0xb89b7b, 0xa89b8d, 512, 512, 90, fast_mode, 4.0); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc new file mode 100644 index 0000000000..41e8595bc4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/headers.cc @@ -0,0 +1,212 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/headers.h" + +#include "lib/jxl/common.h" +#include "lib/jxl/fields.h" + +namespace jxl { +namespace { + +struct Rational { + constexpr explicit Rational(uint32_t num, uint32_t den) + : num(num), den(den) {} + + // Returns floor(multiplicand * rational). 
+ constexpr uint32_t MulTruncate(uint32_t multiplicand) const { + return uint64_t(multiplicand) * num / den; + } + + uint32_t num; + uint32_t den; +}; + +Rational FixedAspectRatios(uint32_t ratio) { + JXL_ASSERT(0 != ratio && ratio < 8); + // Other candidates: 5/4, 7/5, 14/9, 16/10, 5/3, 21/9, 12/5 + constexpr Rational kRatios[7] = {Rational(1, 1), // square + Rational(12, 10), // + Rational(4, 3), // camera + Rational(3, 2), // mobile camera + Rational(16, 9), // camera/display + Rational(5, 4), // + Rational(2, 1)}; // + return kRatios[ratio - 1]; +} + +uint32_t FindAspectRatio(uint32_t xsize, uint32_t ysize) { + for (uint32_t r = 1; r < 8; ++r) { + if (xsize == FixedAspectRatios(r).MulTruncate(ysize)) { + return r; + } + } + return 0; // Must send xsize instead +} + +} // namespace + +size_t SizeHeader::xsize() const { + if (ratio_ != 0) { + return FixedAspectRatios(ratio_).MulTruncate( + static_cast(ysize())); + } + return small_ ? ((xsize_div8_minus_1_ + 1) * 8) : xsize_; +} + +Status SizeHeader::Set(size_t xsize64, size_t ysize64) { + if (xsize64 > 0xFFFFFFFFull || ysize64 > 0xFFFFFFFFull) { + return JXL_FAILURE("Image too large"); + } + const uint32_t xsize32 = static_cast(xsize64); + const uint32_t ysize32 = static_cast(ysize64); + if (xsize64 == 0 || ysize64 == 0) return JXL_FAILURE("Empty image"); + small_ = xsize64 <= 256 && ysize64 <= 256 && (xsize64 % kBlockDim) == 0 && + (ysize64 % kBlockDim) == 0; + if (small_) { + ysize_div8_minus_1_ = ysize32 / 8 - 1; + } else { + ysize_ = ysize32; + } + + ratio_ = FindAspectRatio(xsize32, ysize32); + if (ratio_ == 0) { + if (small_) { + xsize_div8_minus_1_ = xsize32 / 8 - 1; + } else { + xsize_ = xsize32; + } + } + JXL_ASSERT(xsize() == xsize64); + JXL_ASSERT(ysize() == ysize64); + return true; +} + +Status PreviewHeader::Set(size_t xsize64, size_t ysize64) { + const uint32_t xsize32 = static_cast(xsize64); + const uint32_t ysize32 = static_cast(ysize64); + if (xsize64 == 0 || ysize64 == 0) return 
JXL_FAILURE("Empty preview"); + div8_ = (xsize64 % kBlockDim) == 0 && (ysize64 % kBlockDim) == 0; + if (div8_) { + ysize_div8_ = ysize32 / 8; + } else { + ysize_ = ysize32; + } + + ratio_ = FindAspectRatio(xsize32, ysize32); + if (ratio_ == 0) { + if (div8_) { + xsize_div8_ = xsize32 / 8; + } else { + xsize_ = xsize32; + } + } + JXL_ASSERT(xsize() == xsize64); + JXL_ASSERT(ysize() == ysize64); + return true; +} + +size_t PreviewHeader::xsize() const { + if (ratio_ != 0) { + return FixedAspectRatios(ratio_).MulTruncate( + static_cast(ysize())); + } + return div8_ ? (xsize_div8_ * 8) : xsize_; +} + +SizeHeader::SizeHeader() { Bundle::Init(this); } +Status SizeHeader::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &small_)); + + if (visitor->Conditional(small_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, 0, &ysize_div8_minus_1_)); + } + if (visitor->Conditional(!small_)) { + // (Could still be small, but non-multiple of 8.) + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(9, 1), BitsOffset(13, 1), + BitsOffset(18, 1), BitsOffset(30, 1), + 1, &ysize_)); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &ratio_)); + if (visitor->Conditional(ratio_ == 0 && small_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, 0, &xsize_div8_minus_1_)); + } + if (visitor->Conditional(ratio_ == 0 && !small_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(9, 1), BitsOffset(13, 1), + BitsOffset(18, 1), BitsOffset(30, 1), + 1, &xsize_)); + } + + return true; +} + +PreviewHeader::PreviewHeader() { Bundle::Init(this); } +Status PreviewHeader::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &div8_)); + + if (visitor->Conditional(div8_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), Val(32), BitsOffset(5, 1), + BitsOffset(9, 33), 1, &ysize_div8_)); + } + if (visitor->Conditional(!div8_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(6, 1), BitsOffset(8, 65), + 
BitsOffset(10, 321), + BitsOffset(12, 1345), 1, &ysize_)); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &ratio_)); + if (visitor->Conditional(ratio_ == 0 && div8_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), Val(32), BitsOffset(5, 1), + BitsOffset(9, 33), 1, &xsize_div8_)); + } + if (visitor->Conditional(ratio_ == 0 && !div8_)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(6, 1), BitsOffset(8, 65), + BitsOffset(10, 321), + BitsOffset(12, 1345), 1, &xsize_)); + } + + return true; +} + +AnimationHeader::AnimationHeader() { Bundle::Init(this); } +Status AnimationHeader::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(100), Val(1000), BitsOffset(10, 1), + BitsOffset(30, 1), 1, &tps_numerator)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(1), Val(1001), BitsOffset(8, 1), + BitsOffset(10, 1), 1, + &tps_denominator)); + + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Bits(3), Bits(16), Bits(32), 0, &num_loops)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_timecodes)); + return true; +} + +Status ReadSizeHeader(BitReader* JXL_RESTRICT reader, + SizeHeader* JXL_RESTRICT size) { + return Bundle::Read(reader, size); +} + +Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* aux_out) { + const size_t max_bits = Bundle::MaxBits(size); + if (max_bits != SizeHeader::kMaxBits) { + JXL_ABORT("Please update SizeHeader::kMaxBits from %zu to %zu\n", + SizeHeader::kMaxBits, max_bits); + } + + // Only check the number of non-extension bits (extensions are unbounded). + // (Bundle::Write will call CanEncode again, but it is fast because SizeHeader + // is tiny.) 
+ size_t extension_bits, total_bits; + JXL_RETURN_IF_ERROR(Bundle::CanEncode(size, &extension_bits, &total_bits)); + JXL_ASSERT(total_bits - extension_bits < SizeHeader::kMaxBits); + + return Bundle::Write(size, writer, layer, aux_out); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/headers.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/headers.h new file mode 100644 index 0000000000..d33e2b5498 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/headers.h @@ -0,0 +1,106 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_HEADERS_H_ +#define LIB_JXL_HEADERS_H_ + +// Codestream headers, also stored in CodecInOut. + +#include +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +// Reserved by ISO/IEC 10918-1. LF causes files opened in text mode to be +// rejected because the marker changes to 0x0D instead. The 0xFF prefix also +// ensures there were no 7-bit transmission limitations. +static constexpr uint8_t kCodestreamMarker = 0x0A; + +// Compact representation of image dimensions (best case: 9 bits) so decoders +// can preallocate early. +class SizeHeader : public Fields { + public: + // All fields are valid after reading at most this many bits. WriteSizeHeader + // verifies this matches Bundle::MaxBits(SizeHeader). + static constexpr size_t kMaxBits = 78; + + SizeHeader(); + const char* Name() const override { return "SizeHeader"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + Status Set(size_t xsize, size_t ysize); + + size_t xsize() const; + size_t ysize() const { + return small_ ? 
((ysize_div8_minus_1_ + 1) * 8) : ysize_; + } + + private: + bool small_; // xsize and ysize <= 256 and divisible by 8. + + uint32_t ysize_div8_minus_1_; + uint32_t ysize_; + + uint32_t ratio_; + uint32_t xsize_div8_minus_1_; + uint32_t xsize_; +}; + +// (Similar to SizeHeader but different encoding because previews are smaller) +class PreviewHeader : public Fields { + public: + PreviewHeader(); + const char* Name() const override { return "PreviewHeader"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + Status Set(size_t xsize, size_t ysize); + + size_t xsize() const; + size_t ysize() const { return div8_ ? (ysize_div8_ * 8) : ysize_; } + + private: + bool div8_; // xsize and ysize divisible by 8. + + uint32_t ysize_div8_; + uint32_t ysize_; + + uint32_t ratio_; + uint32_t xsize_div8_; + uint32_t xsize_; +}; + +struct AnimationHeader : public Fields { + AnimationHeader(); + const char* Name() const override { return "AnimationHeader"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Ticks per second (expressed as rational number to support NTSC) + uint32_t tps_numerator; + uint32_t tps_denominator; + + uint32_t num_loops; // 0 means to repeat infinitely. + + bool have_timecodes; +}; + +Status ReadSizeHeader(BitReader* JXL_RESTRICT reader, + SizeHeader* JXL_RESTRICT size); + +Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* aux_out); + +} // namespace jxl + +#endif // LIB_JXL_HEADERS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc new file mode 100644 index 0000000000..9ae7865af6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.cc @@ -0,0 +1,161 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/huffman_table.h" + +#include /* for memcpy */ +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/dec_huffman.h" + +namespace jxl { + +/* Returns reverse(reverse(key, len) + 1, len), where reverse(key, len) is the + bit-wise reversal of the len least significant bits of key. */ +static inline int GetNextKey(int key, int len) { + int step = 1u << (len - 1); + while (key & step) { + step >>= 1; + } + return (key & (step - 1)) + step; +} + +/* Stores code in table[0], table[step], table[2*step], ..., table[end] */ +/* Assumes that end is an integer multiple of step */ +static inline void ReplicateValue(HuffmanCode* table, int step, int end, + HuffmanCode code) { + do { + end -= step; + table[end] = code; + } while (end > 0); +} + +/* Returns the table width of the next 2nd level table. count is the histogram + of bit lengths for the remaining symbols, len is the code length of the next + processed symbol */ +static inline size_t NextTableBitSize(const uint16_t* const count, size_t len, + int root_bits) { + size_t left = 1u << (len - root_bits); + while (len < PREFIX_MAX_BITS) { + if (left <= count[len]) break; + left -= count[len]; + ++len; + left <<= 1; + } + return len - root_bits; +} + +uint32_t BuildHuffmanTable(HuffmanCode* root_table, int root_bits, + const uint8_t* const code_lengths, + size_t code_lengths_size, uint16_t* count) { + HuffmanCode code; /* current table entry */ + HuffmanCode* table; /* next available space in table */ + size_t len; /* current code length */ + size_t symbol; /* symbol index in original or sorted table */ + int key; /* reversed prefix code */ + int step; /* step size to replicate values in current table */ + int low; /* low bits for current root entry */ + int mask; /* mask for low bits */ + size_t table_bits; /* key length of current table */ + int table_size; /* size of current table */ + int total_size; /* sum of root table size and 2nd level table sizes */ + /* offsets in sorted table for each 
length */ + uint16_t offset[PREFIX_MAX_BITS + 1]; + size_t max_length = 1; + + if (code_lengths_size > 1u << PREFIX_MAX_BITS) return 0; + + /* symbols sorted by code length */ + std::vector sorted_storage(code_lengths_size); + uint16_t* sorted = sorted_storage.data(); + + /* generate offsets into sorted symbol table by code length */ + { + uint16_t sum = 0; + for (len = 1; len <= PREFIX_MAX_BITS; len++) { + offset[len] = sum; + if (count[len]) { + sum = static_cast(sum + count[len]); + max_length = len; + } + } + } + + /* sort symbols by length, by symbol order within each length */ + for (symbol = 0; symbol < code_lengths_size; symbol++) { + if (code_lengths[symbol] != 0) { + sorted[offset[code_lengths[symbol]]++] = symbol; + } + } + + table = root_table; + table_bits = root_bits; + table_size = 1u << table_bits; + total_size = table_size; + + /* special case code with only one value */ + if (offset[PREFIX_MAX_BITS] == 1) { + code.bits = 0; + code.value = static_cast(sorted[0]); + for (key = 0; key < total_size; ++key) { + table[key] = code; + } + return total_size; + } + + /* fill in root table */ + /* let's reduce the table size to a smaller size if possible, and */ + /* create the repetitions by memcpy if possible in the coming loop */ + if (table_bits > max_length) { + table_bits = max_length; + table_size = 1u << table_bits; + } + key = 0; + symbol = 0; + code.bits = 1; + step = 2; + do { + for (; count[code.bits] != 0; --count[code.bits]) { + code.value = static_cast(sorted[symbol++]); + ReplicateValue(&table[key], step, table_size, code); + key = GetNextKey(key, code.bits); + } + step <<= 1; + } while (++code.bits <= table_bits); + + /* if root_bits != table_bits we only created one fraction of the */ + /* table, and we need to replicate it now. 
*/ + while (total_size != table_size) { + memcpy(&table[table_size], &table[0], table_size * sizeof(table[0])); + table_size <<= 1; + } + + /* fill in 2nd level tables and add pointers to root table */ + mask = total_size - 1; + low = -1; + for (len = root_bits + 1, step = 2; len <= max_length; ++len, step <<= 1) { + for (; count[len] != 0; --count[len]) { + if ((key & mask) != low) { + table += table_size; + table_bits = NextTableBitSize(count, len, root_bits); + table_size = 1u << table_bits; + total_size += table_size; + low = key & mask; + root_table[low].bits = static_cast(table_bits + root_bits); + root_table[low].value = + static_cast((table - root_table) - low); + } + code.bits = static_cast(len - root_bits); + code.value = static_cast(sorted[symbol++]); + ReplicateValue(&table[key >> root_bits], step, table_size, code); + key = GetNextKey(key, len); + } + } + + return total_size; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.h new file mode 100644 index 0000000000..11cdb2fc45 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_table.h @@ -0,0 +1,28 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_HUFFMAN_TABLE_H_ +#define LIB_JXL_HUFFMAN_TABLE_H_ + +#include +#include + +namespace jxl { + +struct HuffmanCode { + uint8_t bits; /* number of bits used for this symbol */ + uint16_t value; /* symbol value or table offset */ +}; + +/* Builds Huffman lookup table assuming code lengths are in symbol order. */ +/* Returns 0 in case of error (invalid tree or memory error), otherwise + populated size of table. 
*/ +uint32_t BuildHuffmanTable(HuffmanCode* root_table, int root_bits, + const uint8_t* code_lengths, + size_t code_lengths_size, uint16_t* count); + +} // namespace jxl + +#endif // LIB_JXL_HUFFMAN_TABLE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc new file mode 100644 index 0000000000..77107b08d2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.cc @@ -0,0 +1,328 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/huffman_tree.h" + +#include +#include +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { + +void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth, + uint8_t level) { + if (p.index_left >= 0) { + ++level; + SetDepth(pool[p.index_left], pool, depth, level); + SetDepth(pool[p.index_right_or_value], pool, depth, level); + } else { + depth[p.index_right_or_value] = level; + } +} + +// Sort the root nodes, least popular first. +static JXL_INLINE bool Compare(const HuffmanTree& v0, const HuffmanTree& v1) { + return v0.total_count < v1.total_count; +} + +// This function will create a Huffman tree. +// +// The catch here is that the tree cannot be arbitrarily deep. +// Brotli specifies a maximum depth of 15 bits for "code trees" +// and 7 bits for "code length code trees." +// +// count_limit is the value that is to be faked as the minimum value +// and this minimum value is raised until the tree matches the +// maximum length requirement. +// +// This algorithm is not of excellent performance for very long data blocks, +// especially when population counts are longer than 2**tree_limit, but +// we are not planning to use this with extremely long blocks. 
+// +// See http://en.wikipedia.org/wiki/Huffman_coding +void CreateHuffmanTree(const uint32_t* data, const size_t length, + const int tree_limit, uint8_t* depth) { + // For block sizes below 64 kB, we never need to do a second iteration + // of this loop. Probably all of our block sizes will be smaller than + // that, so this loop is mostly of academic interest. If we actually + // would need this, we would be better off with the Katajainen algorithm. + for (uint32_t count_limit = 1;; count_limit *= 2) { + std::vector tree; + tree.reserve(2 * length + 1); + + for (size_t i = length; i != 0;) { + --i; + if (data[i]) { + const uint32_t count = std::max(data[i], count_limit - 1); + tree.emplace_back(count, -1, static_cast(i)); + } + } + + const size_t n = tree.size(); + if (n == 1) { + // Fake value; will be fixed on upper level. + depth[tree[0].index_right_or_value] = 1; + break; + } + + std::stable_sort(tree.begin(), tree.end(), Compare); + + // The nodes are: + // [0, n): the sorted leaf nodes that we start with. + // [n]: we add a sentinel here. + // [n + 1, 2n): new parent nodes are added here, starting from + // (n+1). These are naturally in ascending order. + // [2n]: we add a sentinel at the end as well. + // There will be (2n+1) elements at the end. + const HuffmanTree sentinel(std::numeric_limits::max(), -1, -1); + tree.push_back(sentinel); + tree.push_back(sentinel); + + size_t i = 0; // Points to the next leaf node. + size_t j = n + 1; // Points to the next non-leaf node. + for (size_t k = n - 1; k != 0; --k) { + size_t left, right; + if (tree[i].total_count <= tree[j].total_count) { + left = i; + ++i; + } else { + left = j; + ++j; + } + if (tree[i].total_count <= tree[j].total_count) { + right = i; + ++i; + } else { + right = j; + ++j; + } + + // The sentinel node becomes the parent node. 
+ size_t j_end = tree.size() - 1; + tree[j_end].total_count = + tree[left].total_count + tree[right].total_count; + tree[j_end].index_left = static_cast(left); + tree[j_end].index_right_or_value = static_cast(right); + + // Add back the last sentinel node. + tree.push_back(sentinel); + } + JXL_DASSERT(tree.size() == 2 * n + 1); + SetDepth(tree[2 * n - 1], &tree[0], depth, 0); + + // We need to pack the Huffman tree in tree_limit bits. + // If this was not successful, add fake entities to the lowest values + // and retry. + if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) { + break; + } + } +} + +void Reverse(uint8_t* v, size_t start, size_t end) { + --end; + while (start < end) { + uint8_t tmp = v[start]; + v[start] = v[end]; + v[end] = tmp; + ++start; + --end; + } +} + +void WriteHuffmanTreeRepetitions(const uint8_t previous_value, + const uint8_t value, size_t repetitions, + size_t* tree_size, uint8_t* tree, + uint8_t* extra_bits_data) { + JXL_DASSERT(repetitions > 0); + if (previous_value != value) { + tree[*tree_size] = value; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + --repetitions; + } + if (repetitions == 7) { + tree[*tree_size] = value; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + --repetitions; + } + if (repetitions < 3) { + for (size_t i = 0; i < repetitions; ++i) { + tree[*tree_size] = value; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + } + } else { + repetitions -= 3; + size_t start = *tree_size; + while (true) { + tree[*tree_size] = 16; + extra_bits_data[*tree_size] = repetitions & 0x3; + ++(*tree_size); + repetitions >>= 2; + if (repetitions == 0) { + break; + } + --repetitions; + } + Reverse(tree, start, *tree_size); + Reverse(extra_bits_data, start, *tree_size); + } +} + +void WriteHuffmanTreeRepetitionsZeros(size_t repetitions, size_t* tree_size, + uint8_t* tree, uint8_t* extra_bits_data) { + if (repetitions == 11) { + tree[*tree_size] = 0; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + 
--repetitions; + } + if (repetitions < 3) { + for (size_t i = 0; i < repetitions; ++i) { + tree[*tree_size] = 0; + extra_bits_data[*tree_size] = 0; + ++(*tree_size); + } + } else { + repetitions -= 3; + size_t start = *tree_size; + while (true) { + tree[*tree_size] = 17; + extra_bits_data[*tree_size] = repetitions & 0x7; + ++(*tree_size); + repetitions >>= 3; + if (repetitions == 0) { + break; + } + --repetitions; + } + Reverse(tree, start, *tree_size); + Reverse(extra_bits_data, start, *tree_size); + } +} + +static void DecideOverRleUse(const uint8_t* depth, const size_t length, + bool* use_rle_for_non_zero, + bool* use_rle_for_zero) { + size_t total_reps_zero = 0; + size_t total_reps_non_zero = 0; + size_t count_reps_zero = 1; + size_t count_reps_non_zero = 1; + for (size_t i = 0; i < length;) { + const uint8_t value = depth[i]; + size_t reps = 1; + for (size_t k = i + 1; k < length && depth[k] == value; ++k) { + ++reps; + } + if (reps >= 3 && value == 0) { + total_reps_zero += reps; + ++count_reps_zero; + } + if (reps >= 4 && value != 0) { + total_reps_non_zero += reps; + ++count_reps_non_zero; + } + i += reps; + } + *use_rle_for_non_zero = total_reps_non_zero > count_reps_non_zero * 2; + *use_rle_for_zero = total_reps_zero > count_reps_zero * 2; +} + +void WriteHuffmanTree(const uint8_t* depth, size_t length, size_t* tree_size, + uint8_t* tree, uint8_t* extra_bits_data) { + uint8_t previous_value = 8; + + // Throw away trailing zeros. + size_t new_length = length; + for (size_t i = 0; i < length; ++i) { + if (depth[length - i - 1] == 0) { + --new_length; + } else { + break; + } + } + + // First gather statistics on if it is a good idea to do rle. + bool use_rle_for_non_zero = false; + bool use_rle_for_zero = false; + if (length > 50) { + // Find rle coding for longer codes. + // Shorter codes seem not to benefit from rle. + DecideOverRleUse(depth, new_length, &use_rle_for_non_zero, + &use_rle_for_zero); + } + + // Actual rle coding. 
+ for (size_t i = 0; i < new_length;) { + const uint8_t value = depth[i]; + size_t reps = 1; + if ((value != 0 && use_rle_for_non_zero) || + (value == 0 && use_rle_for_zero)) { + for (size_t k = i + 1; k < new_length && depth[k] == value; ++k) { + ++reps; + } + } + if (value == 0) { + WriteHuffmanTreeRepetitionsZeros(reps, tree_size, tree, extra_bits_data); + } else { + WriteHuffmanTreeRepetitions(previous_value, value, reps, tree_size, tree, + extra_bits_data); + previous_value = value; + } + i += reps; + } +} + +namespace { + +uint16_t ReverseBits(int num_bits, uint16_t bits) { + static const size_t kLut[16] = {// Pre-reversed 4-bit values. + 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe, + 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf}; + size_t retval = kLut[bits & 0xf]; + for (int i = 4; i < num_bits; i += 4) { + retval <<= 4; + bits = static_cast(bits >> 4); + retval |= kLut[bits & 0xf]; + } + retval >>= (-num_bits & 0x3); + return static_cast(retval); +} + +} // namespace + +void ConvertBitDepthsToSymbols(const uint8_t* depth, size_t len, + uint16_t* bits) { + // In Brotli, all bit depths are [1..15] + // 0 bit depth means that the symbol does not exist. 
+ const int kMaxBits = 16; // 0..15 are values for bits + uint16_t bl_count[kMaxBits] = {0}; + { + for (size_t i = 0; i < len; ++i) { + ++bl_count[depth[i]]; + } + bl_count[0] = 0; + } + uint16_t next_code[kMaxBits]; + next_code[0] = 0; + { + int code = 0; + for (size_t i = 1; i < kMaxBits; ++i) { + code = (code + bl_count[i - 1]) << 1; + next_code[i] = static_cast(code); + } + } + for (size_t i = 0; i < len; ++i) { + if (depth[i]) { + bits[i] = ReverseBits(depth[i], next_code[depth[i]]++); + } + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.h new file mode 100644 index 0000000000..e4ccac49bc --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/huffman_tree.h @@ -0,0 +1,52 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Library for creating Huffman codes from population counts. + +#ifndef LIB_JXL_HUFFMAN_TREE_H_ +#define LIB_JXL_HUFFMAN_TREE_H_ + +#include +#include + +namespace jxl { + +// A node of a Huffman tree. +struct HuffmanTree { + HuffmanTree(uint32_t count, int16_t left, int16_t right) + : total_count(count), index_left(left), index_right_or_value(right) {} + uint32_t total_count; + int16_t index_left; + int16_t index_right_or_value; +}; + +void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth, + uint8_t level); + +// This function will create a Huffman tree. +// +// The (data,length) contains the population counts. +// The tree_limit is the maximum bit depth of the Huffman codes. +// +// The depth contains the tree, i.e., how many bits are used for +// the symbol. 
+// +// See http://en.wikipedia.org/wiki/Huffman_coding +void CreateHuffmanTree(const uint32_t* data, const size_t length, + const int tree_limit, uint8_t* depth); + +// Write a Huffman tree from bit depths into the bitstream representation +// of a Huffman tree. The generated Huffman tree is to be compressed once +// more using a Huffman tree +void WriteHuffmanTree(const uint8_t* depth, size_t length, size_t* tree_size, + uint8_t* tree, uint8_t* extra_bits_data); + +// Get the actual bit values for a tree of bit depths. +void ConvertBitDepthsToSymbols(const uint8_t* depth, size_t len, + uint16_t* bits); + +} // namespace jxl + +#endif // LIB_JXL_HUFFMAN_TREE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/iaca_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/iaca_test.cc new file mode 100644 index 0000000000..9b2e8ea25c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/iaca_test.cc @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/iaca.h" + +#include "gtest/gtest.h" + +namespace jxl { +namespace { + +TEST(IacaTest, MarkersDefaultToDisabledAndDoNotCrash) { + BeginIACA(); + EndIACA(); +} + +TEST(IacaTest, ScopeDefaultToDisabledAndDoNotCrash) { ScopeIACA iaca; } + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc new file mode 100644 index 0000000000..619c81451e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.cc @@ -0,0 +1,404 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/icc_codec.h" + +#include + +#include +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/icc_codec_common.h" + +namespace jxl { +namespace { + +uint64_t DecodeVarInt(const uint8_t* input, size_t inputSize, size_t* pos) { + size_t i; + uint64_t ret = 0; + for (i = 0; *pos + i < inputSize && i < 10; ++i) { + ret |= uint64_t(input[*pos + i] & 127) << uint64_t(7 * i); + // If the next-byte flag is not set, stop + if ((input[*pos + i] & 128) == 0) break; + } + // TODO: Return a decoding error if i == 10. + *pos += i + 1; + return ret; +} + +// Shuffles or interleaves bytes, for example with width 2, turns "ABCDabcd" +// into "AaBbCcDc". Transposes a matrix of ceil(size / width) columns and +// width rows. There are size elements, size may be < width * height, if so the +// last elements of the rightmost column are missing, the missing spots are +// transposed along with the filled spots, and the result has the missing +// elements at the end of the bottom row. The input is the input matrix in +// scanline order but with missing elements skipped (which may occur in multiple +// locations), the output is the result matrix in scanline order (with +// no need to skip missing elements as they are past the end of the data). 
+void Shuffle(uint8_t* data, size_t size, size_t width) { + size_t height = (size + width - 1) / width; // amount of rows of output + PaddedBytes result(size); + // i = output index, j input index + size_t s = 0, j = 0; + for (size_t i = 0; i < size; i++) { + result[i] = data[j]; + j += height; + if (j >= size) j = ++s; + } + + for (size_t i = 0; i < size; i++) { + data[i] = result[i]; + } +} + +// TODO(eustas): should be 20, or even 18, once DecodeVarInt is improved; +// currently DecodeVarInt does not signal the errors, and marks +// 11 bytes as used even if only 10 are used (and 9 is enough for +// 63-bit values). +constexpr const size_t kPreambleSize = 22; // enough for reading 2 VarInts + +} // namespace + +// Mimics the beginning of UnpredictICC for quick validity check. +// At least kPreambleSize bytes of data should be valid at invocation time. +Status CheckPreamble(const PaddedBytes& data, size_t enc_size, + size_t output_limit) { + const uint8_t* enc = data.data(); + size_t size = data.size(); + size_t pos = 0; + uint64_t osize = DecodeVarInt(enc, size, &pos); + JXL_RETURN_IF_ERROR(CheckIs32Bit(osize)); + if (pos >= size) return JXL_FAILURE("Out of bounds"); + uint64_t csize = DecodeVarInt(enc, size, &pos); + JXL_RETURN_IF_ERROR(CheckIs32Bit(csize)); + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, csize, size)); + // We expect that UnpredictICC inflates input, not the other way round. + if (osize + 65536 < enc_size) return JXL_FAILURE("Malformed ICC"); + if (output_limit && osize > output_limit) { + return JXL_FAILURE("Decoded ICC is too large"); + } + return true; +} + +// Decodes the result of PredictICC back to a valid ICC profile. +Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result) { + if (!result->empty()) return JXL_FAILURE("result must be empty initially"); + size_t pos = 0; + // TODO(lode): technically speaking we need to check that the entire varint + // decoding never goes out of bounds, not just the first byte. 
This requires + // a DecodeVarInt function that returns an error code. It is safe to use + // DecodeVarInt with out of bounds values, it silently returns, but the + // specification requires an error. Idem for all DecodeVarInt below. + if (pos >= size) return JXL_FAILURE("Out of bounds"); + uint64_t osize = DecodeVarInt(enc, size, &pos); // Output size + JXL_RETURN_IF_ERROR(CheckIs32Bit(osize)); + if (pos >= size) return JXL_FAILURE("Out of bounds"); + uint64_t csize = DecodeVarInt(enc, size, &pos); // Commands size + // Every command is translated to at least on byte. + JXL_RETURN_IF_ERROR(CheckIs32Bit(csize)); + size_t cpos = pos; // pos in commands stream + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, csize, size)); + size_t commands_end = cpos + csize; + pos = commands_end; // pos in data stream + + // Header + PaddedBytes header = ICCInitialHeaderPrediction(); + EncodeUint32(0, osize, &header); + for (size_t i = 0; i <= kICCHeaderSize; i++) { + if (result->size() == osize) { + if (cpos != commands_end) return JXL_FAILURE("Not all commands used"); + if (pos != size) return JXL_FAILURE("Not all data used"); + return true; // Valid end + } + if (i == kICCHeaderSize) break; // Done + ICCPredictHeader(result->data(), result->size(), header.data(), i); + if (pos >= size) return JXL_FAILURE("Out of bounds"); + result->push_back(enc[pos++] + header[i]); + } + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + + // Tag list + uint64_t numtags = DecodeVarInt(enc, size, &cpos); + + if (numtags != 0) { + numtags--; + JXL_RETURN_IF_ERROR(CheckIs32Bit(numtags)); + AppendUint32(numtags, result); + uint64_t prevtagstart = kICCHeaderSize + numtags * 12; + uint64_t prevtagsize = 0; + for (;;) { + if (result->size() > osize) return JXL_FAILURE("Invalid result size"); + if (cpos > commands_end) return JXL_FAILURE("Out of bounds"); + if (cpos == commands_end) break; // Valid end + uint8_t command = enc[cpos++]; + uint8_t tagcode = command & 63; + Tag tag; + if (tagcode 
== 0) { + break; + } else if (tagcode == kCommandTagUnknown) { + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, 4, size)); + tag = DecodeKeyword(enc, size, pos); + pos += 4; + } else if (tagcode == kCommandTagTRC) { + tag = kRtrcTag; + } else if (tagcode == kCommandTagXYZ) { + tag = kRxyzTag; + } else { + if (tagcode - kCommandTagStringFirst >= kNumTagStrings) { + return JXL_FAILURE("Unknown tagcode"); + } + tag = *kTagStrings[tagcode - kCommandTagStringFirst]; + } + AppendKeyword(tag, result); + + uint64_t tagstart; + uint64_t tagsize = prevtagsize; + if (tag == kRxyzTag || tag == kGxyzTag || tag == kBxyzTag || + tag == kKxyzTag || tag == kWtptTag || tag == kBkptTag || + tag == kLumiTag) { + tagsize = 20; + } + + if (command & kFlagBitOffset) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + tagstart = DecodeVarInt(enc, size, &cpos); + } else { + JXL_RETURN_IF_ERROR(CheckIs32Bit(prevtagstart)); + tagstart = prevtagstart + prevtagsize; + } + JXL_RETURN_IF_ERROR(CheckIs32Bit(tagstart)); + AppendUint32(tagstart, result); + if (command & kFlagBitSize) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + tagsize = DecodeVarInt(enc, size, &cpos); + } + JXL_RETURN_IF_ERROR(CheckIs32Bit(tagsize)); + AppendUint32(tagsize, result); + prevtagstart = tagstart; + prevtagsize = tagsize; + + if (tagcode == kCommandTagTRC) { + AppendKeyword(kGtrcTag, result); + AppendUint32(tagstart, result); + AppendUint32(tagsize, result); + AppendKeyword(kBtrcTag, result); + AppendUint32(tagstart, result); + AppendUint32(tagsize, result); + } + + if (tagcode == kCommandTagXYZ) { + JXL_RETURN_IF_ERROR(CheckIs32Bit(tagstart + tagsize * 2)); + AppendKeyword(kGxyzTag, result); + AppendUint32(tagstart + tagsize, result); + AppendUint32(tagsize, result); + AppendKeyword(kBxyzTag, result); + AppendUint32(tagstart + tagsize * 2, result); + AppendUint32(tagsize, result); + } + } + } + + // Main Content + for (;;) { + if (result->size() > osize) return JXL_FAILURE("Invalid 
result size"); + if (cpos > commands_end) return JXL_FAILURE("Out of bounds"); + if (cpos == commands_end) break; // Valid end + uint8_t command = enc[cpos++]; + if (command == kCommandInsert) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + uint64_t num = DecodeVarInt(enc, size, &cpos); + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size)); + for (size_t i = 0; i < num; i++) { + result->push_back(enc[pos++]); + } + } else if (command == kCommandShuffle2 || command == kCommandShuffle4) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + uint64_t num = DecodeVarInt(enc, size, &cpos); + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size)); + PaddedBytes shuffled(num); + for (size_t i = 0; i < num; i++) { + shuffled[i] = enc[pos + i]; + } + if (command == kCommandShuffle2) { + Shuffle(shuffled.data(), num, 2); + } else if (command == kCommandShuffle4) { + Shuffle(shuffled.data(), num, 4); + } + for (size_t i = 0; i < num; i++) { + result->push_back(shuffled[i]); + pos++; + } + } else if (command == kCommandPredict) { + JXL_RETURN_IF_ERROR(CheckOutOfBounds(cpos, 2, commands_end)); + uint8_t flags = enc[cpos++]; + + size_t width = (flags & 3) + 1; + if (width == 3) return JXL_FAILURE("Invalid width"); + + int order = (flags & 12) >> 2; + if (order == 3) return JXL_FAILURE("Invalid order"); + + uint64_t stride = width; + if (flags & 16) { + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + stride = DecodeVarInt(enc, size, &cpos); + if (stride < width) { + return JXL_FAILURE("Invalid stride"); + } + } + // If stride * 4 >= result->size(), return failure. The check + // "size == 0 || ((size - 1) >> 2) < stride" corresponds to + // "stride * 4 >= size", but does not suffer from integer overflow. + // This check is more strict than necessary but follows the specification + // and the encoder should ensure this is followed. 
+ if (result->empty() || ((result->size() - 1u) >> 2u) < stride) { + return JXL_FAILURE("Invalid stride"); + } + + if (cpos >= commands_end) return JXL_FAILURE("Out of bounds"); + uint64_t num = DecodeVarInt(enc, size, &cpos); // in bytes + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size)); + + PaddedBytes shuffled(num); + for (size_t i = 0; i < num; i++) { + shuffled[i] = enc[pos + i]; + } + if (width > 1) Shuffle(shuffled.data(), num, width); + + size_t start = result->size(); + for (size_t i = 0; i < num; i++) { + uint8_t predicted = LinearPredictICCValue(result->data(), start, i, + stride, width, order); + result->push_back(predicted + shuffled[i]); + } + pos += num; + } else if (command == kCommandXYZ) { + AppendKeyword(kXyz_Tag, result); + for (int i = 0; i < 4; i++) result->push_back(0); + JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, 12, size)); + for (size_t i = 0; i < 12; i++) { + result->push_back(enc[pos++]); + } + } else if (command >= kCommandTypeStartFirst && + command < kCommandTypeStartFirst + kNumTypeStrings) { + AppendKeyword(*kTypeStrings[command - kCommandTypeStartFirst], result); + for (size_t i = 0; i < 4; i++) { + result->push_back(0); + } + } else { + return JXL_FAILURE("Unknown command"); + } + } + + if (pos != size) return JXL_FAILURE("Not all data used"); + if (result->size() != osize) return JXL_FAILURE("Invalid result size"); + + return true; +} + +Status ICCReader::Init(BitReader* reader, size_t output_limit) { + JXL_RETURN_IF_ERROR(CheckEOI(reader)); + used_bits_base_ = reader->TotalBitsConsumed(); + if (bits_to_skip_ == 0) { + enc_size_ = U64Coder::Read(reader); + if (enc_size_ > 268435456) { + // Avoid too large memory allocation for invalid file. 
+ return JXL_FAILURE("Too large encoded profile"); + } + JXL_RETURN_IF_ERROR( + DecodeHistograms(reader, kNumICCContexts, &code_, &context_map_)); + ans_reader_ = ANSSymbolReader(&code_, reader); + i_ = 0; + decompressed_.resize(std::min(i_ + 0x400, enc_size_)); + for (; i_ < std::min(2, enc_size_); i_++) { + decompressed_[i_] = ans_reader_.ReadHybridUint( + ICCANSContext(i_, i_ > 0 ? decompressed_[i_ - 1] : 0, + i_ > 1 ? decompressed_[i_ - 2] : 0), + reader, context_map_); + } + if (enc_size_ > kPreambleSize) { + for (; i_ < kPreambleSize; i_++) { + decompressed_[i_] = ans_reader_.ReadHybridUint( + ICCANSContext(i_, decompressed_[i_ - 1], decompressed_[i_ - 2]), + reader, context_map_); + } + JXL_RETURN_IF_ERROR(CheckEOI(reader)); + JXL_RETURN_IF_ERROR( + CheckPreamble(decompressed_, enc_size_, output_limit)); + } + bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_; + } else { + reader->SkipBits(bits_to_skip_); + } + return true; +} + +Status ICCReader::Process(BitReader* reader, PaddedBytes* icc) { + ANSSymbolReader::Checkpoint checkpoint; + size_t saved_i = 0; + auto save = [&]() { + ans_reader_.Save(&checkpoint); + bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_; + saved_i = i_; + }; + save(); + auto check_and_restore = [&]() { + Status status = CheckEOI(reader); + if (!status) { + // not enough bytes. 
+ ans_reader_.Restore(checkpoint); + i_ = saved_i; + return status; + } + return Status(true); + }; + for (; i_ < enc_size_; i_++) { + if (i_ % ANSSymbolReader::kMaxCheckpointInterval == 0 && i_ > 0) { + JXL_RETURN_IF_ERROR(check_and_restore()); + save(); + if ((i_ > 0) && (((i_ & 0xFFFF) == 0))) { + float used_bytes = + (reader->TotalBitsConsumed() - used_bits_base_) / 8.0f; + if (i_ > used_bytes * 256) return JXL_FAILURE("Corrupted stream"); + } + decompressed_.resize(std::min(i_ + 0x400, enc_size_)); + } + JXL_DASSERT(i_ >= 2); + decompressed_[i_] = ans_reader_.ReadHybridUint( + ICCANSContext(i_, decompressed_[i_ - 1], decompressed_[i_ - 2]), reader, + context_map_); + } + JXL_RETURN_IF_ERROR(check_and_restore()); + bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_; + if (!ans_reader_.CheckANSFinalState()) { + return JXL_FAILURE("Corrupted ICC profile"); + } + + icc->clear(); + return UnpredictICC(decompressed_.data(), decompressed_.size(), icc); +} + +Status ICCReader::CheckEOI(BitReader* reader) { + if (reader->AllReadsWithinBounds()) return true; + return JXL_STATUS(StatusCode::kNotEnoughBytes, + "Not enough bytes for reading ICC profile"); +} + +Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc, + size_t output_limit) { + ICCReader icc_reader; + JXL_RETURN_IF_ERROR(icc_reader.Init(reader, output_limit)); + JXL_RETURN_IF_ERROR(icc_reader.Process(reader, icc)); + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.h new file mode 100644 index 0000000000..d55b316957 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec.h @@ -0,0 +1,64 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_ICC_CODEC_H_ +#define LIB_JXL_ICC_CODEC_H_ + +// Compressed representation of ICC profiles. + +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" + +namespace jxl { + +// Should still be called if `icc.empty()` - if so, writes only 1 bit. +Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer, + size_t layer, AuxOut* JXL_RESTRICT aux_out); + +struct ICCReader { + Status Init(BitReader* reader, size_t output_limit); + Status Process(BitReader* reader, PaddedBytes* icc); + void Reset() { + bits_to_skip_ = 0; + decompressed_.clear(); + } + + private: + Status CheckEOI(BitReader* reader); + size_t i_ = 0; + size_t bits_to_skip_ = 0; + size_t used_bits_base_ = 0; + uint64_t enc_size_ = 0; + std::vector context_map_; + ANSCode code_; + ANSSymbolReader ans_reader_; + PaddedBytes decompressed_; +}; + +// `icc` may be empty afterwards - if so, call CreateProfile. Does not append, +// clears any original data that was in icc. 
+// If `output_limit` is not 0, then returns error if resulting profile would be +// longer than `output_limit` +Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc, + size_t output_limit = 0); + +// Exposed only for testing +Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result); + +// Exposed only for testing +Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result); + +} // namespace jxl + +#endif // LIB_JXL_ICC_CODEC_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc new file mode 100644 index 0000000000..1e118c5d5c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.cc @@ -0,0 +1,192 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/icc_codec_common.h" + +#include + +#include +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/common.h" +#include "lib/jxl/fields.h" + +namespace jxl { +namespace { +static uint8_t ByteKind1(uint8_t b) { + if ('a' <= b && b <= 'z') return 0; + if ('A' <= b && b <= 'Z') return 0; + if ('0' <= b && b <= '9') return 1; + if (b == '.' || b == ',') return 1; + if (b == 0) return 2; + if (b == 1) return 3; + if (b < 16) return 4; + if (b == 255) return 6; + if (b > 240) return 5; + return 7; +} + +static uint8_t ByteKind2(uint8_t b) { + if ('a' <= b && b <= 'z') return 0; + if ('A' <= b && b <= 'Z') return 0; + if ('0' <= b && b <= '9') return 1; + if (b == '.' 
|| b == ',') return 1; + if (b < 16) return 2; + if (b > 240) return 3; + return 4; +} + +template +T PredictValue(T p1, T p2, T p3, int order) { + if (order == 0) return p1; + if (order == 1) return 2 * p1 - p2; + if (order == 2) return 3 * p1 - 3 * p2 + p3; + return 0; +} +} // namespace + +uint32_t DecodeUint32(const uint8_t* data, size_t size, size_t pos) { + return pos + 4 > size ? 0 : LoadBE32(data + pos); +} + +void EncodeUint32(size_t pos, uint32_t value, PaddedBytes* data) { + if (pos + 4 > data->size()) return; + StoreBE32(value, data->data() + pos); +} + +void AppendUint32(uint32_t value, PaddedBytes* data) { + data->resize(data->size() + 4); + EncodeUint32(data->size() - 4, value, data); +} + +typedef std::array Tag; + +Tag DecodeKeyword(const uint8_t* data, size_t size, size_t pos) { + if (pos + 4 > size) return {' ', ' ', ' ', ' '}; + return {data[pos], data[pos + 1], data[pos + 2], data[pos + 3]}; +} + +void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos) { + if (keyword.size() != 4 || pos + 3 >= size) return; + for (size_t i = 0; i < 4; ++i) data[pos + i] = keyword[i]; +} + +void AppendKeyword(const Tag& keyword, PaddedBytes* data) { + JXL_ASSERT(keyword.size() == 4); + data->append(keyword); +} + +// Checks if a + b > size, taking possible integer overflow into account. 
+Status CheckOutOfBounds(size_t a, size_t b, size_t size) { + size_t pos = a + b; + if (pos > size) return JXL_FAILURE("Out of bounds"); + if (pos < a) return JXL_FAILURE("Out of bounds"); // overflow happened + return true; +} + +Status CheckIs32Bit(uint64_t v) { + static constexpr const uint64_t kUpper32 = ~static_cast(0xFFFFFFFF); + if ((v & kUpper32) != 0) return JXL_FAILURE("32-bit value expected"); + return true; +} + +PaddedBytes ICCInitialHeaderPrediction() { + PaddedBytes result(kICCHeaderSize); + for (size_t i = 0; i < kICCHeaderSize; i++) { + result[i] = 0; + } + result[8] = 4; + EncodeKeyword(kMntrTag, result.data(), result.size(), 12); + EncodeKeyword(kRgb_Tag, result.data(), result.size(), 16); + EncodeKeyword(kXyz_Tag, result.data(), result.size(), 20); + EncodeKeyword(kAcspTag, result.data(), result.size(), 36); + result[68] = 0; + result[69] = 0; + result[70] = 246; + result[71] = 214; + result[72] = 0; + result[73] = 1; + result[74] = 0; + result[75] = 0; + result[76] = 0; + result[77] = 0; + result[78] = 211; + result[79] = 45; + return result; +} + +void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header, + size_t pos) { + if (pos == 8 && size >= 8) { + header[80] = icc[4]; + header[81] = icc[5]; + header[82] = icc[6]; + header[83] = icc[7]; + } + if (pos == 41 && size >= 41) { + if (icc[40] == 'A') { + header[41] = 'P'; + header[42] = 'P'; + header[43] = 'L'; + } + if (icc[40] == 'M') { + header[41] = 'S'; + header[42] = 'F'; + header[43] = 'T'; + } + } + if (pos == 42 && size >= 42) { + if (icc[40] == 'S' && icc[41] == 'G') { + header[42] = 'I'; + header[43] = ' '; + } + if (icc[40] == 'S' && icc[41] == 'U') { + header[42] = 'N'; + header[43] = 'W'; + } + } +} + +// Predicts a value with linear prediction of given order (0-2), for integers +// with width bytes and given stride in bytes between values. 
+// The start position is at start + i, and the relevant modulus of i describes +// which byte of the multi-byte integer is being handled. +// The value start + i must be at least stride * 4. +uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i, + size_t stride, size_t width, int order) { + size_t pos = start + i; + if (width == 1) { + uint8_t p1 = data[pos - stride]; + uint8_t p2 = data[pos - stride * 2]; + uint8_t p3 = data[pos - stride * 3]; + return PredictValue(p1, p2, p3, order); + } else if (width == 2) { + size_t p = start + (i & ~1); + uint16_t p1 = (data[p - stride * 1] << 8) + data[p - stride * 1 + 1]; + uint16_t p2 = (data[p - stride * 2] << 8) + data[p - stride * 2 + 1]; + uint16_t p3 = (data[p - stride * 3] << 8) + data[p - stride * 3 + 1]; + uint16_t pred = PredictValue(p1, p2, p3, order); + return (i & 1) ? (pred & 255) : ((pred >> 8) & 255); + } else { + size_t p = start + (i & ~3); + uint32_t p1 = DecodeUint32(data, pos, p - stride); + uint32_t p2 = DecodeUint32(data, pos, p - stride * 2); + uint32_t p3 = DecodeUint32(data, pos, p - stride * 3); + uint32_t pred = PredictValue(p1, p2, p3, order); + unsigned shiftbytes = 3 - (i & 3); + return (pred >> (shiftbytes * 8)) & 255; + } +} + +size_t ICCANSContext(size_t i, size_t b1, size_t b2) { + if (i <= 128) return 0; + return 1 + ByteKind1(b1) + ByteKind2(b2) * 8; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.h new file mode 100644 index 0000000000..8ccc7e9091 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_common.h @@ -0,0 +1,106 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_ICC_CODEC_COMMON_H_ +#define LIB_JXL_ICC_CODEC_COMMON_H_ + +// Compressed representation of ICC profiles. 
+ +#include +#include + +#include + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +static constexpr size_t kICCHeaderSize = 128; + +typedef std::array Tag; + +static const Tag kAcspTag = {'a', 'c', 's', 'p'}; +static const Tag kBkptTag = {'b', 'k', 'p', 't'}; +static const Tag kBtrcTag = {'b', 'T', 'R', 'C'}; +static const Tag kBxyzTag = {'b', 'X', 'Y', 'Z'}; +static const Tag kChadTag = {'c', 'h', 'a', 'd'}; +static const Tag kChrmTag = {'c', 'h', 'r', 'm'}; +static const Tag kCprtTag = {'c', 'p', 'r', 't'}; +static const Tag kCurvTag = {'c', 'u', 'r', 'v'}; +static const Tag kDescTag = {'d', 'e', 's', 'c'}; +static const Tag kDmddTag = {'d', 'm', 'd', 'd'}; +static const Tag kDmndTag = {'d', 'm', 'n', 'd'}; +static const Tag kGbd_Tag = {'g', 'b', 'd', ' '}; +static const Tag kGtrcTag = {'g', 'T', 'R', 'C'}; +static const Tag kGxyzTag = {'g', 'X', 'Y', 'Z'}; +static const Tag kKtrcTag = {'k', 'T', 'R', 'C'}; +static const Tag kKxyzTag = {'k', 'X', 'Y', 'Z'}; +static const Tag kLumiTag = {'l', 'u', 'm', 'i'}; +static const Tag kMab_Tag = {'m', 'A', 'B', ' '}; +static const Tag kMba_Tag = {'m', 'B', 'A', ' '}; +static const Tag kMlucTag = {'m', 'l', 'u', 'c'}; +static const Tag kMntrTag = {'m', 'n', 't', 'r'}; +static const Tag kParaTag = {'p', 'a', 'r', 'a'}; +static const Tag kRgb_Tag = {'R', 'G', 'B', ' '}; +static const Tag kRtrcTag = {'r', 'T', 'R', 'C'}; +static const Tag kRxyzTag = {'r', 'X', 'Y', 'Z'}; +static const Tag kSf32Tag = {'s', 'f', '3', '2'}; +static const Tag kTextTag = {'t', 'e', 'x', 't'}; +static const Tag kVcgtTag = {'v', 'c', 'g', 't'}; +static const Tag kWtptTag = {'w', 't', 'p', 't'}; +static const Tag kXyz_Tag = {'X', 'Y', 'Z', ' '}; + +// Tag names focused on RGB and GRAY monitor profiles +static constexpr size_t kNumTagStrings = 17; +static constexpr const Tag* kTagStrings[kNumTagStrings] = { + &kCprtTag, &kWtptTag, &kBkptTag, &kRxyzTag, &kGxyzTag, &kBxyzTag, + &kKxyzTag, &kRtrcTag, 
&kGtrcTag, &kBtrcTag, &kKtrcTag, &kChadTag, + &kDescTag, &kChrmTag, &kDmndTag, &kDmddTag, &kLumiTag}; + +static constexpr size_t kCommandTagUnknown = 1; +static constexpr size_t kCommandTagTRC = 2; +static constexpr size_t kCommandTagXYZ = 3; +static constexpr size_t kCommandTagStringFirst = 4; + +// Tag types focused on RGB and GRAY monitor profiles +static constexpr size_t kNumTypeStrings = 8; +static constexpr const Tag* kTypeStrings[kNumTypeStrings] = { + &kXyz_Tag, &kDescTag, &kTextTag, &kMlucTag, + &kParaTag, &kCurvTag, &kSf32Tag, &kGbd_Tag}; + +static constexpr size_t kCommandInsert = 1; +static constexpr size_t kCommandShuffle2 = 2; +static constexpr size_t kCommandShuffle4 = 3; +static constexpr size_t kCommandPredict = 4; +static constexpr size_t kCommandXYZ = 10; +static constexpr size_t kCommandTypeStartFirst = 16; + +static constexpr size_t kFlagBitOffset = 64; +static constexpr size_t kFlagBitSize = 128; + +static constexpr size_t kNumICCContexts = 41; + +uint32_t DecodeUint32(const uint8_t* data, size_t size, size_t pos); +void EncodeUint32(size_t pos, uint32_t value, PaddedBytes* data); +void AppendUint32(uint32_t value, PaddedBytes* data); +Tag DecodeKeyword(const uint8_t* data, size_t size, size_t pos); +void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos); +void AppendKeyword(const Tag& keyword, PaddedBytes* data); + +// Checks if a + b > size, taking possible integer overflow into account. 
+Status CheckOutOfBounds(size_t a, size_t b, size_t size); +Status CheckIs32Bit(uint64_t v); + +PaddedBytes ICCInitialHeaderPrediction(); +void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header, + size_t pos); +uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i, + size_t stride, size_t width, int order); +size_t ICCANSContext(size_t i, size_t b1, size_t b2); + +} // namespace jxl + +#endif // LIB_JXL_ICC_CODEC_COMMON_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_test.cc new file mode 100644 index 0000000000..d365471afa --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/icc_codec_test.cc @@ -0,0 +1,207 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/icc_codec.h" + +#include + +#include "gtest/gtest.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/enc_icc_codec.h" + +namespace jxl { +namespace { + +void TestProfile(const PaddedBytes& icc) { + BitWriter writer; + ASSERT_TRUE(WriteICC(icc, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + PaddedBytes dec; + BitReader reader(writer.GetSpan()); + ASSERT_TRUE(ReadICC(&reader, &dec)); + ASSERT_TRUE(reader.Close()); + EXPECT_EQ(icc.size(), dec.size()); + if (icc.size() == dec.size()) { + for (size_t i = 0; i < icc.size(); i++) { + EXPECT_EQ(icc[i], dec[i]); + if (icc[i] != dec[i]) break; // One output is enough + } + } +} + +void TestProfile(const std::string& icc) { + PaddedBytes bytes(icc.size()); + for (size_t i = 0; i < icc.size(); i++) { + bytes[i] = icc[i]; + } + TestProfile(bytes); +} + +// Valid profile from one of the images output by the decoder. 
+static const unsigned char kTestProfile[] = { + 0x00, 0x00, 0x03, 0x80, 0x6c, 0x63, 0x6d, 0x73, 0x04, 0x30, 0x00, 0x00, + 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, + 0x07, 0xe3, 0x00, 0x04, 0x00, 0x1d, 0x00, 0x0f, 0x00, 0x32, 0x00, 0x2e, + 0x61, 0x63, 0x73, 0x70, 0x41, 0x50, 0x50, 0x4c, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x6c, 0x63, 0x6d, 0x73, + 0x5f, 0x07, 0x0d, 0x3e, 0x4d, 0x32, 0xf2, 0x6e, 0x5d, 0x77, 0x26, 0xcc, + 0x23, 0xb0, 0x6a, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x01, 0x20, 0x00, 0x00, 0x00, 0x42, + 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x64, 0x00, 0x00, 0x01, 0x00, + 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x02, 0x64, 0x00, 0x00, 0x00, 0x14, + 0x63, 0x68, 0x61, 0x64, 0x00, 0x00, 0x02, 0x78, 0x00, 0x00, 0x00, 0x2c, + 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xa4, 0x00, 0x00, 0x00, 0x14, + 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xb8, 0x00, 0x00, 0x00, 0x14, + 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xcc, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20, + 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20, + 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20, + 0x63, 0x68, 0x72, 0x6d, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x24, + 0x64, 0x6d, 0x6e, 0x64, 0x00, 0x00, 0x03, 0x24, 0x00, 0x00, 0x00, 0x28, + 0x64, 0x6d, 0x64, 0x64, 0x00, 0x00, 0x03, 0x4c, 0x00, 0x00, 0x00, 0x32, + 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x26, + 0x00, 0x00, 0x00, 0x1c, 
0x00, 0x52, 0x00, 0x47, 0x00, 0x42, 0x00, 0x5f, + 0x00, 0x44, 0x00, 0x36, 0x00, 0x35, 0x00, 0x5f, 0x00, 0x53, 0x00, 0x52, + 0x00, 0x47, 0x00, 0x5f, 0x00, 0x52, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x5f, + 0x00, 0x37, 0x00, 0x30, 0x00, 0x39, 0x00, 0x00, 0x6d, 0x6c, 0x75, 0x63, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, + 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x1c, + 0x00, 0x43, 0x00, 0x6f, 0x00, 0x70, 0x00, 0x79, 0x00, 0x72, 0x00, 0x69, + 0x00, 0x67, 0x00, 0x68, 0x00, 0x74, 0x00, 0x20, 0x00, 0x32, 0x00, 0x30, + 0x00, 0x31, 0x00, 0x38, 0x00, 0x20, 0x00, 0x47, 0x00, 0x6f, 0x00, 0x6f, + 0x00, 0x67, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x20, 0x00, 0x4c, 0x00, 0x4c, + 0x00, 0x43, 0x00, 0x2c, 0x00, 0x20, 0x00, 0x43, 0x00, 0x43, 0x00, 0x2d, + 0x00, 0x42, 0x00, 0x59, 0x00, 0x2d, 0x00, 0x53, 0x00, 0x41, 0x00, 0x20, + 0x00, 0x33, 0x00, 0x2e, 0x00, 0x30, 0x00, 0x20, 0x00, 0x55, 0x00, 0x6e, + 0x00, 0x70, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x74, 0x00, 0x65, 0x00, 0x64, + 0x00, 0x20, 0x00, 0x6c, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x6e, + 0x00, 0x73, 0x00, 0x65, 0x00, 0x28, 0x00, 0x68, 0x00, 0x74, 0x00, 0x74, + 0x00, 0x70, 0x00, 0x73, 0x00, 0x3a, 0x00, 0x2f, 0x00, 0x2f, 0x00, 0x63, + 0x00, 0x72, 0x00, 0x65, 0x00, 0x61, 0x00, 0x74, 0x00, 0x69, 0x00, 0x76, + 0x00, 0x65, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x6d, 0x00, 0x6d, 0x00, 0x6f, + 0x00, 0x6e, 0x00, 0x73, 0x00, 0x2e, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x67, + 0x00, 0x2f, 0x00, 0x6c, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x6e, + 0x00, 0x73, 0x00, 0x65, 0x00, 0x73, 0x00, 0x2f, 0x00, 0x62, 0x00, 0x79, + 0x00, 0x2d, 0x00, 0x73, 0x00, 0x61, 0x00, 0x2f, 0x00, 0x33, 0x00, 0x2e, + 0x00, 0x30, 0x00, 0x2f, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x67, 0x00, 0x61, + 0x00, 0x6c, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x64, 0x00, 0x65, 0x00, 0x29, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x73, 0x66, 0x33, 0x32, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x01, 0x0c, 0x42, 0x00, 0x00, 0x05, 0xde, + 0xff, 0xff, 0xf3, 0x25, 0x00, 0x00, 0x07, 0x93, 0x00, 0x00, 0xfd, 0x90, + 0xff, 0xff, 0xfb, 0xa1, 0xff, 0xff, 0xfd, 0xa2, 0x00, 0x00, 0x03, 0xdc, + 0x00, 0x00, 0xc0, 0x6e, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x6f, 0xa0, 0x00, 0x00, 0x38, 0xf5, 0x00, 0x00, 0x03, 0x90, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, 0x9f, + 0x00, 0x00, 0x0f, 0x84, 0x00, 0x00, 0xb6, 0xc4, 0x58, 0x59, 0x5a, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x97, 0x00, 0x00, 0xb7, 0x87, + 0x00, 0x00, 0x18, 0xd9, 0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x38, 0xe4, 0x00, 0x00, 0xe8, 0xf0, + 0x00, 0x00, 0x17, 0x10, 0x00, 0x00, 0x38, 0xe4, 0x00, 0x00, 0x14, 0xbc, + 0x63, 0x68, 0x72, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, + 0x00, 0x00, 0xa3, 0xd7, 0x00, 0x00, 0x54, 0x7c, 0x00, 0x00, 0x4c, 0xcd, + 0x00, 0x00, 0x99, 0x9a, 0x00, 0x00, 0x26, 0x67, 0x00, 0x00, 0x0f, 0x5c, + 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x0c, + 0x00, 0x00, 0x00, 0x1c, 0x00, 0x47, 0x00, 0x6f, 0x00, 0x6f, 0x00, 0x67, + 0x00, 0x6c, 0x00, 0x65, 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, + 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x49, 0x00, 0x6d, + 0x00, 0x61, 0x00, 0x67, 0x00, 0x65, 0x00, 0x20, 0x00, 0x63, 0x00, 0x6f, + 0x00, 0x64, 0x00, 0x65, 0x00, 0x63, 0x00, 0x00, +}; + +} // namespace + +TEST(IccCodecTest, Icc) { + // Empty string cannot be tested, encoder checks against writing it. 
+ TestProfile("a"); + TestProfile("ab"); + TestProfile("aaaa"); + + { + // Exactly the ICC header size + PaddedBytes profile(128); + for (size_t i = 0; i < 128; i++) { + profile[i] = 0; + } + TestProfile(profile); + } + + { + PaddedBytes profile; + profile.append(kTestProfile, kTestProfile + sizeof(kTestProfile)); + TestProfile(profile); + } + + // Test substrings of full profile + { + PaddedBytes profile; + for (size_t i = 0; i <= 256; i++) { + profile.push_back(kTestProfile[i]); + TestProfile(profile); + } + } +} + +// kTestProfile after encoding with the ICC codec +static const unsigned char kEncodedTestProfile[] = { + 0x1f, 0x8b, 0x1, 0x13, 0x10, 0x0, 0x0, 0x0, 0x20, 0x4c, 0xcc, 0x3, + 0xe7, 0xa0, 0xa5, 0xa2, 0x90, 0xa4, 0x27, 0xe8, 0x79, 0x1d, 0xe3, 0x26, + 0x57, 0x54, 0xef, 0x0, 0xe8, 0x97, 0x2, 0xce, 0xa1, 0xd7, 0x85, 0x16, + 0xb4, 0x29, 0x94, 0x58, 0xf2, 0x56, 0xc0, 0x76, 0xea, 0x23, 0xec, 0x7c, + 0x73, 0x51, 0x41, 0x40, 0x23, 0x21, 0x95, 0x4, 0x75, 0x12, 0xc9, 0xcc, + 0x16, 0xbd, 0xb6, 0x99, 0xad, 0xf8, 0x75, 0x35, 0xb6, 0x42, 0xae, 0xae, + 0xae, 0x86, 0x56, 0xf8, 0xcc, 0x16, 0x30, 0xb3, 0x45, 0xad, 0xd, 0x40, + 0xd6, 0xd1, 0xd6, 0x99, 0x40, 0xbe, 0xe2, 0xdc, 0x31, 0x7, 0xa6, 0xb9, + 0x27, 0x92, 0x38, 0x0, 0x3, 0x5e, 0x2c, 0xbe, 0xe6, 0xfb, 0x19, 0xbf, + 0xf3, 0x6d, 0xbc, 0x4d, 0x64, 0xe5, 0xba, 0x76, 0xde, 0x31, 0x65, 0x66, + 0x14, 0xa6, 0x3a, 0xc5, 0x8f, 0xb1, 0xb4, 0xba, 0x1f, 0xb1, 0xb8, 0xd4, + 0x75, 0xba, 0x18, 0x86, 0x95, 0x3c, 0x26, 0xf6, 0x25, 0x62, 0x53, 0xfd, + 0x9c, 0x94, 0x76, 0xf6, 0x95, 0x2c, 0xb1, 0xfd, 0xdc, 0xc0, 0xe4, 0x3f, + 0xb3, 0xff, 0x67, 0xde, 0xd5, 0x94, 0xcc, 0xb0, 0x83, 0x2f, 0x28, 0x93, + 0x92, 0x3, 0xa1, 0x41, 0x64, 0x60, 0x62, 0x70, 0x80, 0x87, 0xaf, 0xe7, + 0x60, 0x4a, 0x20, 0x23, 0xb3, 0x11, 0x7, 0x38, 0x38, 0xd4, 0xa, 0x66, + 0xb5, 0x93, 0x41, 0x90, 0x19, 0x17, 0x18, 0x60, 0xa5, 0xb, 0x7a, 0x24, + 0xaa, 0x20, 0x81, 0xac, 0xa9, 0xa1, 0x70, 0xa6, 0x12, 0x8a, 0x4a, 0xa3, + 0xa0, 0xf9, 0x9a, 0x97, 0xe7, 0xa8, 0xac, 0x8, 0xa8, 
0xc4, 0x2a, 0x86, + 0xa7, 0x69, 0x1e, 0x67, 0xe6, 0xbe, 0xa4, 0xd3, 0xff, 0x91, 0x61, 0xf6, + 0x8a, 0xe6, 0xb5, 0xb3, 0x61, 0x9f, 0x19, 0x17, 0x98, 0x27, 0x6b, 0xe9, + 0x8, 0x98, 0xe1, 0x21, 0x4a, 0x9, 0xb5, 0xd7, 0xca, 0xfa, 0x94, 0xd0, + 0x69, 0x1a, 0xeb, 0x52, 0x1, 0x4e, 0xf5, 0xf6, 0xdf, 0x7f, 0xe7, 0x29, + 0x70, 0xee, 0x4, 0xda, 0x2f, 0xa4, 0xff, 0xfe, 0xbb, 0x6f, 0xa8, 0xff, + 0xfe, 0xdb, 0xaf, 0x8, 0xf6, 0x72, 0xa1, 0x40, 0x5d, 0xf0, 0x2d, 0x8, + 0x82, 0x5b, 0x87, 0xbd, 0x10, 0x8, 0xe9, 0x7, 0xee, 0x4b, 0x80, 0xda, + 0x4a, 0x4, 0xc5, 0x5e, 0xa0, 0xb7, 0x1e, 0x60, 0xb0, 0x59, 0x76, 0x60, + 0xb, 0x2e, 0x19, 0x8a, 0x2e, 0x1c, 0xe6, 0x6, 0x20, 0xb8, 0x64, 0x18, + 0x2a, 0xcf, 0x51, 0x94, 0xd4, 0xee, 0xc3, 0xfe, 0x39, 0x74, 0xd4, 0x2b, + 0x48, 0xc9, 0x83, 0x4c, 0x9b, 0xd0, 0x4c, 0x35, 0x10, 0xe3, 0x9, 0xf7, + 0x72, 0xf0, 0x7a, 0xe, 0xbf, 0x7d, 0x36, 0x2e, 0x19, 0x7e, 0x3f, 0xc, + 0xf7, 0x93, 0xe7, 0xf4, 0x1d, 0x32, 0xc6, 0xb0, 0x89, 0xad, 0xe0, 0x28, + 0xc1, 0xa7, 0x59, 0xe3, 0x0, +}; + +// Tests that the decoded kEncodedTestProfile matches kTestProfile. +TEST(IccCodecTest, EncodedIccProfile) { + jxl::BitReader reader(jxl::Span(kEncodedTestProfile, + sizeof(kEncodedTestProfile))); + jxl::PaddedBytes dec; + ASSERT_TRUE(ReadICC(&reader, &dec)); + ASSERT_TRUE(reader.Close()); + EXPECT_EQ(sizeof(kTestProfile), dec.size()); + if (sizeof(kTestProfile) == dec.size()) { + for (size_t i = 0; i < dec.size(); i++) { + EXPECT_EQ(kTestProfile[i], dec[i]); + if (kTestProfile[i] != dec[i]) break; // One output is enough + } + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc new file mode 100644 index 0000000000..0d63d797e1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image.cc @@ -0,0 +1,313 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/image.h" + +#include // swap + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/image.cc" +#include +#include + +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/sanitizers.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { + +namespace HWY_NAMESPACE { +size_t GetVectorSize() { return HWY_LANES(uint8_t); } +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE + +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace { + +HWY_EXPORT(GetVectorSize); // Local function. + +size_t VectorSize() { + static size_t bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)(); + return bytes; +} + +// Returns distance [bytes] between the start of two consecutive rows, a +// multiple of vector/cache line size but NOT CacheAligned::kAlias - see below. +size_t BytesPerRow(const size_t xsize, const size_t sizeof_t) { + const size_t vec_size = VectorSize(); + size_t valid_bytes = xsize * sizeof_t; + + // Allow unaligned accesses starting at the last valid value - this may raise + // msan errors unless the user calls InitializePaddingForUnalignedAccesses. + // Skip for the scalar case because no extra lanes will be loaded. + if (vec_size != 0) { + valid_bytes += vec_size - sizeof_t; + } + + // Round up to vector and cache line size. + const size_t align = std::max(vec_size, CacheAligned::kAlignment); + size_t bytes_per_row = RoundUpTo(valid_bytes, align); + + // During the lengthy window before writes are committed to memory, CPUs + // guard against read after write hazards by checking the address, but + // only the lower 11 bits. We avoid a false dependency between writes to + // consecutive rows by ensuring their sizes are not multiples of 2 KiB. + // Avoid2K prevents the same problem for the planes of an Image3. 
+ if (bytes_per_row % CacheAligned::kAlias == 0) { + bytes_per_row += align; + } + + JXL_ASSERT(bytes_per_row % align == 0); + return bytes_per_row; +} + +} // namespace + +PlaneBase::PlaneBase(const size_t xsize, const size_t ysize, + const size_t sizeof_t) + : xsize_(static_cast(xsize)), + ysize_(static_cast(ysize)), + orig_xsize_(static_cast(xsize)), + orig_ysize_(static_cast(ysize)) { + // (Can't profile CacheAligned itself because it is used by profiler.h) + PROFILER_FUNC; + + JXL_CHECK(xsize == xsize_); + JXL_CHECK(ysize == ysize_); + + JXL_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8); + + bytes_per_row_ = 0; + // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate + // if nonzero, because "zero" bytes still have padding/bookkeeping overhead. + if (xsize != 0 && ysize != 0) { + bytes_per_row_ = BytesPerRow(xsize, sizeof_t); + bytes_ = AllocateArray(bytes_per_row_ * ysize); + JXL_CHECK(bytes_.get()); + InitializePadding(sizeof_t, Padding::kRoundUp); + } +} + +void PlaneBase::InitializePadding(const size_t sizeof_t, Padding padding) { +#if defined(MEMORY_SANITIZER) || HWY_IDE + if (xsize_ == 0 || ysize_ == 0) return; + + const size_t vec_size = VectorSize(); + if (vec_size == 0) return; // Scalar mode: no padding needed + + const size_t valid_size = xsize_ * sizeof_t; + const size_t initialize_size = padding == Padding::kRoundUp + ? RoundUpTo(valid_size, vec_size) + : valid_size + vec_size - sizeof_t; + if (valid_size == initialize_size) return; + + for (size_t y = 0; y < ysize_; ++y) { + uint8_t* JXL_RESTRICT row = static_cast(VoidRow(y)); +#if defined(__clang__) && (__clang_major__ <= 6) + // There's a bug in msan in clang-6 when handling AVX2 operations. This + // workaround allows tests to pass on msan, although it is slower and + // prevents msan warnings from uninitialized images. 
+ std::fill(row, msan::kSanitizerSentinelByte, initialize_size); +#else + memset(row + valid_size, msan::kSanitizerSentinelByte, + initialize_size - valid_size); +#endif // clang6 + } +#endif // MEMORY_SANITIZER +} + +void PlaneBase::Swap(PlaneBase& other) { + std::swap(xsize_, other.xsize_); + std::swap(ysize_, other.ysize_); + std::swap(orig_xsize_, other.orig_xsize_); + std::swap(orig_ysize_, other.orig_ysize_); + std::swap(bytes_per_row_, other.bytes_per_row_); + std::swap(bytes_, other.bytes_); +} + +ImageB ImageFromPacked(const uint8_t* packed, const size_t xsize, + const size_t ysize, const size_t bytes_per_row) { + JXL_ASSERT(bytes_per_row >= xsize); + ImageB image(xsize, ysize); + PROFILER_FUNC; + for (size_t y = 0; y < ysize; ++y) { + uint8_t* const JXL_RESTRICT row = image.Row(y); + const uint8_t* const JXL_RESTRICT packed_row = packed + y * bytes_per_row; + memcpy(row, packed_row, xsize); + } + return image; +} + +// Note that using mirroring here gives slightly worse results. +ImageF PadImage(const ImageF& in, const size_t xsize, const size_t ysize) { + JXL_ASSERT(xsize >= in.xsize()); + JXL_ASSERT(ysize >= in.ysize()); + ImageF out(xsize, ysize); + size_t y = 0; + for (; y < in.ysize(); ++y) { + const float* JXL_RESTRICT row_in = in.ConstRow(y); + float* JXL_RESTRICT row_out = out.Row(y); + memcpy(row_out, row_in, in.xsize() * sizeof(row_in[0])); + const int lastcol = in.xsize() - 1; + const float lastval = row_out[lastcol]; + for (size_t x = in.xsize(); x < xsize; ++x) { + row_out[x] = lastval; + } + } + + // TODO(janwas): no need to copy if we can 'extend' image: if rows are + // pointers to any memory? Or allocate larger image before IO? 
+ const int lastrow = in.ysize() - 1; + for (; y < ysize; ++y) { + const float* JXL_RESTRICT row_in = out.ConstRow(lastrow); + float* JXL_RESTRICT row_out = out.Row(y); + memcpy(row_out, row_in, xsize * sizeof(row_out[0])); + } + return out; +} + +Image3F PadImageMirror(const Image3F& in, const size_t xborder, + const size_t yborder) { + size_t xsize = in.xsize(); + size_t ysize = in.ysize(); + Image3F out(xsize + 2 * xborder, ysize + 2 * yborder); + if (xborder > xsize || yborder > ysize) { + for (size_t c = 0; c < 3; c++) { + for (int32_t y = 0; y < static_cast(out.ysize()); y++) { + float* row_out = out.PlaneRow(c, y); + const float* row_in = in.PlaneRow( + c, Mirror(y - static_cast(yborder), in.ysize())); + for (int32_t x = 0; x < static_cast(out.xsize()); x++) { + int32_t xin = Mirror(x - static_cast(xborder), in.xsize()); + row_out[x] = row_in[xin]; + } + } + } + return out; + } + CopyImageTo(in, Rect(xborder, yborder, xsize, ysize), &out); + for (size_t c = 0; c < 3; c++) { + // Horizontal pad. + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xborder; x++) { + out.PlaneRow(c, y + yborder)[x] = + in.ConstPlaneRow(c, y)[xborder - x - 1]; + out.PlaneRow(c, y + yborder)[x + xsize + xborder] = + in.ConstPlaneRow(c, y)[xsize - 1 - x]; + } + } + // Vertical pad. 
+ for (size_t y = 0; y < yborder; y++) { + memcpy(out.PlaneRow(c, y), out.ConstPlaneRow(c, 2 * yborder - 1 - y), + out.xsize() * sizeof(float)); + memcpy(out.PlaneRow(c, y + ysize + yborder), + out.ConstPlaneRow(c, ysize + yborder - 1 - y), + out.xsize() * sizeof(float)); + } + } + return out; +} + +Image3F PadImageToMultiple(const Image3F& in, const size_t N) { + PROFILER_FUNC; + const size_t xsize_blocks = DivCeil(in.xsize(), N); + const size_t ysize_blocks = DivCeil(in.ysize(), N); + const size_t xsize = N * xsize_blocks; + const size_t ysize = N * ysize_blocks; + ImageF out[3]; + for (size_t c = 0; c < 3; ++c) { + out[c] = PadImage(in.Plane(c), xsize, ysize); + } + return Image3F(std::move(out[0]), std::move(out[1]), std::move(out[2])); +} + +void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in) { + PROFILER_FUNC; + const size_t xsize_orig = in->xsize(); + const size_t ysize_orig = in->ysize(); + const size_t xsize = RoundUpToBlockDim(xsize_orig); + const size_t ysize = RoundUpToBlockDim(ysize_orig); + // Expands image size to the originally-allocated size. 
+ in->ShrinkTo(xsize, ysize); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < ysize_orig; y++) { + float* JXL_RESTRICT row = in->PlaneRow(c, y); + for (size_t x = xsize_orig; x < xsize; x++) { + row[x] = row[xsize_orig - 1]; + } + } + const float* JXL_RESTRICT row_src = in->ConstPlaneRow(c, ysize_orig - 1); + for (size_t y = ysize_orig; y < ysize; y++) { + memcpy(in->PlaneRow(c, y), row_src, xsize * sizeof(float)); + } + } +} + +float DotProduct(const ImageF& a, const ImageF& b) { + double sum = 0.0; + for (size_t y = 0; y < a.ysize(); ++y) { + const float* const JXL_RESTRICT row_a = a.ConstRow(y); + const float* const JXL_RESTRICT row_b = b.ConstRow(y); + for (size_t x = 0; x < a.xsize(); ++x) { + sum += row_a[x] * row_b[x]; + } + } + return sum; +} + +static void DownsampleImage(const ImageF& input, size_t factor, + ImageF* output) { + JXL_ASSERT(factor != 1); + output->ShrinkTo(DivCeil(input.xsize(), factor), + DivCeil(input.ysize(), factor)); + size_t in_stride = input.PixelsPerRow(); + for (size_t y = 0; y < output->ysize(); y++) { + float* row_out = output->Row(y); + const float* row_in = input.Row(factor * y); + for (size_t x = 0; x < output->xsize(); x++) { + size_t cnt = 0; + float sum = 0; + for (size_t iy = 0; iy < factor && iy + factor * y < input.ysize(); + iy++) { + for (size_t ix = 0; ix < factor && ix + factor * x < input.xsize(); + ix++) { + sum += row_in[iy * in_stride + x * factor + ix]; + cnt++; + } + } + row_out[x] = sum / cnt; + } + } +} + +void DownsampleImage(ImageF* image, size_t factor) { + // Allocate extra space to avoid a reallocation when padding. + ImageF downsampled(DivCeil(image->xsize(), factor) + kBlockDim, + DivCeil(image->ysize(), factor) + kBlockDim); + DownsampleImage(*image, factor, &downsampled); + *image = std::move(downsampled); +} + +void DownsampleImage(Image3F* opsin, size_t factor) { + JXL_ASSERT(factor != 1); + // Allocate extra space to avoid a reallocation when padding. 
+ Image3F downsampled(DivCeil(opsin->xsize(), factor) + kBlockDim, + DivCeil(opsin->ysize(), factor) + kBlockDim); + downsampled.ShrinkTo(downsampled.xsize() - kBlockDim, + downsampled.ysize() - kBlockDim); + for (size_t c = 0; c < 3; c++) { + DownsampleImage(opsin->Plane(c), factor, &downsampled.Plane(c)); + } + *opsin = std::move(downsampled); +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image.h new file mode 100644 index 0000000000..9240e01593 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image.h @@ -0,0 +1,437 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_IMAGE_H_ +#define LIB_JXL_IMAGE_H_ + +// SIMD/multicore-friendly planar image representation with row accessors. + +#include +#include +#include + +#include +#include // std::move + +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Type-independent parts of Plane<> - reduces code duplication and facilitates +// moving member function implementations to cc file. +struct PlaneBase { + PlaneBase() + : xsize_(0), + ysize_(0), + orig_xsize_(0), + orig_ysize_(0), + bytes_per_row_(0), + bytes_(nullptr) {} + PlaneBase(size_t xsize, size_t ysize, size_t sizeof_t); + + // Copy construction/assignment is forbidden to avoid inadvertent copies, + // which can be very expensive. Use CopyImageTo() instead. 
+ PlaneBase(const PlaneBase& other) = delete; + PlaneBase& operator=(const PlaneBase& other) = delete; + + // Move constructor (required for returning Image from function) + PlaneBase(PlaneBase&& other) noexcept = default; + + // Move assignment (required for std::vector) + PlaneBase& operator=(PlaneBase&& other) noexcept = default; + + void Swap(PlaneBase& other); + + // Useful for pre-allocating image with some padding for alignment purposes + // and later reporting the actual valid dimensions. May also be used to + // un-shrink the image. Caller is responsible for ensuring xsize/ysize are <= + // the original dimensions. + void ShrinkTo(const size_t xsize, const size_t ysize) { + JXL_CHECK(xsize <= orig_xsize_); + JXL_CHECK(ysize <= orig_ysize_); + xsize_ = static_cast(xsize); + ysize_ = static_cast(ysize); + // NOTE: we can't recompute bytes_per_row for more compact storage and + // better locality because that would invalidate the image contents. + } + + // How many pixels. + JXL_INLINE size_t xsize() const { return xsize_; } + JXL_INLINE size_t ysize() const { return ysize_; } + + // NOTE: do not use this for copying rows - the valid xsize may be much less. + JXL_INLINE size_t bytes_per_row() const { return bytes_per_row_; } + + // Raw access to byte contents, for interfacing with other libraries. + // Unsigned char instead of char to avoid surprises (sign extension). + JXL_INLINE uint8_t* bytes() { + void* p = bytes_.get(); + return static_cast(JXL_ASSUME_ALIGNED(p, 64)); + } + JXL_INLINE const uint8_t* bytes() const { + const void* p = bytes_.get(); + return static_cast(JXL_ASSUME_ALIGNED(p, 64)); + } + + protected: + // Returns pointer to the start of a row. 
+ JXL_INLINE void* VoidRow(const size_t y) const { +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) + if (y >= ysize_) { + JXL_ABORT("Row(%zu) in (%u x %u) image\n", y, xsize_, ysize_); + } +#endif + + void* row = bytes_.get() + y * bytes_per_row_; + return JXL_ASSUME_ALIGNED(row, 64); + } + + enum class Padding { + // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default. + kRoundUp, + // Allow LoadU(d, row + x) for x = xsize() - 1. This requires an extra + // vector to be initialized. If done by default, this would suppress + // legitimate msan warnings. We therefore require users to explicitly call + // InitializePadding before using unaligned loads (e.g. convolution). + kUnaligned + }; + + // Initializes the minimum bytes required to suppress msan warnings from + // legitimate (according to Padding mode) vector loads/stores on the right + // border, where some lanes are uninitialized and assumed to be unused. + void InitializePadding(size_t sizeof_t, Padding padding); + + // (Members are non-const to enable assignment during move-assignment.) + uint32_t xsize_; // In valid pixels, not including any padding. + uint32_t ysize_; + uint32_t orig_xsize_; + uint32_t orig_ysize_; + size_t bytes_per_row_; // Includes padding. + CacheAlignedUniquePtr bytes_; +}; + +// Single channel, aligned rows separated by padding. T must be POD. +// +// 'Single channel' (one 2D array per channel) simplifies vectorization +// (repeating the same operation on multiple adjacent components) without the +// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients +// can easily iterate over all components in a row and Image requires no +// knowledge of the pixel format beyond the component type "T". +// +// 'Aligned' means each row is aligned to the L1 cache line size. This prevents +// false sharing between two threads operating on adjacent rows. 
+// +// 'Padding' is still relevant because vectors could potentially be larger than +// a cache line. By rounding up row sizes to the vector size, we allow +// reading/writing ALIGNED vectors whose first lane is a valid sample. This +// avoids needing a separate loop to handle remaining unaligned lanes. +// +// This image layout could also be achieved with a vector and a row accessor +// function, but a class wrapper with support for "deleter" allows wrapping +// existing memory allocated by clients without copying the pixels. It also +// provides convenient accessors for xsize/ysize, which shortens function +// argument lists. Supports move-construction so it can be stored in containers. +template +class Plane : public PlaneBase { + public: + using T = ComponentType; + static constexpr size_t kNumPlanes = 1; + + Plane() = default; + Plane(const size_t xsize, const size_t ysize) + : PlaneBase(xsize, ysize, sizeof(T)) {} + + void InitializePaddingForUnalignedAccesses() { + InitializePadding(sizeof(T), Padding::kUnaligned); + } + + JXL_INLINE T* Row(const size_t y) { return static_cast(VoidRow(y)); } + + // Returns pointer to const (see above). + JXL_INLINE const T* Row(const size_t y) const { + return static_cast(VoidRow(y)); + } + + // Documents that the access is const. + JXL_INLINE const T* ConstRow(const size_t y) const { + return static_cast(VoidRow(y)); + } + + // Returns number of pixels (some of which are padding) per row. Useful for + // computing other rows via pointer arithmetic. WARNING: this must + // NOT be used to determine xsize. + JXL_INLINE intptr_t PixelsPerRow() const { + return static_cast(bytes_per_row_ / sizeof(T)); + } +}; + +using ImageSB = Plane; +using ImageB = Plane; +using ImageS = Plane; // signed integer or half-float +using ImageU = Plane; +using ImageI = Plane; +using ImageF = Plane; +using ImageD = Plane; + +// Also works for Image3 and mixed argument types. 
+template +bool SameSize(const Image1& image1, const Image2& image2) { + return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize(); +} + +template +class Image3; + +// Rectangular region in image(s). Factoring this out of Image instead of +// shifting the pointer by x0/y0 allows this to apply to multiple images with +// different resolutions (e.g. color transform and quantization field). +// Can compare using SameSize(rect1, rect2). +class Rect { + public: + // Most windows are xsize_max * ysize_max, except those on the borders where + // begin + size_max > end. + constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max, + size_t ysize_max, size_t xend, size_t yend) + : x0_(xbegin), + y0_(ybegin), + xsize_(ClampedSize(xbegin, xsize_max, xend)), + ysize_(ClampedSize(ybegin, ysize_max, yend)) {} + + // Construct with origin and known size (typically from another Rect). + constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize) + : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {} + + // Construct a rect that covers a whole image/plane/ImageBundle etc. + template + explicit Rect(const Image& image) + : Rect(0, 0, image.xsize(), image.ysize()) {} + + Rect() : Rect(0, 0, 0, 0) {} + + Rect(const Rect&) = default; + Rect& operator=(const Rect&) = default; + + // Construct a subrect that resides in an image/plane/ImageBundle etc. + template + Rect Crop(const Image& image) const { + return Rect(x0_, y0_, xsize_, ysize_, image.xsize(), image.ysize()); + } + + // Returns a rect that only contains `num` lines with offset `y` from `y0()`. 
+ Rect Lines(size_t y, size_t num) const { + JXL_DASSERT(y + num <= ysize_); + return Rect(x0_, y0_ + y, xsize_, num); + } + + Rect Line(size_t y) const { return Lines(y, 1); } + + JXL_MUST_USE_RESULT Rect Intersection(const Rect& other) const { + return Rect(std::max(x0_, other.x0_), std::max(y0_, other.y0_), xsize_, + ysize_, std::min(x0_ + xsize_, other.x0_ + other.xsize_), + std::min(y0_ + ysize_, other.y0_ + other.ysize_)); + } + + JXL_MUST_USE_RESULT Rect Translate(int64_t x_offset, int64_t y_offset) const { + return Rect(x0_ + x_offset, y0_ + y_offset, xsize_, ysize_); + } + + template + T* Row(Plane* image, size_t y) const { + return image->Row(y + y0_) + x0_; + } + + template + const T* Row(const Plane* image, size_t y) const { + return image->Row(y + y0_) + x0_; + } + + template + T* PlaneRow(Image3* image, const size_t c, size_t y) const { + return image->PlaneRow(c, y + y0_) + x0_; + } + + template + const T* ConstRow(const Plane& image, size_t y) const { + return image.ConstRow(y + y0_) + x0_; + } + + template + const T* ConstPlaneRow(const Image3& image, size_t c, size_t y) const { + return image.ConstPlaneRow(c, y + y0_) + x0_; + } + + bool IsInside(const Rect& other) const { + return x0_ >= other.x0() && x0_ + xsize_ <= other.x0() + other.xsize_ && + y0_ >= other.y0() && y0_ + ysize_ <= other.y0() + other.ysize(); + } + + // Returns true if this Rect fully resides in the given image. ImageT could be + // Plane or Image3; however if ImageT is Rect, results are nonsensical. + template + bool IsInside(const ImageT& image) const { + return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize()); + } + + size_t x0() const { return x0_; } + size_t y0() const { return y0_; } + size_t xsize() const { return xsize_; } + size_t ysize() const { return ysize_; } + + private: + // Returns size_max, or whatever is left in [begin, end). 
+ static constexpr size_t ClampedSize(size_t begin, size_t size_max, + size_t end) { + return (begin + size_max <= end) ? size_max + : (end > begin ? end - begin : 0); + } + + size_t x0_; + size_t y0_; + + size_t xsize_; + size_t ysize_; +}; + +// Currently, we abuse Image to either refer to an image that owns its storage +// or one that doesn't. In similar vein, we abuse Image* function parameters to +// either mean "assign to me" or "fill the provided image with data". +// Hopefully, the "assign to me" meaning will go away and most images in the +// codebase will not be backed by own storage. When this happens we can redesign +// Image to be a non-storage-holding view class and introduce BackedImage in +// those places that actually need it. + +// NOTE: we can't use Image as a view because invariants are violated +// (alignment and the presence of padding before/after each "row"). + +// A bundle of 3 same-sized images. Typically constructed by moving from three +// rvalue references to Image. To overwrite an existing Image3 using +// single-channel producers, we also need access to Image*. Constructing +// temporary non-owning Image pointing to one plane of an existing Image3 risks +// dangling references, especially if the wrapper is moved. Therefore, we +// store an array of Image (which are compact enough that size is not a concern) +// and provide Plane+Row accessors. 
+template +class Image3 { + public: + using T = ComponentType; + using PlaneT = jxl::Plane; + static constexpr size_t kNumPlanes = 3; + + Image3() : planes_{PlaneT(), PlaneT(), PlaneT()} {} + + Image3(const size_t xsize, const size_t ysize) + : planes_{PlaneT(xsize, ysize), PlaneT(xsize, ysize), + PlaneT(xsize, ysize)} {} + + Image3(Image3&& other) noexcept { + for (size_t i = 0; i < kNumPlanes; i++) { + planes_[i] = std::move(other.planes_[i]); + } + } + + Image3(PlaneT&& plane0, PlaneT&& plane1, PlaneT&& plane2) { + JXL_CHECK(SameSize(plane0, plane1)); + JXL_CHECK(SameSize(plane0, plane2)); + planes_[0] = std::move(plane0); + planes_[1] = std::move(plane1); + planes_[2] = std::move(plane2); + } + + // Copy construction/assignment is forbidden to avoid inadvertent copies, + // which can be very expensive. Use CopyImageTo instead. + Image3(const Image3& other) = delete; + Image3& operator=(const Image3& other) = delete; + + Image3& operator=(Image3&& other) noexcept { + for (size_t i = 0; i < kNumPlanes; i++) { + planes_[i] = std::move(other.planes_[i]); + } + return *this; + } + + // Returns row pointer; usage: PlaneRow(idx_plane, y)[x] = val. + JXL_INLINE T* PlaneRow(const size_t c, const size_t y) { + // Custom implementation instead of calling planes_[c].Row ensures only a + // single multiplication is needed for PlaneRow(0..2, y). + PlaneRowBoundsCheck(c, y); + const size_t row_offset = y * planes_[0].bytes_per_row(); + void* row = planes_[c].bytes() + row_offset; + return static_cast(JXL_ASSUME_ALIGNED(row, 64)); + } + + // Returns const row pointer; usage: val = PlaneRow(idx_plane, y)[x]. + JXL_INLINE const T* PlaneRow(const size_t c, const size_t y) const { + PlaneRowBoundsCheck(c, y); + const size_t row_offset = y * planes_[0].bytes_per_row(); + const void* row = planes_[c].bytes() + row_offset; + return static_cast(JXL_ASSUME_ALIGNED(row, 64)); + } + + // Returns const row pointer, even if called from a non-const Image3. 
+ JXL_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const { + PlaneRowBoundsCheck(c, y); + return PlaneRow(c, y); + } + + JXL_INLINE const PlaneT& Plane(size_t idx) const { return planes_[idx]; } + + JXL_INLINE PlaneT& Plane(size_t idx) { return planes_[idx]; } + + void Swap(Image3& other) { + for (size_t c = 0; c < 3; ++c) { + other.planes_[c].Swap(planes_[c]); + } + } + + // Useful for pre-allocating image with some padding for alignment purposes + // and later reporting the actual valid dimensions. May also be used to + // un-shrink the image. Caller is responsible for ensuring xsize/ysize are <= + // the original dimensions. + void ShrinkTo(const size_t xsize, const size_t ysize) { + for (PlaneT& plane : planes_) { + plane.ShrinkTo(xsize, ysize); + } + } + + // Sizes of all three images are guaranteed to be equal. + JXL_INLINE size_t xsize() const { return planes_[0].xsize(); } + JXL_INLINE size_t ysize() const { return planes_[0].ysize(); } + // Returns offset [bytes] from one row to the next row of the same plane. + // WARNING: this must NOT be used to determine xsize, nor for copying rows - + // the valid xsize may be much less. + JXL_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); } + // Returns number of pixels (some of which are padding) per row. Useful for + // computing other rows via pointer arithmetic. WARNING: this must NOT be used + // to determine xsize. 
+ JXL_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); } + + private: + void PlaneRowBoundsCheck(const size_t c, const size_t y) const { +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) + if (c >= kNumPlanes || y >= ysize()) { + JXL_ABORT("PlaneRow(%zu, %zu) in (%zu x %zu) image\n", c, y, xsize(), + ysize()); + } +#endif + } + + private: + PlaneT planes_[kNumPlanes]; +}; + +using Image3B = Image3; +using Image3S = Image3; +using Image3U = Image3; +using Image3I = Image3; +using Image3F = Image3; +using Image3D = Image3; + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc new file mode 100644 index 0000000000..0221903219 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.cc @@ -0,0 +1,149 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/image_bundle.h" + +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/luminance.h" + +namespace jxl { + +void ImageBundle::ShrinkTo(size_t xsize, size_t ysize) { + if (HasColor()) color_.ShrinkTo(xsize, ysize); + for (ImageF& ec : extra_channels_) { + ec.ShrinkTo(xsize, ysize); + } +} + +// Called by all other SetFrom*. 
+void ImageBundle::SetFromImage(Image3F&& color, + const ColorEncoding& c_current) { + JXL_CHECK(color.xsize() != 0 && color.ysize() != 0); + JXL_CHECK(metadata_->color_encoding.IsGray() == c_current.IsGray()); + color_ = std::move(color); + c_current_ = c_current; + VerifySizes(); +} + +void ImageBundle::VerifyMetadata() const { + JXL_CHECK(!c_current_.ICC().empty()); + JXL_CHECK(metadata_->color_encoding.IsGray() == IsGray()); + + if (metadata_->HasAlpha() && alpha().xsize() == 0) { + JXL_ABORT("MD alpha_bits %u IB alpha %zu x %zu\n", + metadata_->GetAlphaBits(), alpha().xsize(), alpha().ysize()); + } + const uint32_t alpha_bits = metadata_->GetAlphaBits(); + JXL_CHECK(alpha_bits <= 32); + + // metadata_->num_extra_channels may temporarily differ from + // extra_channels_.size(), e.g. after SetAlpha. They are synced by the next + // call to VisitFields. +} + +void ImageBundle::VerifySizes() const { + const size_t xs = xsize(); + const size_t ys = ysize(); + + if (HasExtraChannels()) { + JXL_CHECK(xs != 0 && ys != 0); + for (const ImageF& ec : extra_channels_) { + JXL_CHECK(ec.xsize() == xs); + JXL_CHECK(ec.ysize() == ys); + } + } +} + +size_t ImageBundle::DetectRealBitdepth() const { + return metadata_->bit_depth.bits_per_sample; + + // TODO(lode): let this function return lower bit depth if possible, e.g. + // return 8 bits in case the original image came from a 16-bit PNG that + // was in fact representable as 8-bit PNG. Ensure that the implementation + // returns 16 if e.g. two consecutive 16-bit values appeared in the original + // image (such as 32768 and 32769), take into account that e.g. the values + // 3-bit can represent is not a superset of the values 2-bit can represent, + // and there may be slight imprecisions in the floating point image. 
+} + +const ImageF& ImageBundle::alpha() const { + JXL_ASSERT(HasAlpha()); + const size_t ec = metadata_->Find(ExtraChannel::kAlpha) - + metadata_->extra_channel_info.data(); + JXL_ASSERT(ec < extra_channels_.size()); + return extra_channels_[ec]; +} +ImageF* ImageBundle::alpha() { + JXL_ASSERT(HasAlpha()); + const size_t ec = metadata_->Find(ExtraChannel::kAlpha) - + metadata_->extra_channel_info.data(); + JXL_ASSERT(ec < extra_channels_.size()); + return &extra_channels_[ec]; +} + +const ImageF& ImageBundle::depth() const { + JXL_ASSERT(HasDepth()); + const size_t ec = metadata_->Find(ExtraChannel::kDepth) - + metadata_->extra_channel_info.data(); + JXL_ASSERT(ec < extra_channels_.size()); + return extra_channels_[ec]; +} + +void ImageBundle::SetAlpha(ImageF&& alpha, bool alpha_is_premultiplied) { + const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha); + // Must call SetAlphaBits first, otherwise we don't know which channel index + JXL_CHECK(eci != nullptr); + JXL_CHECK(alpha.xsize() != 0 && alpha.ysize() != 0); + JXL_CHECK(eci->alpha_associated == alpha_is_premultiplied); + extra_channels_.insert( + extra_channels_.begin() + (eci - metadata_->extra_channel_info.data()), + std::move(alpha)); + // num_extra_channels is automatically set in visitor + VerifySizes(); +} +void ImageBundle::PremultiplyAlpha() { + if (!HasAlpha()) return; + if (!HasColor()) return; + const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha); + if (eci->alpha_associated) return; // already premultiplied + JXL_CHECK(color_.ysize() == alpha()->ysize()); + JXL_CHECK(color_.xsize() == alpha()->xsize()); + for (size_t y = 0; y < color_.ysize(); y++) { + ::jxl::PremultiplyAlpha(color_.PlaneRow(0, y), color_.PlaneRow(1, y), + color_.PlaneRow(2, y), alpha()->Row(y), + color_.xsize()); + } +} +void ImageBundle::UnpremultiplyAlpha() { + if (!HasAlpha()) return; + if (!HasColor()) return; + const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha); + if 
(!eci->alpha_associated) return; // already unpremultiplied + JXL_CHECK(color_.ysize() == alpha()->ysize()); + JXL_CHECK(color_.xsize() == alpha()->xsize()); + for (size_t y = 0; y < color_.ysize(); y++) { + ::jxl::UnpremultiplyAlpha(color_.PlaneRow(0, y), color_.PlaneRow(1, y), + color_.PlaneRow(2, y), alpha()->Row(y), + color_.xsize()); + } +} + +void ImageBundle::SetExtraChannels(std::vector&& extra_channels) { + for (const ImageF& plane : extra_channels) { + JXL_CHECK(plane.xsize() != 0 && plane.ysize() != 0); + } + extra_channels_ = std::move(extra_channels); + VerifySizes(); +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.h new file mode 100644 index 0000000000..83f5f7bd31 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle.h @@ -0,0 +1,263 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_IMAGE_BUNDLE_H_ +#define LIB_JXL_IMAGE_BUNDLE_H_ + +// The main image or frame consists of a bundle of associated images. + +#include +#include + +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_metadata.h" +#include "lib/jxl/jpeg/jpeg_data.h" +#include "lib/jxl/opsin_params.h" +#include "lib/jxl/quantizer.h" + +namespace jxl { + +// A bundle of color/alpha/depth/plane images. 
+class ImageBundle { + public: + // Uninitialized state for use as output parameter. + ImageBundle() : metadata_(nullptr) {} + // Caller is responsible for setting metadata before calling Set*. + explicit ImageBundle(const ImageMetadata* metadata) : metadata_(metadata) {} + + // Move-only (allows storing in std::vector). + ImageBundle(ImageBundle&&) = default; + ImageBundle& operator=(ImageBundle&&) = default; + + ImageBundle Copy() const { + ImageBundle copy(metadata_); + copy.color_ = CopyImage(color_); + copy.c_current_ = c_current_; + copy.extra_channels_.reserve(extra_channels_.size()); + for (const ImageF& plane : extra_channels_) { + copy.extra_channels_.emplace_back(CopyImage(plane)); + } + + copy.jpeg_data = + jpeg_data ? make_unique(*jpeg_data) : nullptr; + copy.color_transform = color_transform; + copy.chroma_subsampling = chroma_subsampling; + + return copy; + } + + // -- SIZE + + size_t xsize() const { + if (IsJPEG()) return jpeg_data->width; + if (color_.xsize() != 0) return color_.xsize(); + return extra_channels_.empty() ? 0 : extra_channels_[0].xsize(); + } + size_t ysize() const { + if (IsJPEG()) return jpeg_data->height; + if (color_.ysize() != 0) return color_.ysize(); + return extra_channels_.empty() ? 0 : extra_channels_[0].ysize(); + } + void ShrinkTo(size_t xsize, size_t ysize); + + // sizes taking orientation into account + size_t oriented_xsize() const { + if (static_cast(metadata_->GetOrientation()) > 4) { + return ysize(); + } else { + return xsize(); + } + } + size_t oriented_ysize() const { + if (static_cast(metadata_->GetOrientation()) > 4) { + return xsize(); + } else { + return ysize(); + } + } + + // -- COLOR + + // Whether color() is valid/usable. Returns true in most cases. Even images + // with spot colors (one example of when !planes().empty()) typically have a + // part that can be converted to RGB. 
+ bool HasColor() const { return color_.xsize() != 0; } + + // For resetting the size when switching from a reference to main frame. + void RemoveColor() { color_ = Image3F(); } + + // Do not use if !HasColor(). + const Image3F& color() const { + // If this fails, Set* was not called - perhaps because decoding failed? + JXL_DASSERT(HasColor()); + return color_; + } + + // Do not use if !HasColor(). + Image3F* color() { + JXL_DASSERT(HasColor()); + return &color_; + } + + // If c_current.IsGray(), all planes must be identical. NOTE: c_current is + // independent of metadata()->color_encoding, which is the original, whereas + // a decoder might return pixels in a different c_current. + // This only sets the color channels, you must also make extra channels + // match the amount that is in the metadata. + void SetFromImage(Image3F&& color, const ColorEncoding& c_current); + + // -- COLOR ENCODING + + const ColorEncoding& c_current() const { return c_current_; } + + // Returns whether the color image has identical planes. Once established by + // Set*, remains unchanged until a subsequent Set* or TransformTo. + bool IsGray() const { return c_current_.IsGray(); } + + bool IsSRGB() const { return c_current_.IsSRGB(); } + bool IsLinearSRGB() const { + return c_current_.white_point == WhitePoint::kD65 && + c_current_.primaries == Primaries::kSRGB && c_current_.tf.IsLinear(); + } + + // Set the c_current profile without doing any transformation, e.g. if the + // transformation was already applied. + void OverrideProfile(const ColorEncoding& new_c_current) { + c_current_ = new_c_current; + } + + // TODO(lode): TransformTo and CopyTo are implemented in enc_image_bundle.cc, + // move these functions out of this header file and class, to + // enc_image_bundle.h. + + // Transforms color to c_desired and sets c_current to c_desired. Alpha and + // metadata remains unchanged. 
+ Status TransformTo(const ColorEncoding& c_desired, + ThreadPool* pool = nullptr); + // Copies this:rect, converts to c_desired, and allocates+fills out. + Status CopyTo(const Rect& rect, const ColorEncoding& c_desired, Image3B* out, + ThreadPool* pool = nullptr) const; + Status CopyTo(const Rect& rect, const ColorEncoding& c_desired, Image3F* out, + ThreadPool* pool = nullptr) const; + Status CopyToSRGB(const Rect& rect, Image3B* out, + ThreadPool* pool = nullptr) const; + + // Detect 'real' bit depth, which can be lower than nominal bit depth + // (this is common in PNG), returns 'real' bit depth + size_t DetectRealBitdepth() const; + + // -- ALPHA + + void SetAlpha(ImageF&& alpha, bool alpha_is_premultiplied); + bool HasAlpha() const { + return metadata_->Find(ExtraChannel::kAlpha) != nullptr; + } + bool AlphaIsPremultiplied() const { + const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha); + return (eci == nullptr) ? false : eci->alpha_associated; + } + // Premultiply alpha (if it isn't already premultiplied) + void PremultiplyAlpha(); + // Unpremultiply alpha (if it isn't already non-premultiplied) + void UnpremultiplyAlpha(); + const ImageF& alpha() const; + ImageF* alpha(); + + // -- DEPTH + bool HasDepth() const { + return metadata_->Find(ExtraChannel::kDepth) != nullptr; + } + const ImageF& depth() const; + + // -- EXTRA CHANNELS + + // Extra channels of unknown interpretation (e.g. spot colors). 
+ void SetExtraChannels(std::vector&& extra_channels); + void ClearExtraChannels() { extra_channels_.clear(); } + bool HasExtraChannels() const { return !extra_channels_.empty(); } + const std::vector& extra_channels() const { return extra_channels_; } + std::vector& extra_channels() { return extra_channels_; } + + const ImageMetadata* metadata() const { return metadata_; } + + void VerifyMetadata() const; + + void SetDecodedBytes(size_t decoded_bytes) { decoded_bytes_ = decoded_bytes; } + size_t decoded_bytes() const { return decoded_bytes_; } + + // -- JPEG transcoding: + + // Returns true if image does or will represent quantized DCT-8 coefficients, + // stored in 8x8 pixel regions. + bool IsJPEG() const { +#if JPEGXL_ENABLE_TRANSCODE_JPEG + return jpeg_data != nullptr; +#else // JPEGXL_ENABLE_TRANSCODE_JPEG + return false; +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + } + + std::unique_ptr jpeg_data; + // these fields are used to signal the input JPEG color space + // NOTE: JPEG doesn't actually provide a way to determine whether YCbCr was + // applied or not. + ColorTransform color_transform = ColorTransform::kNone; + YCbCrChromaSubsampling chroma_subsampling; + + FrameOrigin origin{0, 0}; + // Animation-related information. This assumes GIF- and APNG- like animation. + uint32_t duration = 0; + bool use_for_next_frame = false; + bool blend = false; + BlendMode blendmode = BlendMode::kBlend; + std::string name; + + private: + // Called after any Set* to ensure their sizes are compatible. + void VerifySizes() const; + + // Required for TransformTo so that an ImageBundle is self-sufficient. Always + // points to the same thing, but cannot be const-pointer because that prevents + // the compiler from generating a move ctor. + const ImageMetadata* metadata_; + + // Initialized by Set*: + Image3F color_; // If empty, planes_ is not; all planes equal if IsGray(). 
+ ColorEncoding c_current_; // of color_ + + // Initialized by SetPlanes; size = ImageMetadata.num_extra_channels + std::vector extra_channels_; + + // How many bytes of the input were actually read. + size_t decoded_bytes_ = 0; +}; + +// Does color transformation from in.c_current() to c_desired if the color +// encodings are different, or nothing if they are already the same. +// If color transformation is done, stores the transformed values into store and +// sets the out pointer to store, else leaves store untouched and sets the out +// pointer to &in. +// Returns false if color transform fails. +Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired, + ThreadPool* pool, ImageBundle* store, + const ImageBundle** out); + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_BUNDLE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle_test.cc new file mode 100644 index 0000000000..6de2e49dbf --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_bundle_test.cc @@ -0,0 +1,36 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/image_bundle.h" + +#include "gtest/gtest.h" +#include "lib/jxl/aux_out.h" + +namespace jxl { +namespace { + +TEST(ImageBundleTest, ExtraChannelName) { + AuxOut aux_out; + BitWriter writer; + BitWriter::Allotment allotment(&writer, 99); + + ImageMetadata metadata; + ExtraChannelInfo eci; + eci.type = ExtraChannel::kBlack; + eci.name = "testK"; + metadata.extra_channel_info.push_back(std::move(eci)); + ASSERT_TRUE(WriteImageMetadata(metadata, &writer, /*layer=*/0, &aux_out)); + writer.ZeroPadToByte(); + ReclaimAndCharge(&writer, &allotment, /*layer=*/0, &aux_out); + + BitReader reader(writer.GetSpan()); + ImageMetadata metadata_out; + ASSERT_TRUE(ReadImageMetadata(&reader, &metadata_out)); + EXPECT_TRUE(reader.Close()); + EXPECT_EQ("testK", metadata_out.Find(ExtraChannel::kBlack)->name); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc new file mode 100644 index 0000000000..2d9d62e268 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.cc @@ -0,0 +1,414 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/image_metadata.h" + +#include +#include + +#include "lib/jxl/alpha.h" +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/fields.h" + +namespace jxl { +BitDepth::BitDepth() { Bundle::Init(this); } +Status BitDepth::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &floating_point_sample)); + // The same fields (bits_per_sample and exponent_bits_per_sample) are read + // in a different way depending on floating_point_sample's value. 
It's still + // default-initialized correctly so using visitor->Conditional is not + // required. + if (!floating_point_sample) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(8), Val(10), Val(12), BitsOffset(6, 1), 8, &bits_per_sample)); + exponent_bits_per_sample = 0; + } else { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val(32), Val(16), Val(24), BitsOffset(6, 1), 32, &bits_per_sample)); + // The encoded value is exponent_bits_per_sample - 1, encoded in 4 bits; + // the error-checking below only accepts decoded values in range [2, 8]. + const uint32_t offset = 1; + exponent_bits_per_sample -= offset; + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bits(4, 8 - offset, &exponent_bits_per_sample)); + exponent_bits_per_sample += offset; + } + + // Error-checking for floating point ranges. + if (floating_point_sample) { + if (exponent_bits_per_sample < 2 || exponent_bits_per_sample > 8) { + return JXL_FAILURE("Invalid exponent_bits_per_sample: %u", + exponent_bits_per_sample); + } + int mantissa_bits = + static_cast(bits_per_sample) - exponent_bits_per_sample - 1; + if (mantissa_bits < 2 || mantissa_bits > 23) { + return JXL_FAILURE("Invalid bits_per_sample: %u", bits_per_sample); + } + } else { + if (bits_per_sample > 31) { + return JXL_FAILURE("Invalid bits_per_sample: %u", bits_per_sample); + } + } + return true; +} + +CustomTransformData::CustomTransformData() { Bundle::Init(this); } +Status CustomTransformData::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*.
+ visitor->SetDefault(this); + return true; + } + if (visitor->Conditional(nonserialized_xyb_encoded)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&opsin_inverse_matrix)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &custom_weights_mask)); + if (visitor->Conditional((custom_weights_mask & 0x1) != 0)) { + // 4 5x5 kernels, but all of them can be obtained by symmetry from one, + // which is symmetric along its main diagonal. The top-left kernel is + // defined by + // + // 0 1 2 3 4 + // 1 5 6 7 8 + // 2 6 9 10 11 + // 3 7 10 12 13 + // 4 8 11 13 14 + float constexpr kWeights2[15] = { + -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f, + 0.14111091f, 0.28896755f, 0.00278718f, -0.01610267f, 0.56661550f, + 0.03777607f, -0.01986694f, -0.03144731f, -0.01185068f, -0.00213539f}; + for (size_t i = 0; i < 15; i++) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kWeights2[i], &upsampling2_weights[i])); + } + } + if (visitor->Conditional((custom_weights_mask & 0x2) != 0)) { + // 16 5x5 kernels, but all of them can be obtained by symmetry from + // three, two of which are symmetric along their main diagonals. 
The top + // left 4 kernels are defined by + // + // 0 1 2 3 4 5 6 7 8 9 + // 1 10 11 12 13 14 15 16 17 18 + // 2 11 19 20 21 22 23 24 25 26 + // 3 12 20 27 28 29 30 31 32 33 + // 4 13 21 28 34 35 36 37 38 39 + // + // 5 14 22 29 35 40 41 42 43 44 + // 6 15 23 30 36 41 45 46 47 48 + // 7 16 24 31 37 42 46 49 50 51 + // 8 17 25 32 38 43 47 50 52 53 + // 9 18 26 33 39 44 48 51 53 54 + constexpr float kWeights4[55] = { + -0.02419067f, -0.03491987f, -0.03693351f, -0.03094285f, -0.00529785f, + -0.01663432f, -0.03556863f, -0.03888905f, -0.03516850f, -0.00989469f, + 0.23651958f, 0.33392945f, -0.01073543f, -0.01313181f, -0.03556694f, + 0.13048175f, 0.40103025f, 0.03951150f, -0.02077584f, 0.46914198f, + -0.00209270f, -0.01484589f, -0.04064806f, 0.18942530f, 0.56279892f, + 0.06674400f, -0.02335494f, -0.03551682f, -0.00754830f, -0.02267919f, + -0.02363578f, 0.00315804f, -0.03399098f, -0.01359519f, -0.00091653f, + -0.00335467f, -0.01163294f, -0.01610294f, -0.00974088f, -0.00191622f, + -0.01095446f, -0.03198464f, -0.04455121f, -0.02799790f, -0.00645912f, + 0.06390599f, 0.22963888f, 0.00630981f, -0.01897349f, 0.67537268f, + 0.08483369f, -0.02534994f, -0.02205197f, -0.01667999f, -0.00384443f}; + for (size_t i = 0; i < 55; i++) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kWeights4[i], &upsampling4_weights[i])); + } + } + if (visitor->Conditional((custom_weights_mask & 0x4) != 0)) { + // 64 5x5 kernels, all of them can be obtained by symmetry from + // 10, 4 of which are symmetric along their main diagonals. 
The top + // left 16 kernels are defined by + // 0 1 2 3 4 5 6 7 8 9 a b c d e f 10 11 12 13 + // 1 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 + // 2 15 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 38 + // 3 16 28 39 3a 3b 3c 3d 3e 3f 40 41 42 43 44 45 46 47 48 49 + // 4 17 29 3a 4a 4b 4c 4d 4e 4f 50 51 52 53 54 55 56 57 58 59 + + // 5 18 2a 3b 4b 5a 5b 5c 5d 5e 5f 60 61 62 63 64 65 66 67 68 + // 6 19 2b 3c 4c 5b 69 6a 6b 6c 6d 6e 6f 70 71 72 73 74 75 76 + // 7 1a 2c 3d 4d 5c 6a 77 78 79 7a 7b 7c 7d 7e 7f 80 81 82 83 + // 8 1b 2d 3e 4e 5d 6b 78 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f + // 9 1c 2e 3f 4f 5e 6c 79 85 90 91 92 93 94 95 96 97 98 99 9a + + // a 1d 2f 40 50 5f 6d 7a 86 91 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 + // b 1e 30 41 51 60 6e 7b 87 92 9c a5 a6 a7 a8 a9 aa ab ac ad + // c 1f 31 42 52 61 6f 7c 88 93 9d a6 ae af b0 b1 b2 b3 b4 b5 + // d 20 32 43 53 62 70 7d 89 94 9e a7 af b6 b7 b8 b9 ba bb bc + // e 21 33 44 54 63 71 7e 8a 95 9f a8 b0 b7 bd be bf c0 c1 c2 + + // f 22 34 45 55 64 72 7f 8b 96 a0 a9 b1 b8 be c3 c4 c5 c6 c7 + // 10 23 35 46 56 65 73 80 8c 97 a1 aa b2 b9 bf c4 c8 c9 ca cb + // 11 24 36 47 57 66 74 81 8d 98 a2 ab b3 ba c0 c5 c9 cc cd ce + // 12 25 37 48 58 67 75 82 8e 99 a3 ac b4 bb c1 c6 ca cd cf d0 + // 13 26 38 49 59 68 76 83 8f 9a a4 ad b5 bc c2 c7 cb ce d0 d1 + constexpr float kWeights8[210] = { + -0.02928613f, -0.03706353f, -0.03783812f, -0.03324558f, -0.00447632f, + -0.02519406f, -0.03752601f, -0.03901508f, -0.03663285f, -0.00646649f, + -0.02066407f, -0.03838633f, -0.04002101f, -0.03900035f, -0.00901973f, + -0.01626393f, -0.03954148f, -0.04046620f, -0.03979621f, -0.01224485f, + 0.29895328f, 0.35757708f, -0.02447552f, -0.01081748f, -0.04314594f, + 0.23903219f, 0.41119301f, -0.00573046f, -0.01450239f, -0.04246845f, + 0.17567618f, 0.45220643f, 0.02287757f, -0.01936783f, -0.03583255f, + 0.11572472f, 0.47416733f, 0.06284440f, -0.02685066f, 0.42720050f, + -0.02248939f, -0.01155273f, -0.04562755f, 0.28689496f, 0.49093869f, + 
-0.00007891f, -0.01545926f, -0.04562659f, 0.21238920f, 0.53980934f, + 0.03369474f, -0.02070211f, -0.03866988f, 0.14229550f, 0.56593398f, + 0.08045181f, -0.02888298f, -0.03680918f, -0.00542229f, -0.02920477f, + -0.02788574f, -0.02118180f, -0.03942402f, -0.00775547f, -0.02433614f, + -0.03193943f, -0.02030828f, -0.04044014f, -0.01074016f, -0.01930822f, + -0.03620399f, -0.01974125f, -0.03919545f, -0.01456093f, -0.00045072f, + -0.00360110f, -0.01020207f, -0.01231907f, -0.00638988f, -0.00071592f, + -0.00279122f, -0.00957115f, -0.01288327f, -0.00730937f, -0.00107783f, + -0.00210156f, -0.00890705f, -0.01317668f, -0.00813895f, -0.00153491f, + -0.02128481f, -0.04173044f, -0.04831487f, -0.03293190f, -0.00525260f, + -0.01720322f, -0.04052736f, -0.05045706f, -0.03607317f, -0.00738030f, + -0.01341764f, -0.03965629f, -0.05151616f, -0.03814886f, -0.01005819f, + 0.18968273f, 0.33063684f, -0.01300105f, -0.01372950f, -0.04017465f, + 0.13727832f, 0.36402234f, 0.01027890f, -0.01832107f, -0.03365072f, + 0.08734506f, 0.38194295f, 0.04338228f, -0.02525993f, 0.56408126f, + 0.00458352f, -0.01648227f, -0.04887868f, 0.24585519f, 0.62026135f, + 0.04314807f, -0.02213737f, -0.04158014f, 0.16637289f, 0.65027023f, + 0.09621636f, -0.03101388f, -0.04082742f, -0.00904519f, -0.02790922f, + -0.02117818f, 0.00798662f, -0.03995711f, -0.01243427f, -0.02231705f, + -0.02946266f, 0.00992055f, -0.03600283f, -0.01684920f, -0.00111684f, + -0.00411204f, -0.01297130f, -0.01723725f, -0.01022545f, -0.00165306f, + -0.00313110f, -0.01218016f, -0.01763266f, -0.01125620f, -0.00231663f, + -0.01374149f, -0.03797620f, -0.05142937f, -0.03117307f, -0.00581914f, + -0.01064003f, -0.03608089f, -0.05272168f, -0.03375670f, -0.00795586f, + 0.09628104f, 0.27129991f, -0.00353779f, -0.01734151f, -0.03153981f, + 0.05686230f, 0.28500998f, 0.02230594f, -0.02374955f, 0.68214326f, + 0.05018048f, -0.02320852f, -0.04383616f, 0.18459474f, 0.71517975f, + 0.10805613f, -0.03263677f, -0.03637639f, -0.01394373f, -0.02511203f, + -0.01728636f, 
0.05407331f, -0.02867568f, -0.01893131f, -0.00240854f, + -0.00446511f, -0.01636187f, -0.02377053f, -0.01522848f, -0.00333334f, + -0.00819975f, -0.02964169f, -0.04499287f, -0.02745350f, -0.00612408f, + 0.02727416f, 0.19446600f, 0.00159832f, -0.02232473f, 0.74982506f, + 0.11452620f, -0.03348048f, -0.01605681f, -0.02070339f, -0.00458223f}; + for (size_t i = 0; i < 210; i++) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kWeights8[i], &upsampling8_weights[i])); + } + } + return true; +} + +ExtraChannelInfo::ExtraChannelInfo() { Bundle::Init(this); } +Status ExtraChannelInfo::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + // General + JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ExtraChannel::kAlpha, &type)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&bit_depth)); + + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(3), Val(4), BitsOffset(3, 1), 0, &dim_shift)); + if ((1U << dim_shift) > 8) { + return JXL_FAILURE("dim_shift %u too large", dim_shift); + } + + JXL_QUIET_RETURN_IF_ERROR(VisitNameString(visitor, &name)); + + // Conditional + if (visitor->Conditional(type == ExtraChannel::kAlpha)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &alpha_associated)); + } + if (visitor->Conditional(type == ExtraChannel::kSpotColor)) { + for (float& c : spot_color) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0, &c)); + } + } + if (visitor->Conditional(type == ExtraChannel::kCFA)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(1), Bits(2), BitsOffset(4, 3), + BitsOffset(8, 19), 1, &cfa_channel)); + } + return true; +} + +ImageMetadata::ImageMetadata() { Bundle::Init(this); } +Status ImageMetadata::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. 
+ visitor->SetDefault(this); + return true; + } + + // Bundle::AllDefault does not allow usage when reading (it may abort the + // program when a codestream has invalid values), but when reading we + // overwrite the extra_fields value, so do not need to call AllDefault. + bool tone_mapping_default = + visitor->IsReading() ? false : Bundle::AllDefault(tone_mapping); + + bool extra_fields = (orientation != 1 || have_preview || have_animation || + have_intrinsic_size || !tone_mapping_default); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &extra_fields)); + if (visitor->Conditional(extra_fields)) { + orientation--; + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &orientation)); + orientation++; + // (No need for bounds checking because we read exactly 3 bits) + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_intrinsic_size)); + if (visitor->Conditional(have_intrinsic_size)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&intrinsic_size)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_preview)); + if (visitor->Conditional(have_preview)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&preview_size)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_animation)); + if (visitor->Conditional(have_animation)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation)); + } + } else { + orientation = 1; // identity + have_intrinsic_size = false; + have_preview = false; + have_animation = false; + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&bit_depth)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bool(true, &modular_16_bit_buffer_sufficient)); + + num_extra_channels = extra_channel_info.size(); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(4, 2), + BitsOffset(12, 1), 0, + &num_extra_channels)); + + if (visitor->Conditional(num_extra_channels != 0)) { + if (visitor->IsReading()) { + extra_channel_info.resize(num_extra_channels); + } + for (ExtraChannelInfo& eci : extra_channel_info) { + 
JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&eci)); + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &xyb_encoded)); + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&color_encoding)); + if (visitor->Conditional(extra_fields)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tone_mapping)); + } + + // Treat as if only the fields up to extra channels exist. + if (visitor->IsReading() && nonserialized_only_parse_basic_info) { + return true; + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + // Extensions: in chronological order of being added to the format. + return visitor->EndExtensions(); +} + +OpsinInverseMatrix::OpsinInverseMatrix() { Bundle::Init(this); } +Status OpsinInverseMatrix::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + for (int i = 0; i < 9; ++i) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16( + DefaultInverseOpsinAbsorbanceMatrix()[i], &inverse_matrix[i])); + } + for (int i = 0; i < 3; ++i) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kNegOpsinAbsorbanceBiasRGB[i], &opsin_biases[i])); + } + for (int i = 0; i < 4; ++i) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kDefaultQuantBias[i], &quant_biases[i])); + } + return true; +} + +ToneMapping::ToneMapping() { Bundle::Init(this); } +Status ToneMapping::VisitFields(Visitor* JXL_RESTRICT visitor) { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. 
+ visitor->SetDefault(this); + return true; + } + + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(kDefaultIntensityTarget, &intensity_target)); + if (intensity_target <= 0.f) { + return JXL_FAILURE("invalid intensity target"); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.0f, &min_nits)); + if (min_nits < 0.f || min_nits > intensity_target) { + return JXL_FAILURE("invalid min %f vs max %f", min_nits, intensity_target); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &relative_to_max_display)); + + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.0f, &linear_below)); + if (linear_below < 0 || (relative_to_max_display && linear_below > 1.0f)) { + return JXL_FAILURE("invalid linear_below %f (%s)", linear_below, + relative_to_max_display ? "relative" : "absolute"); + } + + return true; +} + +Status ReadImageMetadata(BitReader* JXL_RESTRICT reader, + ImageMetadata* JXL_RESTRICT metadata) { + return Bundle::Read(reader, metadata); +} + +Status WriteImageMetadata(const ImageMetadata& metadata, + BitWriter* JXL_RESTRICT writer, size_t layer, + AuxOut* aux_out) { + return Bundle::Write(metadata, writer, layer, aux_out); +} + +void ImageMetadata::SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied) { + std::vector& eciv = extra_channel_info; + ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha); + if (bits == 0) { + if (alpha != nullptr) { + // Remove the alpha channel from the extra channel info. It's + // theoretically possible that there are multiple, remove all in that + // case. This ensures that the next HasAlpha() will return false.
+ const auto is_alpha = [](const ExtraChannelInfo& eci) { + return eci.type == ExtraChannel::kAlpha; + }; + eciv.erase(std::remove_if(eciv.begin(), eciv.end(), is_alpha), + eciv.end()); + } + } else { + if (alpha == nullptr) { + ExtraChannelInfo info; + info.type = ExtraChannel::kAlpha; + info.bit_depth.bits_per_sample = bits; + info.dim_shift = 0; + info.alpha_associated = alpha_is_premultiplied; + // Prepend rather than append: in case there already are other extra + // channels, prefer alpha channel to be listed first. + eciv.insert(eciv.begin(), info); + } else { + // Ignores potential extra alpha channels, only sets to first one. + alpha->bit_depth.bits_per_sample = bits; + alpha->bit_depth.floating_point_sample = false; + alpha->bit_depth.exponent_bits_per_sample = 0; + alpha->alpha_associated = alpha_is_premultiplied; + } + } + num_extra_channels = extra_channel_info.size(); + if (bits > 12) modular_16_bit_buffer_sufficient = false; +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.h new file mode 100644 index 0000000000..e5f7969215 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_metadata.h @@ -0,0 +1,410 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Main codestream header bundles, the metadata that applies to all frames. + +#ifndef LIB_JXL_IMAGE_METADATA_H_ +#define LIB_JXL_IMAGE_METADATA_H_ + +#include +#include + +#include + +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/jpeg/jpeg_data.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { + +// EXIF orientation of the image. This field overrides any field present in +// actual EXIF metadata. 
The value tells which transformation the decoder must +// apply after decoding to display the image with the correct orientation. +enum class Orientation : uint32_t { + // Values 1..8 match the EXIF definitions. + kIdentity = 1, + kFlipHorizontal, + kRotate180, + kFlipVertical, + kTranspose, + kRotate90, + kAntiTranspose, + kRotate270, +}; +// Don't need an EnumBits because Orientation is not read via Enum(). + +enum class ExtraChannel : uint32_t { + // First two enumerators (most common) are cheaper to encode + kAlpha, + kDepth, + + kSpotColor, + kSelectionMask, + kBlack, // for CMYK + kCFA, // Bayer channel + kThermal, + kReserved0, + kReserved1, + kReserved2, + kReserved3, + kReserved4, + kReserved5, + kReserved6, + kReserved7, + kUnknown, // disambiguated via name string, raise warning if unsupported + kOptional // like kUnknown but can silently be ignored +}; +static inline const char* EnumName(ExtraChannel /*unused*/) { + return "ExtraChannel"; +} +static inline constexpr uint64_t EnumBits(ExtraChannel /*unused*/) { + using EC = ExtraChannel; + return MakeBit(EC::kAlpha) | MakeBit(EC::kDepth) | MakeBit(EC::kSpotColor) | + MakeBit(EC::kSelectionMask) | MakeBit(EC::kBlack) | MakeBit(EC::kCFA) | + MakeBit(EC::kUnknown) | MakeBit(EC::kOptional); +} + +// Used in ImageMetadata and ExtraChannelInfo. +struct BitDepth : public Fields { + BitDepth(); + const char* Name() const override { return "BitDepth"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Whether the original (uncompressed) samples are floating point or + // unsigned integer. + bool floating_point_sample; + + // Bit depth of the original (uncompressed) image samples. Must be in the + // range [1, 32]. + uint32_t bits_per_sample; + + // Floating point exponent bits of the original (uncompressed) image samples, + // only used if floating_point_sample is true. 
+ // If used, the samples are floating point with: + // - 1 sign bit + // - exponent_bits_per_sample exponent bits + // - (bits_per_sample - exponent_bits_per_sample - 1) mantissa bits + // If used, exponent_bits_per_sample must be in the range + // [2, 8] and amount of mantissa bits must be in the range [2, 23]. + // NOTE: exponent_bits_per_sample is 8 for single precision binary32 + // point, 5 for half precision binary16, 7 for fp24. + uint32_t exponent_bits_per_sample; +}; + +// Describes one extra channel. +struct ExtraChannelInfo : public Fields { + ExtraChannelInfo(); + const char* Name() const override { return "ExtraChannelInfo"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + mutable bool all_default; + + ExtraChannel type; + BitDepth bit_depth; + uint32_t dim_shift; // downsampled by 2^dim_shift on each axis + + std::string name; // UTF-8 + + // Conditional: + bool alpha_associated; // i.e. premultiplied + float spot_color[4]; // spot color in linear RGBA + uint32_t cfa_channel; +}; + +struct OpsinInverseMatrix : public Fields { + OpsinInverseMatrix(); + const char* Name() const override { return "OpsinInverseMatrix"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + mutable bool all_default; + + float inverse_matrix[9]; + float opsin_biases[3]; + float quant_biases[4]; +}; + +// Information useful for mapping HDR images to lower dynamic range displays. +struct ToneMapping : public Fields { + ToneMapping(); + const char* Name() const override { return "ToneMapping"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + mutable bool all_default; + + // Upper bound on the intensity level present in the image. For unsigned + // integer pixel encodings, this is the brightness of the largest + // representable value. The image does not necessarily contain a pixel + // actually this bright. An encoder is allowed to set 255 for SDR images + // without computing a histogram. 
+ float intensity_target; // [nits] + + // Lower bound on the intensity level present in the image. This may be + // loose, i.e. lower than the actual darkest pixel. When tone mapping, a + // decoder will map [min_nits, intensity_target] to the display range. + float min_nits; + + bool relative_to_max_display; // see below + // The tone mapping will leave unchanged (linear mapping) any pixels whose + // brightness is strictly below this. The interpretation depends on + // relative_to_max_display. If true, this is a ratio [0, 1] of the maximum + // display brightness [nits], otherwise an absolute brightness [nits]. + float linear_below; +}; + +// Contains weights to customize some transforms - in particular, XYB and +// upsampling. +struct CustomTransformData : public Fields { + CustomTransformData(); + const char* Name() const override { return "CustomTransformData"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Must be set before calling VisitFields. Must equal xyb_encoded of + // ImageMetadata, should be set by ImageMetadata during VisitFields. + bool nonserialized_xyb_encoded = false; + + mutable bool all_default; + + OpsinInverseMatrix opsin_inverse_matrix; + + uint32_t custom_weights_mask; + float upsampling2_weights[15]; + float upsampling4_weights[55]; + float upsampling8_weights[210]; +}; + +// Properties of the original image bundle. This enables Encode(Decode()) to +// re-create an equivalent image without user input. +struct ImageMetadata : public Fields { + ImageMetadata(); + const char* Name() const override { return "ImageMetadata"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + // Returns bit depth of the JPEG XL compressed alpha channel, or 0 if no alpha + // channel present. In the theoretical case that there are multiple alpha + // channels, returns the bit depth of the first.
+ uint32_t GetAlphaBits() const { + const ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha); + if (alpha == nullptr) return 0; + JXL_ASSERT(alpha->bit_depth.bits_per_sample != 0); + return alpha->bit_depth.bits_per_sample; + } + + // Sets bit depth of alpha channel, adding extra channel if needed, or + // removing all alpha channels if bits is 0. + // Assumes integer alpha channel and not designed to support multiple + // alpha channels (it's possible to use those features by manipulating + // extra_channel_info directly). + // + // Callers must insert the actual channel image at the same index before any + // further modifications to extra_channel_info. + void SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied = false); + + bool HasAlpha() const { return GetAlphaBits() != 0; } + + // Sets the original bit depth fields to indicate unsigned integer of the + // given bit depth. + // TODO(lode): move function to BitDepth + void SetUintSamples(uint32_t bits) { + bit_depth.bits_per_sample = bits; + bit_depth.exponent_bits_per_sample = 0; + bit_depth.floating_point_sample = false; + // RCT / Squeeze may add one bit each, and this is about int16_t, + // so uint13 should still be OK but limiting it to 12 seems safer. + // TODO(jon): figure out a better way to set this header field. + // (in particular, if modular mode is not used it doesn't matter, + // and if transforms are restricted, up to 15-bit could be done) + if (bits > 12) modular_16_bit_buffer_sufficient = false; + } + // Sets the original bit depth fields to indicate single precision floating + // point. 
+ // TODO(lode): move function to BitDepth + void SetFloat32Samples() { + bit_depth.bits_per_sample = 32; + bit_depth.exponent_bits_per_sample = 8; + bit_depth.floating_point_sample = true; + modular_16_bit_buffer_sufficient = false; + } + + void SetFloat16Samples() { + bit_depth.bits_per_sample = 16; + bit_depth.exponent_bits_per_sample = 5; + bit_depth.floating_point_sample = true; + modular_16_bit_buffer_sufficient = false; + } + + void SetIntensityTarget(float intensity_target) { + tone_mapping.intensity_target = intensity_target; + } + float IntensityTarget() const { + JXL_ASSERT(tone_mapping.intensity_target != 0); + return tone_mapping.intensity_target; + } + + // Returns first ExtraChannelInfo of the given type, or nullptr if none. + const ExtraChannelInfo* Find(ExtraChannel type) const { + for (const ExtraChannelInfo& eci : extra_channel_info) { + if (eci.type == type) return &eci; + } + return nullptr; + } + + // Returns first ExtraChannelInfo of the given type, or nullptr if none. + ExtraChannelInfo* Find(ExtraChannel type) { + for (ExtraChannelInfo& eci : extra_channel_info) { + if (eci.type == type) return &eci; + } + return nullptr; + } + + Orientation GetOrientation() const { + return static_cast(orientation); + } + + bool ExtraFieldsDefault() const; + + mutable bool all_default; + + BitDepth bit_depth; + bool modular_16_bit_buffer_sufficient; // otherwise 32 is. + + // Whether the color values of the pixels of frames are encoded in the + // codestream using the absolute XYB color space, or using values that + // follow the color space defined by the ColorEncoding or ICC profile. This + // determines when or whether a CMS (Color Management System) is needed to get + // the pixels in a desired color space.
In one case, the pixels have one known + // color space and a CMS is needed to convert them to the original image's + // color space, in the other case the pixels have the color space of the + // original image and a CMS is required if a different display space, or a + // single known consistent color space for multiple decoded images, is + // desired. In all cases, the color space of all frames from a single image is + // the same, both VarDCT and modular frames. + // + // If true: then frames can be decoded to XYB (which can also be converted to + // linear and non-linear sRGB with the built in conversion without CMS). The + // attached ColorEncoding or ICC profile has no effect on the meaning of the + // pixel's color values, but instead indicates what the color profile of the + // original image was, and what color profile one should convert to when + // decoding to integers to prevent clipping and precision loss. To do that + // conversion requires a CMS. + // + // If false: then the color values of decoded frames are in the space defined + // by the attached ColorEncoding or ICC profile. To instead get the pixels in + // a chosen known color space, such as sRGB, requires a CMS, since the + // attached ColorEncoding or ICC profile could be any arbitrary color space. + // This mode is typically used for lossless images encoded as integers. + // Frames can also use YCbCr encoding, some frames may and some may not, but + // this is not a different color space but a certain encoding of the RGB + // values. 
+ // + // Note: if !xyb_encoded, but the attached color profile indicates XYB (which + // can happen either if it's a ColorEncoding with color_space_ == + // ColorSpace::kXYB, or if it's an ICC Profile that has been crafted to + // represent XYB), then the frames still may not use ColorEncoding kXYB, they + // must still use kNone (or kYCbCr, which would mean applying the YCbCr + // transform to the 3-channel XYB data), since with !xyb_encoded, the 3 + // channels are stored as-is, no matter what meaning the color profile assigns + // to them. To use ColorEncoding::kXYB, xyb_encoded must be true. + // + // This value is defined in image metadata because this is the global + // codestream header. This value does not affect the image itself, so is not + // image metadata per se, it only affects the encoding, and what color space + // the decoder can receive the pixels in without needing a CMS. + bool xyb_encoded; + + ColorEncoding color_encoding; + + // These values are initialized to defaults such that the 'extra_fields' + // condition in VisitFields uses correctly initialized values. + uint32_t orientation = 1; + bool have_preview = false; + bool have_animation = false; + bool have_intrinsic_size = false; + + // If present, the stored image has the dimensions of the first SizeHeader, + // but decoders are advised to resample or display per `intrinsic_size`. + SizeHeader intrinsic_size; // only if have_intrinsic_size + + ToneMapping tone_mapping; + + // When reading: deserialized. When writing: automatically set from vector. + uint32_t num_extra_channels; + std::vector extra_channel_info; + + // Only present if m.have_preview. + PreviewHeader preview_size; + // Only present if m.have_animation. + AnimationHeader animation; + + uint64_t extensions; + + // Option to stop parsing after basic info, and treat as if the later + // fields do not participate. Use to parse only basic image information + // excluding the final larger or variable sized data. 
+ bool nonserialized_only_parse_basic_info = false; +}; + +Status ReadImageMetadata(BitReader* JXL_RESTRICT reader, + ImageMetadata* JXL_RESTRICT metadata); + +Status WriteImageMetadata(const ImageMetadata& metadata, + BitWriter* JXL_RESTRICT writer, size_t layer, + AuxOut* aux_out); + +// All metadata applicable to the entire codestream (dimensions, extra channels, +// ...) +struct CodecMetadata { + // TODO(lode): use the preview and animation fields too, in place of the + // nonserialized_ ones in ImageMetadata. + ImageMetadata m; + // The size of the codestream: this is the nominal size applicable to all + // frames, although some frames can have a different effective size through + // crop, dc_level or representing the preview. + SizeHeader size; + // Often default. + CustomTransformData transform_data; + + size_t xsize() const { return size.xsize(); } + size_t ysize() const { return size.ysize(); } + size_t oriented_xsize(bool keep_orientation) const { + if (static_cast(m.GetOrientation()) > 4 && !keep_orientation) { + return ysize(); + } else { + return xsize(); + } + } + size_t oriented_preview_xsize(bool keep_orientation) const { + if (static_cast(m.GetOrientation()) > 4 && !keep_orientation) { + return m.preview_size.ysize(); + } else { + return m.preview_size.xsize(); + } + } + size_t oriented_ysize(bool keep_orientation) const { + if (static_cast(m.GetOrientation()) > 4 && !keep_orientation) { + return xsize(); + } else { + return ysize(); + } + } + size_t oriented_preview_ysize(bool keep_orientation) const { + if (static_cast(m.GetOrientation()) > 4 && !keep_orientation) { + return m.preview_size.xsize(); + } else { + return m.preview_size.ysize(); + } + } +}; + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_METADATA_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_ops.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_ops.h new file mode 100644 index 0000000000..f3c2b5995e --- /dev/null +++
b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_ops.h @@ -0,0 +1,814 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_IMAGE_OPS_H_ +#define LIB_JXL_IMAGE_OPS_H_ + +// Operations on images. + +#include +#include +#include +#include + +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +void CopyImageTo(const Plane& from, Plane* JXL_RESTRICT to) { + PROFILER_ZONE("CopyImage1"); + JXL_ASSERT(SameSize(from, *to)); + if (from.ysize() == 0 || from.xsize() == 0) return; + for (size_t y = 0; y < from.ysize(); ++y) { + const T* JXL_RESTRICT row_from = from.ConstRow(y); + T* JXL_RESTRICT row_to = to->Row(y); + memcpy(row_to, row_from, from.xsize() * sizeof(T)); + } +} + +// DEPRECATED - prefer to preallocate result. +template +Plane CopyImage(const Plane& from) { + Plane to(from.xsize(), from.ysize()); + CopyImageTo(from, &to); + return to; +} + +// Copies `from:rect_from` to `to:rect_to`. +template +void CopyImageTo(const Rect& rect_from, const Plane& from, + const Rect& rect_to, Plane* JXL_RESTRICT to) { + PROFILER_ZONE("CopyImageR"); + JXL_DASSERT(SameSize(rect_from, rect_to)); + JXL_DASSERT(rect_from.IsInside(from)); + JXL_DASSERT(rect_to.IsInside(*to)); + if (rect_from.xsize() == 0) return; + for (size_t y = 0; y < rect_from.ysize(); ++y) { + const T* JXL_RESTRICT row_from = rect_from.ConstRow(from, y); + T* JXL_RESTRICT row_to = rect_to.Row(to, y); + memcpy(row_to, row_from, rect_from.xsize() * sizeof(T)); + } +} + +// DEPRECATED - Returns a copy of the "image" pixels that lie in "rect". +template +Plane CopyImage(const Rect& rect, const Plane& image) { + Plane copy(rect.xsize(), rect.ysize()); + CopyImageTo(rect, image, ©); + return copy; +} + +// Copies `from:rect_from` to `to:rect_to`. 
+template +void CopyImageTo(const Rect& rect_from, const Image3& from, + const Rect& rect_to, Image3* JXL_RESTRICT to) { + PROFILER_ZONE("CopyImageR"); + JXL_ASSERT(SameSize(rect_from, rect_to)); + for (size_t c = 0; c < 3; c++) { + CopyImageTo(rect_from, from.Plane(c), rect_to, &to->Plane(c)); + } +} + +template +void ConvertPlaneAndClamp(const Rect& rect_from, const Plane& from, + const Rect& rect_to, Plane* JXL_RESTRICT to) { + PROFILER_ZONE("ConvertPlane"); + JXL_ASSERT(SameSize(rect_from, rect_to)); + using M = decltype(T() + U()); + for (size_t y = 0; y < rect_to.ysize(); ++y) { + const T* JXL_RESTRICT row_from = rect_from.ConstRow(from, y); + U* JXL_RESTRICT row_to = rect_to.Row(to, y); + for (size_t x = 0; x < rect_to.xsize(); ++x) { + row_to[x] = + std::min(std::max(row_from[x], std::numeric_limits::min()), + std::numeric_limits::max()); + } + } +} + +// Copies `from` to `to`. +template +void CopyImageTo(const T& from, T* JXL_RESTRICT to) { + return CopyImageTo(Rect(from), from, Rect(*to), to); +} + +// Copies `from:rect_from` to `to`. +template +void CopyImageTo(const Rect& rect_from, const T& from, T* JXL_RESTRICT to) { + return CopyImageTo(rect_from, from, Rect(*to), to); +} + +// Copies `from` to `to:rect_to`. +template +void CopyImageTo(const T& from, const Rect& rect_to, T* JXL_RESTRICT to) { + return CopyImageTo(Rect(from), from, rect_to, to); +} + +// Copies `from:rect_from` to `to:rect_to`; also copies `padding` pixels of +// border around `from:rect_from`, in all directions, whenever they are inside +// the first image. 
+template +void CopyImageToWithPadding(const Rect& from_rect, const T& from, + size_t padding, const Rect& to_rect, T* to) { + size_t xextra0 = std::min(padding, from_rect.x0()); + size_t xextra1 = + std::min(padding, from.xsize() - from_rect.x0() - from_rect.xsize()); + size_t yextra0 = std::min(padding, from_rect.y0()); + size_t yextra1 = + std::min(padding, from.ysize() - from_rect.y0() - from_rect.ysize()); + JXL_DASSERT(to_rect.x0() >= xextra0); + JXL_DASSERT(to_rect.y0() >= yextra0); + + return CopyImageTo(Rect(from_rect.x0() - xextra0, from_rect.y0() - yextra0, + from_rect.xsize() + xextra0 + xextra1, + from_rect.ysize() + yextra0 + yextra1), + from, + Rect(to_rect.x0() - xextra0, to_rect.y0() - yextra0, + to_rect.xsize() + xextra0 + xextra1, + to_rect.ysize() + yextra0 + yextra1), + to); +} + +// DEPRECATED - prefer to preallocate result. +template +Image3 CopyImage(const Image3& from) { + Image3 copy(from.xsize(), from.ysize()); + CopyImageTo(from, ©); + return copy; +} + +// DEPRECATED - prefer to preallocate result. +template +Image3 CopyImage(const Rect& rect, const Image3& from) { + Image3 to(rect.xsize(), rect.ysize()); + CopyImageTo(rect, from.Plane(0), to.Plane(0)); + CopyImageTo(rect, from.Plane(1), to.Plane(1)); + CopyImageTo(rect, from.Plane(2), to.Plane(2)); + return to; +} + +// Sets "thickness" pixels on each border to "value". This is faster than +// initializing the entire image and overwriting valid/interior pixels. 
+template +void SetBorder(const size_t thickness, const T value, Image3* image) { + const size_t xsize = image->xsize(); + const size_t ysize = image->ysize(); + // Top: fill entire row + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < std::min(thickness, ysize); ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + std::fill(row, row + xsize, value); + } + + // Bottom: fill entire row + for (size_t y = ysize - thickness; y < ysize; ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + std::fill(row, row + xsize, value); + } + + // Left/right: fill the 'columns' on either side, but only if the image is + // big enough that they don't already belong to the top/bottom rows. + if (ysize >= 2 * thickness) { + for (size_t y = thickness; y < ysize - thickness; ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + std::fill(row, row + thickness, value); + std::fill(row + xsize - thickness, row + xsize, value); + } + } + } +} + +template +void Subtract(const ImageIn& image1, const ImageIn& image2, ImageOut* out) { + using T = typename ImageIn::T; + const size_t xsize = image1.xsize(); + const size_t ysize = image1.ysize(); + JXL_CHECK(xsize == image2.xsize()); + JXL_CHECK(ysize == image2.ysize()); + + for (size_t y = 0; y < ysize; ++y) { + const T* const JXL_RESTRICT row1 = image1.Row(y); + const T* const JXL_RESTRICT row2 = image2.Row(y); + T* const JXL_RESTRICT row_out = out->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row1[x] - row2[x]; + } + } +} + +// In-place. +template +void SubtractFrom(const Plane& what, Plane* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstRow(y); + Tout* JXL_RESTRICT row_to = to->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] -= row_what[x]; + } + } +} + +// In-place. 
+template +void AddTo(const Plane& what, Plane* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstRow(y); + Tout* JXL_RESTRICT row_to = to->Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] += row_what[x]; + } + } +} + +template +void AddTo(Rect rectFrom, const Plane& what, Rect rectTo, + Plane* to) { + JXL_ASSERT(SameSize(rectFrom, rectTo)); + const size_t xsize = rectTo.xsize(); + const size_t ysize = rectTo.ysize(); + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = rectFrom.ConstRow(what, y); + Tout* JXL_RESTRICT row_to = rectTo.Row(to, y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] += row_what[x]; + } + } +} + +// Returns linear combination of two grayscale images. +template +Plane LinComb(const T lambda1, const Plane& image1, const T lambda2, + const Plane& image2) { + const size_t xsize = image1.xsize(); + const size_t ysize = image1.ysize(); + JXL_CHECK(xsize == image2.xsize()); + JXL_CHECK(ysize == image2.ysize()); + Plane out(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + const T* const JXL_RESTRICT row1 = image1.Row(y); + const T* const JXL_RESTRICT row2 = image2.Row(y); + T* const JXL_RESTRICT row_out = out.Row(y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = lambda1 * row1[x] + lambda2 * row2[x]; + } + } + return out; +} + +// Returns a pixel-by-pixel multiplication of image by lambda. 
+template +Plane ScaleImage(const T lambda, const Plane& image) { + Plane out(image.xsize(), image.ysize()); + for (size_t y = 0; y < image.ysize(); ++y) { + const T* const JXL_RESTRICT row = image.Row(y); + T* const JXL_RESTRICT row_out = out.Row(y); + for (size_t x = 0; x < image.xsize(); ++x) { + row_out[x] = lambda * row[x]; + } + } + return out; +} + +// Multiplies image by lambda in-place +template +void ScaleImage(const T lambda, Plane* image) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = lambda * row[x]; + } + } +} + +template +Plane Product(const Plane& a, const Plane& b) { + Plane c(a.xsize(), a.ysize()); + for (size_t y = 0; y < a.ysize(); ++y) { + const T* const JXL_RESTRICT row_a = a.Row(y); + const T* const JXL_RESTRICT row_b = b.Row(y); + T* const JXL_RESTRICT row_c = c.Row(y); + for (size_t x = 0; x < a.xsize(); ++x) { + row_c[x] = row_a[x] * row_b[x]; + } + } + return c; +} + +float DotProduct(const ImageF& a, const ImageF& b); + +template +void FillImage(const T value, Plane* image) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = value; + } + } +} + +template +void ZeroFillImage(Plane* image) { + if (image->xsize() == 0) return; + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + memset(row, 0, image->xsize() * sizeof(T)); + } +} + +// Mirrors out of bounds coordinates and returns valid coordinates unchanged. +// We assume the radius (distance outside the image) is small compared to the +// image size, otherwise this might not terminate. +// The mirror is outside the last column (border pixel is also replicated). 
+static inline int64_t Mirror(int64_t x, const int64_t xsize) { + JXL_DASSERT(xsize != 0); + + // TODO(janwas): replace with branchless version + while (x < 0 || x >= xsize) { + if (x < 0) { + x = -x - 1; + } else { + x = 2 * xsize - 1 - x; + } + } + return x; +} + +// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size): + +// Mirrors (repeating the edge pixel once). Useful for convolutions. +struct WrapMirror { + JXL_INLINE int64_t operator()(const int64_t coord, const int64_t size) const { + return Mirror(coord, size); + } +}; + +// Returns the same coordinate: required for TFNode with Border(), or useful +// when we know "coord" is already valid (e.g. interior of an image). +struct WrapUnchanged { + JXL_INLINE int64_t operator()(const int64_t coord, int64_t /*size*/) const { + return coord; + } +}; + +// Similar to Wrap* but for row pointers (reduces Row() multiplications). + +class WrapRowMirror { + public: + template + WrapRowMirror(const ImageOrView& image, size_t ysize) + : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {} + + const float* operator()(const float* const JXL_RESTRICT row, + const int64_t stride) const { + if (row < first_row_) { + const int64_t num_before = first_row_ - row; + // Mirrored; one row before => row 0, two before = row 1, ... + return first_row_ + num_before - stride; + } + if (row > last_row_) { + const int64_t num_after = row - last_row_; + // Mirrored; one row after => last row, two after = last - 1, ... + return last_row_ - num_after + stride; + } + return row; + } + + private: + const float* const JXL_RESTRICT first_row_; + const float* const JXL_RESTRICT last_row_; +}; + +struct WrapRowUnchanged { + JXL_INLINE const float* operator()(const float* const JXL_RESTRICT row, + int64_t /*stride*/) const { + return row; + } +}; + +// Sets "thickness" pixels on each border to "value". This is faster than +// initializing the entire image and overwriting valid/interior pixels. 
+template +void SetBorder(const size_t thickness, const T value, Plane* image) { + const size_t xsize = image->xsize(); + const size_t ysize = image->ysize(); + // Top: fill entire row + for (size_t y = 0; y < std::min(thickness, ysize); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + std::fill(row, row + xsize, value); + } + + // Bottom: fill entire row + for (size_t y = ysize - thickness; y < ysize; ++y) { + T* const JXL_RESTRICT row = image->Row(y); + std::fill(row, row + xsize, value); + } + + // Left/right: fill the 'columns' on either side, but only if the image is + // big enough that they don't already belong to the top/bottom rows. + if (ysize >= 2 * thickness) { + for (size_t y = thickness; y < ysize - thickness; ++y) { + T* const JXL_RESTRICT row = image->Row(y); + std::fill(row, row + thickness, value); + std::fill(row + xsize - thickness, row + xsize, value); + } + } +} + +// Computes the minimum and maximum pixel value. +template +void ImageMinMax(const Plane& image, T* const JXL_RESTRICT min, + T* const JXL_RESTRICT max) { + *min = std::numeric_limits::max(); + *max = std::numeric_limits::lowest(); + for (size_t y = 0; y < image.ysize(); ++y) { + const T* const JXL_RESTRICT row = image.Row(y); + for (size_t x = 0; x < image.xsize(); ++x) { + *min = std::min(*min, row[x]); + *max = std::max(*max, row[x]); + } + } +} + +// Copies pixels, scaling their value relative to the "from" min/max by +// "to_range". Example: U8 [0, 255] := [0.0, 1.0], to_range = 1.0 => +// outputs [0.0, 1.0]. 
+template +void ImageConvert(const Plane& from, const float to_range, + Plane* const JXL_RESTRICT to) { + JXL_ASSERT(SameSize(from, *to)); + FromType min_from, max_from; + ImageMinMax(from, &min_from, &max_from); + const float scale = to_range / (max_from - min_from); + for (size_t y = 0; y < from.ysize(); ++y) { + const FromType* const JXL_RESTRICT row_from = from.Row(y); + ToType* const JXL_RESTRICT row_to = to->Row(y); + for (size_t x = 0; x < from.xsize(); ++x) { + row_to[x] = static_cast((row_from[x] - min_from) * scale); + } + } +} + +template +Plane ConvertToFloat(const Plane& from) { + float factor = 1.0f / std::numeric_limits::max(); + if (std::is_same::value || std::is_same::value) { + factor = 1.0f; + } + Plane to(from.xsize(), from.ysize()); + for (size_t y = 0; y < from.ysize(); ++y) { + const From* const JXL_RESTRICT row_from = from.Row(y); + float* const JXL_RESTRICT row_to = to.Row(y); + for (size_t x = 0; x < from.xsize(); ++x) { + row_to[x] = row_from[x] * factor; + } + } + return to; +} + +template +Plane ImageFromPacked(const std::vector& packed, const size_t xsize, + const size_t ysize) { + Plane out(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + T* const JXL_RESTRICT row = out.Row(y); + const T* const JXL_RESTRICT packed_row = &packed[y * xsize]; + memcpy(row, packed_row, xsize * sizeof(T)); + } + return out; +} + +// Computes independent minimum and maximum values for each plane. 
+template +void Image3MinMax(const Image3& image, const Rect& rect, + std::array* out_min, std::array* out_max) { + for (size_t c = 0; c < 3; ++c) { + T min = std::numeric_limits::max(); + T max = std::numeric_limits::min(); + for (size_t y = 0; y < rect.ysize(); ++y) { + const T* JXL_RESTRICT row = rect.ConstPlaneRow(image, c, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + min = std::min(min, row[x]); + max = std::max(max, row[x]); + } + } + (*out_min)[c] = min; + (*out_max)[c] = max; + } +} + +// Computes independent minimum and maximum values for each plane. +template +void Image3MinMax(const Image3& image, std::array* out_min, + std::array* out_max) { + Image3MinMax(image, Rect(image), out_min, out_max); +} + +template +void Image3Max(const Image3& image, std::array* out_max) { + for (size_t c = 0; c < 3; ++c) { + T max = std::numeric_limits::min(); + for (size_t y = 0; y < image.ysize(); ++y) { + const T* JXL_RESTRICT row = image.ConstPlaneRow(c, y); + for (size_t x = 0; x < image.xsize(); ++x) { + max = std::max(max, row[x]); + } + } + (*out_max)[c] = max; + } +} + +// Computes the sum of the pixels in `rect`. 
+template +T ImageSum(const Plane& image, const Rect& rect) { + T result = 0; + for (size_t y = 0; y < rect.ysize(); ++y) { + const T* JXL_RESTRICT row = rect.ConstRow(image, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + result += row[x]; + } + } + return result; +} + +template +T ImageSum(const Plane& image) { + return ImageSum(image, Rect(image)); +} + +template +std::array Image3Sum(const Image3& image, const Rect& rect) { + std::array out_sum = 0; + for (size_t c = 0; c < 3; ++c) { + (out_sum)[c] = ImageSum(image.Plane(c), rect); + } + return out_sum; +} + +template +std::array Image3Sum(const Image3& image) { + return Image3Sum(image, Rect(image)); +} + +template +std::vector PackedFromImage(const Plane& image, const Rect& rect) { + const size_t xsize = rect.xsize(); + const size_t ysize = rect.ysize(); + std::vector packed(xsize * ysize); + for (size_t y = 0; y < rect.ysize(); ++y) { + memcpy(&packed[y * xsize], rect.ConstRow(image, y), xsize * sizeof(T)); + } + return packed; +} + +template +std::vector PackedFromImage(const Plane& image) { + return PackedFromImage(image, Rect(image)); +} + +// Computes the median pixel value. 
+template +T ImageMedian(const Plane& image, const Rect& rect) { + std::vector pixels = PackedFromImage(image, rect); + return Median(&pixels); +} + +template +T ImageMedian(const Plane& image) { + return ImageMedian(image, Rect(image)); +} + +template +std::array Image3Median(const Image3& image, const Rect& rect) { + std::array out_median; + for (size_t c = 0; c < 3; ++c) { + (out_median)[c] = ImageMedian(image.Plane(c), rect); + } + return out_median; +} + +template +std::array Image3Median(const Image3& image) { + return Image3Median(image, Rect(image)); +} + +template +void Image3Convert(const Image3& from, const float to_range, + Image3* const JXL_RESTRICT to) { + JXL_ASSERT(SameSize(from, *to)); + std::array min_from, max_from; + Image3MinMax(from, &min_from, &max_from); + float scales[3]; + for (size_t c = 0; c < 3; ++c) { + scales[c] = to_range / (max_from[c] - min_from[c]); + } + float scale = std::min(scales[0], std::min(scales[1], scales[2])); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < from.ysize(); ++y) { + const FromType* JXL_RESTRICT row_from = from.ConstPlaneRow(c, y); + ToType* JXL_RESTRICT row_to = to->PlaneRow(c, y); + for (size_t x = 0; x < from.xsize(); ++x) { + const float to = (row_from[x] - min_from[c]) * scale; + row_to[x] = static_cast(to); + } + } + } +} + +template +Image3F ConvertToFloat(const Image3& from) { + return Image3F(ConvertToFloat(from.Plane(0)), ConvertToFloat(from.Plane(1)), + ConvertToFloat(from.Plane(2))); +} + +template +void Subtract(const Image3& image1, const Image3& image2, + Image3* out) { + const size_t xsize = image1.xsize(); + const size_t ysize = image1.ysize(); + JXL_CHECK(xsize == image2.xsize()); + JXL_CHECK(ysize == image2.ysize()); + + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const Tin* const JXL_RESTRICT row1 = image1.ConstPlaneRow(c, y); + const Tin* const JXL_RESTRICT row2 = image2.ConstPlaneRow(c, y); + Tout* const JXL_RESTRICT row_out = out->PlaneRow(c, 
y); + for (size_t x = 0; x < xsize; ++x) { + row_out[x] = row1[x] - row2[x]; + } + } + } +} + +template +void SubtractFrom(const Image3& what, Image3* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y); + Tout* JXL_RESTRICT row_to = to->PlaneRow(c, y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] -= row_what[x]; + } + } + } +} + +template +void AddTo(const Image3& what, Image3* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y); + Tout* JXL_RESTRICT row_to = to->PlaneRow(c, y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] += row_what[x]; + } + } + } +} + +// Adds `what` of the size of `rect` to `to` in the position of `rect`. +template +void AddTo(const Rect& rect, const Image3& what, Image3* to) { + const size_t xsize = what.xsize(); + const size_t ysize = what.ysize(); + JXL_ASSERT(xsize == rect.xsize()); + JXL_ASSERT(ysize == rect.ysize()); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y); + Tout* JXL_RESTRICT row_to = rect.PlaneRow(to, c, y); + for (size_t x = 0; x < xsize; ++x) { + row_to[x] += row_what[x]; + } + } + } +} + +template +Image3 ScaleImage(const T lambda, const Image3& image) { + Image3 out(image.xsize(), image.ysize()); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image.ysize(); ++y) { + const T* JXL_RESTRICT row = image.ConstPlaneRow(c, y); + T* JXL_RESTRICT row_out = out.PlaneRow(c, y); + for (size_t x = 0; x < image.xsize(); ++x) { + row_out[x] = lambda * row[x]; + } + } + } + return out; +} + +// Multiplies image by lambda in-place +template +void ScaleImage(const T lambda, Image3* image) { + for (size_t c 
= 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->PlaneRow(c, y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = lambda * row[x]; + } + } + } +} + +// Initializes all planes to the same "value". +template +void FillImage(const T value, Image3* image) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = value; + } + } + } +} + +template +void FillPlane(const T value, Plane* image) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* JXL_RESTRICT row = image->Row(y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = value; + } + } +} + +template +void FillImage(const T value, Image3* image, Rect rect) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < rect.ysize(); ++y) { + T* JXL_RESTRICT row = rect.PlaneRow(image, c, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + row[x] = value; + } + } + } +} + +template +void FillPlane(const T value, Plane* image, Rect rect) { + for (size_t y = 0; y < rect.ysize(); ++y) { + T* JXL_RESTRICT row = rect.Row(image, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + row[x] = value; + } + } +} + +template +void ZeroFillImage(Image3* image) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + memset(row, 0, image->xsize() * sizeof(T)); + } + } +} + +template +void ZeroFillPlane(Plane* image, Rect rect) { + for (size_t y = 0; y < rect.ysize(); ++y) { + T* JXL_RESTRICT row = rect.Row(image, y); + memset(row, 0, rect.xsize() * sizeof(T)); + } +} + +// First, image is padded horizontally, with the rightmost value. +// Next, image is padded vertically, by repeating the last line. 
+ImageF PadImage(const ImageF& in, size_t xsize, size_t ysize); + +// Pad an image with xborder columns on each vertical side and yboder rows +// above and below, mirroring the image. +Image3F PadImageMirror(const Image3F& in, size_t xborder, size_t yborder); + +// First, image is padded horizontally, with the rightmost value. +// Next, image is padded vertically, by repeating the last line. +// Prefer PadImageToBlockMultipleInPlace if padding to kBlockDim. +Image3F PadImageToMultiple(const Image3F& in, size_t N); + +// Same as above, but operates in-place. Assumes that the `in` image was +// allocated large enough. +void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in); + +// Downsamples an image by a given factor. +void DownsampleImage(Image3F* opsin, size_t factor); +void DownsampleImage(ImageF* image, size_t factor); + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_OPS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_ops_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_ops_test.cc new file mode 100644 index 0000000000..84801feb5a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_ops_test.cc @@ -0,0 +1,133 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/image_ops.h" + +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_test_utils.h" + +namespace jxl { +namespace { + +template +void TestPacked(const size_t xsize, const size_t ysize) { + Plane image1(xsize, ysize); + RandomFillImage(&image1); + const std::vector& packed = PackedFromImage(image1); + const Plane& image2 = ImageFromPacked(packed, xsize, ysize); + EXPECT_TRUE(SamePixels(image1, image2)); +} + +TEST(ImageTest, TestPacked) { + TestPacked(1, 1); + TestPacked(7, 1); + TestPacked(1, 7); + + TestPacked(1, 1); + TestPacked(7, 1); + TestPacked(1, 7); + + TestPacked(1, 1); + TestPacked(7, 1); + TestPacked(1, 7); + + TestPacked(1, 1); + TestPacked(7, 1); + TestPacked(1, 7); +} + +// Ensure entire payload is readable/writable for various size/offset combos. +TEST(ImageTest, TestAllocator) { + std::mt19937 rng(129); + const size_t k32 = 32; + const size_t kAlign = CacheAligned::kAlignment; + for (size_t size : {k32 * 1, k32 * 2, k32 * 3, k32 * 4, k32 * 5, + CacheAligned::kAlias, 2 * CacheAligned::kAlias + 4}) { + for (size_t offset = 0; offset <= CacheAligned::kAlias; offset += kAlign) { + uint8_t* bytes = + static_cast(CacheAligned::Allocate(size, offset)); + JXL_CHECK(reinterpret_cast(bytes) % kAlign == 0); + // Ensure we can write/read the last byte. Use RNG to fool the compiler + // into thinking the write is necessary. 
+ memset(bytes, 0, size); + bytes[size - 1] = 1; // greatest element + std::uniform_int_distribution dist(0, size - 1); + uint32_t pos = dist(rng); // random but != greatest + while (pos == size - 1) { + pos = dist(rng); + } + JXL_CHECK(bytes[pos] < bytes[size - 1]); + + CacheAligned::Free(bytes); + } + } +} + +template +void TestFillImpl(Image3* img, const char* layout) { + FillImage(T(1), img); + for (size_t y = 0; y < img->ysize(); ++y) { + for (size_t c = 0; c < 3; ++c) { + T* JXL_RESTRICT row = img->PlaneRow(c, y); + for (size_t x = 0; x < img->xsize(); ++x) { + if (row[x] != T(1)) { + printf("Not 1 at c=%zu %zu, %zu (%zu x %zu) (%s)\n", c, x, y, + img->xsize(), img->ysize(), layout); + abort(); + } + row[x] = T(2); + } + } + } + + // Same for ZeroFillImage and swapped c/y loop ordering. + ZeroFillImage(img); + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < img->ysize(); ++y) { + T* JXL_RESTRICT row = img->PlaneRow(c, y); + for (size_t x = 0; x < img->xsize(); ++x) { + if (row[x] != T(0)) { + printf("Not 0 at c=%zu %zu, %zu (%zu x %zu) (%s)\n", c, x, y, + img->xsize(), img->ysize(), layout); + abort(); + } + row[x] = T(3); + } + } + } +} + +template +void TestFillT() { + for (uint32_t xsize : {0, 1, 15, 16, 31, 32}) { + for (uint32_t ysize : {0, 1, 15, 16, 31, 32}) { + Image3 image(xsize, ysize); + TestFillImpl(&image, "size ctor"); + + Image3 planar(Plane(xsize, ysize), Plane(xsize, ysize), + Plane(xsize, ysize)); + TestFillImpl(&planar, "planar"); + } + } +} + +// Ensure y/c/x and c/y/x loops visit pixels no more than once. 
+TEST(ImageTest, TestFill) { + TestFillT(); + TestFillT(); + TestFillT(); + TestFillT(); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_test_utils.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_test_utils.h new file mode 100644 index 0000000000..e484307c14 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/image_test_utils.h @@ -0,0 +1,313 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_IMAGE_TEST_UTILS_H_ +#define LIB_JXL_IMAGE_TEST_UTILS_H_ + +#include + +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +void VerifyEqual(const Plane& expected, const Plane& actual) { + JXL_CHECK(SameSize(expected, actual)); + for (size_t y = 0; y < expected.ysize(); ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + const T* const JXL_RESTRICT row_actual = actual.Row(y); + for (size_t x = 0; x < expected.xsize(); ++x) { + ASSERT_EQ(row_expected[x], row_actual[x]) << x << " " << y; + } + } +} + +template +void VerifyEqual(const Image3& expected, const Image3& actual) { + for (size_t c = 0; c < 3; ++c) { + VerifyEqual(expected.Plane(c), actual.Plane(c)); + } +} + +template +bool SamePixels(const Plane& image1, const Plane& image2, + const Rect rect) { + if (!rect.IsInside(image1) || !rect.IsInside(image2)) { + ADD_FAILURE() << "requested rectangle is not fully inside the image"; + return false; + } + size_t mismatches = 0; + for (size_t y = rect.y0(); y < rect.ysize(); ++y) { + const T* const JXL_RESTRICT row1 = image1.Row(y); + const T* const JXL_RESTRICT row2 = image2.Row(y); + for (size_t x = rect.x0(); x < rect.xsize(); ++x) { + if (row1[x] != row2[x]) { + ADD_FAILURE() << "pixel mismatch" << x << ", " << y << ": " + << 
double(row1[x]) << " != " << double(row2[x]); + if (++mismatches > 4) { + return false; + } + } + } + } + return mismatches == 0; +} + +template +bool SamePixels(const Plane& image1, const Plane& image2) { + JXL_CHECK(SameSize(image1, image2)); + return SamePixels(image1, image2, Rect(image1)); +} + +template +bool SamePixels(const Image3& image1, const Image3& image2) { + JXL_CHECK(SameSize(image1, image2)); + for (size_t c = 0; c < 3; ++c) { + if (!SamePixels(image1.Plane(c), image2.Plane(c))) { + return false; + } + } + return true; +} + +// Use for floating-point images with fairly large numbers; tolerates small +// absolute errors and/or small relative errors. Returns max_relative. +template +void VerifyRelativeError(const Plane& expected, const Plane& actual, + const double threshold_l1, + const double threshold_relative, + const intptr_t border = 0, const size_t c = 0) { + JXL_CHECK(SameSize(expected, actual)); + const intptr_t xsize = expected.xsize(); + const intptr_t ysize = expected.ysize(); + + // Max over current scanline to give a better idea whether there are + // systematic errors or just one outlier. Invalid if negative. + double max_l1 = -1; + double max_relative = -1; + bool any_bad = false; + for (intptr_t y = border; y < ysize - border; ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + const T* const JXL_RESTRICT row_actual = actual.Row(y); + for (intptr_t x = border; x < xsize - border; ++x) { + const double l1 = std::abs(row_expected[x] - row_actual[x]); + + // Cannot compute relative, only check/update L1. + if (std::abs(row_expected[x]) < 1E-10) { + if (l1 > threshold_l1) { + any_bad = true; + max_l1 = std::max(max_l1, l1); + } + } else { + const double relative = l1 / std::abs(double(row_expected[x])); + if (l1 > threshold_l1 && relative > threshold_relative) { + // Fails both tolerances => will exit below, update max_*. 
+ any_bad = true; + max_l1 = std::max(max_l1, l1); + max_relative = std::max(max_relative, relative); + } + } + } + } + if (any_bad) { + // Never had a valid relative value, don't print it. + if (max_relative < 0) { + fprintf(stderr, "c=%zu: max +/- %E exceeds +/- %.2E\n", c, max_l1, + threshold_l1); + } else { + fprintf(stderr, "c=%zu: max +/- %E, x %E exceeds +/- %.2E, x %.2E\n", c, + max_l1, max_relative, threshold_l1, threshold_relative); + } + // Dump the expected image and actual image if the region is small enough. + const intptr_t kMaxTestDumpSize = 16; + if (xsize <= kMaxTestDumpSize + 2 * border && + ysize <= kMaxTestDumpSize + 2 * border) { + fprintf(stderr, "Expected image:\n"); + for (intptr_t y = border; y < ysize - border; ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + for (intptr_t x = border; x < xsize - border; ++x) { + fprintf(stderr, "%10lf ", static_cast(row_expected[x])); + } + fprintf(stderr, "\n"); + } + + fprintf(stderr, "Actual image:\n"); + for (intptr_t y = border; y < ysize - border; ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + const T* const JXL_RESTRICT row_actual = actual.Row(y); + for (intptr_t x = border; x < xsize - border; ++x) { + const double l1 = std::abs(row_expected[x] - row_actual[x]); + + bool bad = l1 > threshold_l1; + if (row_expected[x] > 1E-10) { + const double relative = l1 / std::abs(double(row_expected[x])); + bad &= relative > threshold_relative; + } + if (bad) { + fprintf(stderr, "%10lf ", static_cast(row_actual[x])); + } else { + fprintf(stderr, "%10s ", "=="); + } + } + fprintf(stderr, "\n"); + } + } + + // Find first failing x for further debugging. 
+ for (intptr_t y = border; y < ysize - border; ++y) { + const T* const JXL_RESTRICT row_expected = expected.Row(y); + const T* const JXL_RESTRICT row_actual = actual.Row(y); + + for (intptr_t x = border; x < xsize - border; ++x) { + const double l1 = std::abs(row_expected[x] - row_actual[x]); + + bool bad = l1 > threshold_l1; + if (row_expected[x] > 1E-10) { + const double relative = l1 / std::abs(double(row_expected[x])); + bad &= relative > threshold_relative; + } + if (bad) { + FAIL() << x << ", " << y << " (" << expected.xsize() << " x " + << expected.ysize() << ") expected " + << static_cast(row_expected[x]) << " actual " + << static_cast(row_actual[x]); + } + } + } + return; // if any_bad, we should have exited. + } +} + +template +void VerifyRelativeError(const Image3& expected, const Image3& actual, + const float threshold_l1, + const float threshold_relative, + const intptr_t border = 0) { + for (size_t c = 0; c < 3; ++c) { + VerifyRelativeError(expected.Plane(c), actual.Plane(c), threshold_l1, + threshold_relative, border, c); + } +} + +// Generator for independent, uniformly distributed integers [0, max]. 
+template +class GeneratorRandom { + public: + GeneratorRandom(Random* rng, const T max) : rng_(*rng), dist_(0, max) {} + + GeneratorRandom(Random* rng, const T min, const T max) + : rng_(*rng), dist_(min, max) {} + + T operator()(const size_t x, const size_t y, const int c) const { + return dist_(rng_); + } + + private: + Random& rng_; + mutable std::uniform_int_distribution<> dist_; +}; + +template +class GeneratorRandom { + public: + GeneratorRandom(Random* rng, const float max) + : rng_(*rng), dist_(0.0f, max) {} + + GeneratorRandom(Random* rng, const float min, const float max) + : rng_(*rng), dist_(min, max) {} + + float operator()(const size_t x, const size_t y, const int c) const { + return dist_(rng_); + } + + private: + Random& rng_; + mutable std::uniform_real_distribution dist_; +}; + +template +class GeneratorRandom { + public: + GeneratorRandom(Random* rng, const double max) + : rng_(*rng), dist_(0.0, max) {} + + GeneratorRandom(Random* rng, const double min, const double max) + : rng_(*rng), dist_(min, max) {} + + double operator()(const size_t x, const size_t y, const int c) const { + return dist_(rng_); + } + + private: + Random& rng_; + mutable std::uniform_real_distribution<> dist_; +}; + +// Assigns generator(x, y, 0) to each pixel (x, y). 
+template +void GenerateImage(const Generator& generator, Image* image) { + using T = typename Image::T; + for (size_t y = 0; y < image->ysize(); ++y) { + T* const JXL_RESTRICT row = image->Row(y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = generator(x, y, 0); + } + } +} + +template +void RandomFillImage(Plane* image, + const T max = std::numeric_limits::max()) { + std::mt19937_64 rng(129); + const GeneratorRandom generator(&rng, max); + GenerateImage(generator, image); +} + +template +void RandomFillImage(Plane* image, const T min, const T max, + const int seed) { + std::mt19937_64 rng(seed); + const GeneratorRandom generator(&rng, min, max); + GenerateImage(generator, image); +} + +// Assigns generator(x, y, c) to each pixel (x, y). +template +void GenerateImage(const Generator& generator, Image3* image) { + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < image->ysize(); ++y) { + T* JXL_RESTRICT row = image->PlaneRow(c, y); + for (size_t x = 0; x < image->xsize(); ++x) { + row[x] = generator(x, y, c); + } + } + } +} + +template +void RandomFillImage(Image3* image, + const T max = std::numeric_limits::max()) { + std::mt19937_64 rng(129); + const GeneratorRandom generator(&rng, max); + GenerateImage(generator, image); +} + +template +void RandomFillImage(Image3* image, const T min, const T max, + const int seed) { + std::mt19937_64 rng(seed); + const GeneratorRandom generator(&rng, min, max); + GenerateImage(generator, image); +} + +} // namespace jxl + +#endif // LIB_JXL_IMAGE_TEST_UTILS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc new file mode 100644 index 0000000000..f57f697139 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.cc @@ -0,0 +1,140 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/dec_jpeg_data.h" + +#include + +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" + +namespace jxl { +namespace jpeg { +Status DecodeJPEGData(Span encoded, JPEGData* jpeg_data) { + Status ret = true; + const uint8_t* in = encoded.data(); + size_t available_in = encoded.size(); + { + BitReader br(encoded); + BitReaderScopedCloser br_closer(&br, &ret); + JXL_RETURN_IF_ERROR(Bundle::Read(&br, jpeg_data)); + JXL_RETURN_IF_ERROR(br.JumpToByteBoundary()); + in += br.TotalBitsConsumed() / 8; + available_in -= br.TotalBitsConsumed() / 8; + } + JXL_RETURN_IF_ERROR(ret); + + BrotliDecoderState* brotli_dec = + BrotliDecoderCreateInstance(nullptr, nullptr, nullptr); + + struct BrotliDecDeleter { + BrotliDecoderState* brotli_dec; + ~BrotliDecDeleter() { BrotliDecoderDestroyInstance(brotli_dec); } + } brotli_dec_deleter{brotli_dec}; + + BrotliDecoderResult result = + BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS; + + auto br_read = [&](std::vector& data) -> Status { + size_t available_out = data.size(); + uint8_t* out = data.data(); + while (available_out != 0) { + if (BrotliDecoderIsFinished(brotli_dec)) { + return JXL_FAILURE("Not enough decompressed output"); + } + result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in, + &available_out, &out, nullptr); + if (result != + BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT && + result != BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS) { + return JXL_FAILURE( + "Brotli decoding error: %s\n", + BrotliDecoderErrorString(BrotliDecoderGetErrorCode(brotli_dec))); + } + } + return true; + }; + size_t num_icc = 0; + for (size_t i = 0; i < jpeg_data->app_data.size(); i++) { + auto& marker = jpeg_data->app_data[i]; + if (jpeg_data->app_marker_type[i] != AppMarkerType::kUnknown) { + // Set the size of the marker. 
+ size_t size_minus_1 = marker.size() - 1; + marker[1] = size_minus_1 >> 8; + marker[2] = size_minus_1 & 0xFF; + if (jpeg_data->app_marker_type[i] == AppMarkerType::kICC) { + if (marker.size() < 17) { + return JXL_FAILURE("ICC markers must be at least 17 bytes"); + } + marker[0] = 0xE2; + memcpy(&marker[3], kIccProfileTag, sizeof kIccProfileTag); + marker[15] = ++num_icc; + } + } else { + JXL_RETURN_IF_ERROR(br_read(marker)); + if (marker[1] * 256u + marker[2] + 1u != marker.size()) { + return JXL_FAILURE("Incorrect marker size"); + } + } + } + for (size_t i = 0; i < jpeg_data->app_data.size(); i++) { + auto& marker = jpeg_data->app_data[i]; + if (jpeg_data->app_marker_type[i] == AppMarkerType::kICC) { + marker[16] = num_icc; + } + if (jpeg_data->app_marker_type[i] == AppMarkerType::kExif) { + marker[0] = 0xE1; + if (marker.size() < 3 + sizeof kExifTag) { + return JXL_FAILURE("Incorrect Exif marker size"); + } + memcpy(&marker[3], kExifTag, sizeof kExifTag); + } + if (jpeg_data->app_marker_type[i] == AppMarkerType::kXMP) { + marker[0] = 0xE1; + if (marker.size() < 3 + sizeof kXMPTag) { + return JXL_FAILURE("Incorrect XMP marker size"); + } + memcpy(&marker[3], kXMPTag, sizeof kXMPTag); + } + } + // TODO(eustas): actually inject ICC profile and check it fits perfectly. + for (size_t i = 0; i < jpeg_data->com_data.size(); i++) { + auto& marker = jpeg_data->com_data[i]; + JXL_RETURN_IF_ERROR(br_read(marker)); + if (marker[1] * 256u + marker[2] + 1u != marker.size()) { + return JXL_FAILURE("Incorrect marker size"); + } + } + for (size_t i = 0; i < jpeg_data->inter_marker_data.size(); i++) { + JXL_RETURN_IF_ERROR(br_read(jpeg_data->inter_marker_data[i])); + } + JXL_RETURN_IF_ERROR(br_read(jpeg_data->tail_data)); + + // Check if there is more decompressed output. 
+ size_t available_out = 1; + uint64_t dummy; + uint8_t* next_out = reinterpret_cast(&dummy); + result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in, + &available_out, &next_out, nullptr); + if (available_out == 0 || + result == BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) { + return JXL_FAILURE("Excess data in compressed stream"); + } + if (result == BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT) { + return JXL_FAILURE("Incomplete brotli-stream"); + } + if (!BrotliDecoderIsFinished(brotli_dec) || + result != BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS) { + return JXL_FAILURE("Corrupted brotli-stream"); + } + if (available_in != 0) { + return JXL_FAILURE("Unused data after brotli stream"); + } + + return true; +} +} // namespace jpeg +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.h new file mode 100644 index 0000000000..b9d50bf9f8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data.h @@ -0,0 +1,19 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_JPEG_DEC_JPEG_DATA_H_ +#define LIB_JXL_JPEG_DEC_JPEG_DATA_H_ + +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { +Status DecodeJPEGData(Span encoded, JPEGData* jpeg_data); +} +} // namespace jxl + +#endif // LIB_JXL_JPEG_DEC_JPEG_DATA_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc new file mode 100644 index 0000000000..c321344ebf --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.cc @@ -0,0 +1,983 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/dec_jpeg_data_writer.h" + +#include +#include /* for memset, memcpy */ + +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/common.h" +#include "lib/jxl/jpeg/dec_jpeg_serialization_state.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +namespace { + +enum struct SerializationStatus { + NEEDS_MORE_INPUT, + NEEDS_MORE_OUTPUT, + ERROR, + DONE +}; + +const int kJpegPrecision = 8; + +// JpegBitWriter: buffer size +const size_t kJpegBitWriterChunkSize = 16384; + +// DCTCodingState: maximum number of correction bits to buffer +const int kJPEGMaxCorrectionBits = 1u << 16; + +// Returns non-zero if and only if x has a zero byte, i.e. one of +// x & 0xff, x & 0xff00, ..., x & 0xff00000000000000 is zero. 
+static JXL_INLINE uint64_t HasZeroByte(uint64_t x) { + return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL; +} + +void JpegBitWriterInit(JpegBitWriter* bw, + std::deque* output_queue) { + bw->output = output_queue; + bw->chunk = OutputChunk(kJpegBitWriterChunkSize); + bw->pos = 0; + bw->put_buffer = 0; + bw->put_bits = 64; + bw->healthy = true; + bw->data = bw->chunk.buffer->data(); +} + +static JXL_NOINLINE void SwapBuffer(JpegBitWriter* bw) { + bw->chunk.len = bw->pos; + bw->output->emplace_back(std::move(bw->chunk)); + bw->chunk = OutputChunk(kJpegBitWriterChunkSize); + bw->data = bw->chunk.buffer->data(); + bw->pos = 0; +} + +static JXL_INLINE void Reserve(JpegBitWriter* bw, size_t n_bytes) { + if (JXL_UNLIKELY((bw->pos + n_bytes) > kJpegBitWriterChunkSize)) { + SwapBuffer(bw); + } +} + +/** + * Writes the given byte to the output, writes an extra zero if byte is 0xFF. + * + * This method is "careless" - caller must make sure that there is enough + * space in the output buffer. Emits up to 2 bytes to buffer. + */ +static JXL_INLINE void EmitByte(JpegBitWriter* bw, int byte) { + bw->data[bw->pos++] = byte; + if (byte == 0xFF) bw->data[bw->pos++] = 0; +} + +static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) { + // At this point we are ready to emit the most significant 6 bytes of + // put_buffer_ to the output. + // The JPEG format requires that after every 0xff byte in the entropy + // coded section, there is a zero byte, therefore we first check if any of + // the 6 most significant bytes of put_buffer_ is 0xFF. + Reserve(bw, 12); + if (HasZeroByte(~bw->put_buffer | 0xFFFF)) { + // We have a 0xFF byte somewhere, examine each byte and append a zero + // byte if necessary. 
+ EmitByte(bw, (bw->put_buffer >> 56) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 48) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 40) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 32) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 24) & 0xFF); + EmitByte(bw, (bw->put_buffer >> 16) & 0xFF); + } else { + // We don't have any 0xFF bytes, output all 6 bytes without checking. + bw->data[bw->pos] = (bw->put_buffer >> 56) & 0xFF; + bw->data[bw->pos + 1] = (bw->put_buffer >> 48) & 0xFF; + bw->data[bw->pos + 2] = (bw->put_buffer >> 40) & 0xFF; + bw->data[bw->pos + 3] = (bw->put_buffer >> 32) & 0xFF; + bw->data[bw->pos + 4] = (bw->put_buffer >> 24) & 0xFF; + bw->data[bw->pos + 5] = (bw->put_buffer >> 16) & 0xFF; + bw->pos += 6; + } + bw->put_buffer <<= 48; + bw->put_bits += 48; +} + +static JXL_INLINE void WriteBits(JpegBitWriter* bw, int nbits, uint64_t bits) { + // This is an optimization; if everything goes well, + // then |nbits| is positive; if non-existing Huffman symbol is going to be + // encoded, its length should be zero; later encoder could check the + // "health" of JpegBitWriter. + if (nbits == 0) { + bw->healthy = false; + return; + } + bw->put_bits -= nbits; + bw->put_buffer |= (bits << bw->put_bits); + if (bw->put_bits <= 16) DischargeBitBuffer(bw); +} + +void EmitMarker(JpegBitWriter* bw, int marker) { + Reserve(bw, 2); + JXL_DASSERT(marker != 0xFF); + bw->data[bw->pos++] = 0xFF; + bw->data[bw->pos++] = marker; +} + +bool JumpToByteBoundary(JpegBitWriter* bw, const uint8_t** pad_bits, + const uint8_t* pad_bits_end) { + size_t n_bits = bw->put_bits & 7u; + uint8_t pad_pattern; + if (*pad_bits == nullptr) { + pad_pattern = (1u << n_bits) - 1; + } else { + pad_pattern = 0; + const uint8_t* src = *pad_bits; + // TODO(eustas): bitwise reading looks insanely ineffective... 
+ while (n_bits--) { + pad_pattern <<= 1; + if (src >= pad_bits_end) return false; + // TODO(eustas): DCHECK *src == {0, 1} + pad_pattern |= !!*(src++); + } + *pad_bits = src; + } + + Reserve(bw, 16); + + while (bw->put_bits <= 56) { + int c = (bw->put_buffer >> 56) & 0xFF; + EmitByte(bw, c); + bw->put_buffer <<= 8; + bw->put_bits += 8; + } + if (bw->put_bits < 64) { + int pad_mask = 0xFFu >> (64 - bw->put_bits); + int c = ((bw->put_buffer >> 56) & ~pad_mask) | pad_pattern; + EmitByte(bw, c); + } + bw->put_buffer = 0; + bw->put_bits = 64; + + return true; +} + +void JpegBitWriterFinish(JpegBitWriter* bw) { + if (bw->pos == 0) return; + bw->chunk.len = bw->pos; + bw->output->emplace_back(std::move(bw->chunk)); + bw->chunk = OutputChunk(nullptr, 0); + bw->data = nullptr; + bw->pos = 0; +} + +void DCTCodingStateInit(DCTCodingState* s) { + s->eob_run_ = 0; + s->cur_ac_huff_ = nullptr; + s->refinement_bits_.clear(); + s->refinement_bits_.reserve(kJPEGMaxCorrectionBits); +} + +// Emit all buffered data to the bit stream using the given Huffman code and +// bit writer. +static JXL_INLINE void Flush(DCTCodingState* s, JpegBitWriter* bw) { + if (s->eob_run_ > 0) { + int nbits = FloorLog2Nonzero(s->eob_run_); + int symbol = nbits << 4u; + WriteBits(bw, s->cur_ac_huff_->depth[symbol], + s->cur_ac_huff_->code[symbol]); + if (nbits > 0) { + WriteBits(bw, nbits, s->eob_run_ & ((1 << nbits) - 1)); + } + s->eob_run_ = 0; + } + for (size_t i = 0; i < s->refinement_bits_.size(); ++i) { + WriteBits(bw, 1, s->refinement_bits_[i]); + } + s->refinement_bits_.clear(); +} + +// Buffer some more data at the end-of-band (the last non-zero or newly +// non-zero coefficient within the [Ss, Se] spectral band). 
+static JXL_INLINE void BufferEndOfBand(DCTCodingState* s, + const HuffmanCodeTable* ac_huff, + const std::vector* new_bits, + JpegBitWriter* bw) { + if (s->eob_run_ == 0) { + s->cur_ac_huff_ = ac_huff; + } + ++s->eob_run_; + if (new_bits) { + s->refinement_bits_.insert(s->refinement_bits_.end(), new_bits->begin(), + new_bits->end()); + } + if (s->eob_run_ == 0x7FFF || + s->refinement_bits_.size() > kJPEGMaxCorrectionBits - kDCTBlockSize + 1) { + Flush(s, bw); + } +} + +bool BuildHuffmanCodeTable(const JPEGHuffmanCode& huff, + HuffmanCodeTable* table) { + int huff_code[kJpegHuffmanAlphabetSize]; + // +1 for a sentinel element. + uint32_t huff_size[kJpegHuffmanAlphabetSize + 1]; + int p = 0; + for (size_t l = 1; l <= kJpegHuffmanMaxBitLength; ++l) { + int i = huff.counts[l]; + if (p + i > kJpegHuffmanAlphabetSize + 1) { + return false; + } + while (i--) huff_size[p++] = l; + } + + if (p == 0) { + return true; + } + + // Reuse sentinel element. + int last_p = p - 1; + huff_size[last_p] = 0; + + int code = 0; + uint32_t si = huff_size[0]; + p = 0; + while (huff_size[p]) { + while ((huff_size[p]) == si) { + huff_code[p++] = code; + code++; + } + code <<= 1; + si++; + } + for (p = 0; p < last_p; p++) { + int i = huff.values[p]; + table->depth[i] = huff_size[p]; + table->code[i] = huff_code[p]; + } + return true; +} + +bool EncodeSOI(SerializationState* state) { + state->output_queue.push_back(OutputChunk({0xFF, 0xD8})); + return true; +} + +bool EncodeEOI(const JPEGData& jpg, SerializationState* state) { + state->output_queue.push_back(OutputChunk({0xFF, 0xD9})); + state->output_queue.emplace_back(jpg.tail_data); + return true; +} + +bool EncodeSOF(const JPEGData& jpg, uint8_t marker, SerializationState* state) { + if (marker <= 0xC2) state->is_progressive = (marker == 0xC2); + + const size_t n_comps = jpg.components.size(); + const size_t marker_len = 8 + 3 * n_comps; + state->output_queue.emplace_back(marker_len + 2); + uint8_t* data = 
state->output_queue.back().buffer->data(); + size_t pos = 0; + data[pos++] = 0xFF; + data[pos++] = marker; + data[pos++] = marker_len >> 8u; + data[pos++] = marker_len & 0xFFu; + data[pos++] = kJpegPrecision; + data[pos++] = jpg.height >> 8u; + data[pos++] = jpg.height & 0xFFu; + data[pos++] = jpg.width >> 8u; + data[pos++] = jpg.width & 0xFFu; + data[pos++] = n_comps; + for (size_t i = 0; i < n_comps; ++i) { + data[pos++] = jpg.components[i].id; + data[pos++] = ((jpg.components[i].h_samp_factor << 4u) | + (jpg.components[i].v_samp_factor)); + const size_t quant_idx = jpg.components[i].quant_idx; + if (quant_idx >= jpg.quant.size()) return false; + data[pos++] = jpg.quant[quant_idx].index; + } + return true; +} + +bool EncodeSOS(const JPEGData& jpg, const JPEGScanInfo& scan_info, + SerializationState* state) { + const size_t n_scans = scan_info.num_components; + const size_t marker_len = 6 + 2 * n_scans; + state->output_queue.emplace_back(marker_len + 2); + uint8_t* data = state->output_queue.back().buffer->data(); + size_t pos = 0; + data[pos++] = 0xFF; + data[pos++] = 0xDA; + data[pos++] = marker_len >> 8u; + data[pos++] = marker_len & 0xFFu; + data[pos++] = n_scans; + for (size_t i = 0; i < n_scans; ++i) { + const JPEGComponentScanInfo& si = scan_info.components[i]; + if (si.comp_idx >= jpg.components.size()) return false; + data[pos++] = jpg.components[si.comp_idx].id; + data[pos++] = (si.dc_tbl_idx << 4u) + si.ac_tbl_idx; + } + data[pos++] = scan_info.Ss; + data[pos++] = scan_info.Se; + data[pos++] = ((scan_info.Ah << 4u) | (scan_info.Al)); + return true; +} + +bool EncodeDHT(const JPEGData& jpg, SerializationState* state) { + const std::vector& huffman_code = jpg.huffman_code; + + size_t marker_len = 2; + for (size_t i = state->dht_index; i < huffman_code.size(); ++i) { + const JPEGHuffmanCode& huff = huffman_code[i]; + marker_len += kJpegHuffmanMaxBitLength; + for (size_t j = 0; j < huff.counts.size(); ++j) { + marker_len += huff.counts[j]; + } + if 
(huff.is_last) break; + } + state->output_queue.emplace_back(marker_len + 2); + uint8_t* data = state->output_queue.back().buffer->data(); + size_t pos = 0; + data[pos++] = 0xFF; + data[pos++] = 0xC4; + data[pos++] = marker_len >> 8u; + data[pos++] = marker_len & 0xFFu; + while (true) { + const size_t huffman_code_index = state->dht_index++; + if (huffman_code_index >= huffman_code.size()) { + return false; + } + const JPEGHuffmanCode& huff = huffman_code[huffman_code_index]; + size_t index = huff.slot_id; + HuffmanCodeTable* huff_table; + if (index & 0x10) { + index -= 0x10; + huff_table = &state->ac_huff_table[index]; + } else { + huff_table = &state->dc_huff_table[index]; + } + // TODO(eustas): cache + // TODO(eustas): set up non-existing symbols + if (!BuildHuffmanCodeTable(huff, huff_table)) { + return false; + } + size_t total_count = 0; + size_t max_length = 0; + for (size_t i = 0; i < huff.counts.size(); ++i) { + if (huff.counts[i] != 0) { + max_length = i; + } + total_count += huff.counts[i]; + } + --total_count; + data[pos++] = huff.slot_id; + for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) { + data[pos++] = (i == max_length ? huff.counts[i] - 1 : huff.counts[i]); + } + for (size_t i = 0; i < total_count; ++i) { + data[pos++] = huff.values[i]; + } + if (huff.is_last) break; + } + return true; +} + +bool EncodeDQT(const JPEGData& jpg, SerializationState* state) { + int marker_len = 2; + for (size_t i = state->dqt_index; i < jpg.quant.size(); ++i) { + const JPEGQuantTable& table = jpg.quant[i]; + marker_len += 1 + (table.precision ? 
2 : 1) * kDCTBlockSize; + if (table.is_last) break; + } + state->output_queue.emplace_back(marker_len + 2); + uint8_t* data = state->output_queue.back().buffer->data(); + size_t pos = 0; + data[pos++] = 0xFF; + data[pos++] = 0xDB; + data[pos++] = marker_len >> 8u; + data[pos++] = marker_len & 0xFFu; + while (true) { + const size_t idx = state->dqt_index++; + if (idx >= jpg.quant.size()) { + return false; // corrupt input + } + const JPEGQuantTable& table = jpg.quant[idx]; + data[pos++] = (table.precision << 4u) + table.index; + for (size_t i = 0; i < kDCTBlockSize; ++i) { + int val_idx = kJPEGNaturalOrder[i]; + int val = table.values[val_idx]; + if (table.precision) { + data[pos++] = val >> 8u; + } + data[pos++] = val & 0xFFu; + } + if (table.is_last) break; + } + return true; +} + +bool EncodeDRI(const JPEGData& jpg, SerializationState* state) { + state->seen_dri_marker = true; + OutputChunk dri_marker = {0xFF, + 0xDD, + 0, + 4, + static_cast(jpg.restart_interval >> 8), + static_cast(jpg.restart_interval & 0xFF)}; + state->output_queue.push_back(std::move(dri_marker)); + return true; +} + +bool EncodeRestart(uint8_t marker, SerializationState* state) { + state->output_queue.push_back(OutputChunk({0xFF, marker})); + return true; +} + +bool EncodeAPP(const JPEGData& jpg, uint8_t marker, SerializationState* state) { + // TODO(eustas): check that marker corresponds to payload? 
+ (void)marker; + + size_t app_index = state->app_index++; + if (app_index >= jpg.app_data.size()) return false; + state->output_queue.push_back(OutputChunk({0xFF})); + state->output_queue.emplace_back(jpg.app_data[app_index]); + return true; +} + +bool EncodeCOM(const JPEGData& jpg, SerializationState* state) { + size_t com_index = state->com_index++; + if (com_index >= jpg.com_data.size()) return false; + state->output_queue.push_back(OutputChunk({0xFF})); + state->output_queue.emplace_back(jpg.com_data[com_index]); + return true; +} + +bool EncodeInterMarkerData(const JPEGData& jpg, SerializationState* state) { + size_t index = state->data_index++; + if (index >= jpg.inter_marker_data.size()) return false; + state->output_queue.emplace_back(jpg.inter_marker_data[index]); + return true; +} + +bool EncodeDCTBlockSequential(const coeff_t* coeffs, + const HuffmanCodeTable& dc_huff, + const HuffmanCodeTable& ac_huff, + int num_zero_runs, coeff_t* last_dc_coeff, + JpegBitWriter* bw) { + coeff_t temp2; + coeff_t temp; + temp2 = coeffs[0]; + temp = temp2 - *last_dc_coeff; + *last_dc_coeff = temp2; + temp2 = temp; + if (temp < 0) { + temp = -temp; + temp2--; + } + int dc_nbits = (temp == 0) ? 
0 : (FloorLog2Nonzero(temp) + 1); + WriteBits(bw, dc_huff.depth[dc_nbits], dc_huff.code[dc_nbits]); + if (dc_nbits >= 12) return false; + if (dc_nbits > 0) { + WriteBits(bw, dc_nbits, temp2 & ((1u << dc_nbits) - 1)); + } + int r = 0; + for (int k = 1; k < 64; ++k) { + if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) { + r++; + continue; + } + if (temp < 0) { + temp = -temp; + temp2 = ~temp; + } else { + temp2 = temp; + } + while (r > 15) { + WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]); + r -= 16; + } + int ac_nbits = FloorLog2Nonzero(temp) + 1; + if (ac_nbits >= 16) return false; + int symbol = (r << 4u) + ac_nbits; + WriteBits(bw, ac_huff.depth[symbol], ac_huff.code[symbol]); + WriteBits(bw, ac_nbits, temp2 & ((1 << ac_nbits) - 1)); + r = 0; + } + for (int i = 0; i < num_zero_runs; ++i) { + WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]); + r -= 16; + } + if (r > 0) { + WriteBits(bw, ac_huff.depth[0], ac_huff.code[0]); + } + return true; +} + +bool EncodeDCTBlockProgressive(const coeff_t* coeffs, + const HuffmanCodeTable& dc_huff, + const HuffmanCodeTable& ac_huff, int Ss, int Se, + int Al, int num_zero_runs, + DCTCodingState* coding_state, + coeff_t* last_dc_coeff, JpegBitWriter* bw) { + bool eob_run_allowed = Ss > 0; + coeff_t temp2; + coeff_t temp; + if (Ss == 0) { + temp2 = coeffs[0] >> Al; + temp = temp2 - *last_dc_coeff; + *last_dc_coeff = temp2; + temp2 = temp; + if (temp < 0) { + temp = -temp; + temp2--; + } + int nbits = (temp == 0) ? 
0 : (FloorLog2Nonzero(temp) + 1); + WriteBits(bw, dc_huff.depth[nbits], dc_huff.code[nbits]); + if (nbits > 0) { + WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1)); + } + ++Ss; + } + if (Ss > Se) { + return true; + } + int r = 0; + for (int k = Ss; k <= Se; ++k) { + if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) { + r++; + continue; + } + if (temp < 0) { + temp = -temp; + temp >>= Al; + temp2 = ~temp; + } else { + temp >>= Al; + temp2 = temp; + } + if (temp == 0) { + r++; + continue; + } + Flush(coding_state, bw); + while (r > 15) { + WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]); + r -= 16; + } + int nbits = FloorLog2Nonzero(temp) + 1; + int symbol = (r << 4u) + nbits; + WriteBits(bw, ac_huff.depth[symbol], ac_huff.code[symbol]); + WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1)); + r = 0; + } + if (num_zero_runs > 0) { + Flush(coding_state, bw); + for (int i = 0; i < num_zero_runs; ++i) { + WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]); + r -= 16; + } + } + if (r > 0) { + BufferEndOfBand(coding_state, &ac_huff, nullptr, bw); + if (!eob_run_allowed) { + Flush(coding_state, bw); + } + } + return true; +} + +bool EncodeRefinementBits(const coeff_t* coeffs, + const HuffmanCodeTable& ac_huff, int Ss, int Se, + int Al, DCTCodingState* coding_state, + JpegBitWriter* bw) { + bool eob_run_allowed = Ss > 0; + if (Ss == 0) { + // Emit next bit of DC component. 
+ WriteBits(bw, 1, (coeffs[0] >> Al) & 1); + ++Ss; + } + if (Ss > Se) { + return true; + } + int abs_values[kDCTBlockSize]; + int eob = 0; + for (int k = Ss; k <= Se; k++) { + const coeff_t abs_val = std::abs(coeffs[kJPEGNaturalOrder[k]]); + abs_values[k] = abs_val >> Al; + if (abs_values[k] == 1) { + eob = k; + } + } + int r = 0; + std::vector refinement_bits; + refinement_bits.reserve(kDCTBlockSize); + for (int k = Ss; k <= Se; k++) { + if (abs_values[k] == 0) { + r++; + continue; + } + while (r > 15 && k <= eob) { + Flush(coding_state, bw); + WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]); + r -= 16; + for (int bit : refinement_bits) { + WriteBits(bw, 1, bit); + } + refinement_bits.clear(); + } + if (abs_values[k] > 1) { + refinement_bits.push_back(abs_values[k] & 1u); + continue; + } + Flush(coding_state, bw); + int symbol = (r << 4u) + 1; + int new_non_zero_bit = (coeffs[kJPEGNaturalOrder[k]] < 0) ? 0 : 1; + WriteBits(bw, ac_huff.depth[symbol], ac_huff.code[symbol]); + WriteBits(bw, 1, new_non_zero_bit); + for (int bit : refinement_bits) { + WriteBits(bw, 1, bit); + } + refinement_bits.clear(); + r = 0; + } + if (r > 0 || !refinement_bits.empty()) { + BufferEndOfBand(coding_state, &ac_huff, &refinement_bits, bw); + if (!eob_run_allowed) { + Flush(coding_state, bw); + } + } + return true; +} + +template +SerializationStatus JXL_NOINLINE DoEncodeScan(const JPEGData& jpg, + SerializationState* state) { + const JPEGScanInfo& scan_info = jpg.scan_info[state->scan_index]; + EncodeScanState& ss = state->scan_state; + + const int restart_interval = + state->seen_dri_marker ? 
jpg.restart_interval : 0; + + const auto get_next_extra_zero_run_index = [&ss, &scan_info]() -> int { + if (ss.extra_zero_runs_pos < scan_info.extra_zero_runs.size()) { + return scan_info.extra_zero_runs[ss.extra_zero_runs_pos].block_idx; + } else { + return -1; + } + }; + + const auto get_next_reset_point = [&ss, &scan_info]() -> int { + if (ss.next_reset_point_pos < scan_info.reset_points.size()) { + return scan_info.reset_points[ss.next_reset_point_pos++]; + } else { + return -1; + } + }; + + if (ss.stage == EncodeScanState::HEAD) { + if (!EncodeSOS(jpg, scan_info, state)) return SerializationStatus::ERROR; + JpegBitWriterInit(&ss.bw, &state->output_queue); + DCTCodingStateInit(&ss.coding_state); + ss.restarts_to_go = restart_interval; + ss.next_restart_marker = 0; + ss.block_scan_index = 0; + ss.extra_zero_runs_pos = 0; + ss.next_extra_zero_run_index = get_next_extra_zero_run_index(); + ss.next_reset_point_pos = 0; + ss.next_reset_point = get_next_reset_point(); + ss.mcu_y = 0; + memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff)); + ss.stage = EncodeScanState::BODY; + } + JpegBitWriter* bw = &ss.bw; + DCTCodingState* coding_state = &ss.coding_state; + + JXL_DASSERT(ss.stage == EncodeScanState::BODY); + + // "Non-interleaved" means color data comes in separate scans, in other words + // each scan can contain only one color component. + const bool is_interleaved = (scan_info.num_components > 1); + int MCUs_per_row = 0; + int MCU_rows = 0; + jpg.CalculateMcuSize(scan_info, &MCUs_per_row, &MCU_rows); + const bool is_progressive = state->is_progressive; + const int Al = is_progressive ? scan_info.Al : 0; + const int Ss = is_progressive ? scan_info.Ss : 0; + const int Se = is_progressive ? scan_info.Se : 63; + + // DC-only is defined by [0..0] spectral range. + const bool want_ac = ((Ss != 0) || (Se != 0)); + // TODO: support streaming decoding again. 
+ const bool complete_ac = true; + const bool has_ac = true; + if (want_ac && !has_ac) return SerializationStatus::NEEDS_MORE_INPUT; + + // |has_ac| implies |complete_dc| but not vice versa; for the sake of + // simplicity we pretend they are equal, because they are separated by just a + // few bytes of input. + const bool complete_dc = has_ac; + const bool complete = want_ac ? complete_ac : complete_dc; + // When "incomplete" |ac_dc| tracks information about current ("incomplete") + // band parsing progress. + + // FIXME: Is this always complete? + // const int last_mcu_y = + // complete ? MCU_rows : parsing_state.internal->ac_dc.next_mcu_y * + // v_group; + (void)complete; + const int last_mcu_y = complete ? MCU_rows : 0; + + for (; ss.mcu_y < last_mcu_y; ++ss.mcu_y) { + for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) { + // Possibly emit a restart marker. + if (restart_interval > 0 && ss.restarts_to_go == 0) { + Flush(coding_state, bw); + if (!JumpToByteBoundary(bw, &state->pad_bits, state->pad_bits_end)) { + return SerializationStatus::ERROR; + } + EmitMarker(bw, 0xD0 + ss.next_restart_marker); + ss.next_restart_marker += 1; + ss.next_restart_marker &= 0x7; + ss.restarts_to_go = restart_interval; + memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff)); + } + // Encode one MCU + for (size_t i = 0; i < scan_info.num_components; ++i) { + const JPEGComponentScanInfo& si = scan_info.components[i]; + const JPEGComponent& c = jpg.components[si.comp_idx]; + const HuffmanCodeTable& dc_huff = state->dc_huff_table[si.dc_tbl_idx]; + const HuffmanCodeTable& ac_huff = state->ac_huff_table[si.ac_tbl_idx]; + int n_blocks_y = is_interleaved ? c.v_samp_factor : 1; + int n_blocks_x = is_interleaved ? 
c.h_samp_factor : 1; + for (int iy = 0; iy < n_blocks_y; ++iy) { + for (int ix = 0; ix < n_blocks_x; ++ix) { + int block_y = ss.mcu_y * n_blocks_y + iy; + int block_x = mcu_x * n_blocks_x + ix; + int block_idx = block_y * c.width_in_blocks + block_x; + if (ss.block_scan_index == ss.next_reset_point) { + Flush(coding_state, bw); + ss.next_reset_point = get_next_reset_point(); + } + int num_zero_runs = 0; + if (ss.block_scan_index == ss.next_extra_zero_run_index) { + num_zero_runs = scan_info.extra_zero_runs[ss.extra_zero_runs_pos] + .num_extra_zero_runs; + ++ss.extra_zero_runs_pos; + ss.next_extra_zero_run_index = get_next_extra_zero_run_index(); + } + const coeff_t* coeffs = &c.coeffs[block_idx << 6]; + bool ok; + if (kMode == 0) { + ok = EncodeDCTBlockSequential(coeffs, dc_huff, ac_huff, + num_zero_runs, + ss.last_dc_coeff + si.comp_idx, bw); + } else if (kMode == 1) { + ok = EncodeDCTBlockProgressive( + coeffs, dc_huff, ac_huff, Ss, Se, Al, num_zero_runs, + coding_state, ss.last_dc_coeff + si.comp_idx, bw); + } else { + ok = EncodeRefinementBits(coeffs, ac_huff, Ss, Se, Al, + coding_state, bw); + } + if (!ok) return SerializationStatus::ERROR; + ++ss.block_scan_index; + } + } + } + --ss.restarts_to_go; + } + } + if (ss.mcu_y < MCU_rows) { + if (!bw->healthy) return SerializationStatus::ERROR; + return SerializationStatus::NEEDS_MORE_INPUT; + } + Flush(coding_state, bw); + if (!JumpToByteBoundary(bw, &state->pad_bits, state->pad_bits_end)) { + return SerializationStatus::ERROR; + } + JpegBitWriterFinish(bw); + ss.stage = EncodeScanState::HEAD; + state->scan_index++; + if (!bw->healthy) return SerializationStatus::ERROR; + + return SerializationStatus::DONE; +} + +static SerializationStatus JXL_INLINE EncodeScan(const JPEGData& jpg, + SerializationState* state) { + const JPEGScanInfo& scan_info = jpg.scan_info[state->scan_index]; + const bool is_progressive = state->is_progressive; + const int Al = is_progressive ? 
scan_info.Al : 0; + const int Ah = is_progressive ? scan_info.Ah : 0; + const int Ss = is_progressive ? scan_info.Ss : 0; + const int Se = is_progressive ? scan_info.Se : 63; + const bool need_sequential = + !is_progressive || (Ah == 0 && Al == 0 && Ss == 0 && Se == 63); + if (need_sequential) { + return DoEncodeScan<0>(jpg, state); + } else if (Ah == 0) { + return DoEncodeScan<1>(jpg, state); + } else { + return DoEncodeScan<2>(jpg, state); + } +} + +SerializationStatus SerializeSection(uint8_t marker, SerializationState* state, + const JPEGData& jpg) { + const auto to_status = [](bool result) { + return result ? SerializationStatus::DONE : SerializationStatus::ERROR; + }; + // TODO(eustas): add and use marker enum + switch (marker) { + case 0xC0: + case 0xC1: + case 0xC2: + case 0xC9: + case 0xCA: + return to_status(EncodeSOF(jpg, marker, state)); + + case 0xC4: + return to_status(EncodeDHT(jpg, state)); + + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + return to_status(EncodeRestart(marker, state)); + + case 0xD9: + return to_status(EncodeEOI(jpg, state)); + + case 0xDA: + return EncodeScan(jpg, state); + + case 0xDB: + return to_status(EncodeDQT(jpg, state)); + + case 0xDD: + return to_status(EncodeDRI(jpg, state)); + + case 0xE0: + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xED: + case 0xEE: + case 0xEF: + return to_status(EncodeAPP(jpg, marker, state)); + + case 0xFE: + return to_status(EncodeCOM(jpg, state)); + + case 0xFF: + return to_status(EncodeInterMarkerData(jpg, state)); + + default: + return SerializationStatus::ERROR; + } +} + +} // namespace + +// TODO(veluca): add streaming support again. 
+Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out) { + SerializationState ss; + + size_t written = 0; + const auto maybe_push_output = [&]() -> Status { + if (ss.stage != SerializationState::ERROR) { + while (!ss.output_queue.empty()) { + auto& chunk = ss.output_queue.front(); + size_t num_written = out(chunk.next, chunk.len); + if (num_written == 0 && chunk.len > 0) { + return StatusMessage(Status(StatusCode::kNotEnoughBytes), + "Failed to write output"); + } + chunk.len -= num_written; + written += num_written; + if (chunk.len == 0) { + ss.output_queue.pop_front(); + } + } + } + return true; + }; + + while (true) { + switch (ss.stage) { + case SerializationState::INIT: { + // Valid Brunsli requires, at least, 0xD9 marker. + // This might happen on corrupted stream, or on unconditioned JPEGData. + // TODO(eustas): check D9 in the only one and is the last one. + if (jpg.marker_order.empty()) { + ss.stage = SerializationState::ERROR; + break; + } + + ss.dc_huff_table.resize(kMaxHuffmanTables); + ss.ac_huff_table.resize(kMaxHuffmanTables); + if (jpg.has_zero_padding_bit) { + ss.pad_bits = jpg.padding_bits.data(); + ss.pad_bits_end = ss.pad_bits + jpg.padding_bits.size(); + } + + EncodeSOI(&ss); + JXL_QUIET_RETURN_IF_ERROR(maybe_push_output()); + ss.stage = SerializationState::SERIALIZE_SECTION; + break; + } + + case SerializationState::SERIALIZE_SECTION: { + if (ss.section_index >= jpg.marker_order.size()) { + ss.stage = SerializationState::DONE; + break; + } + uint8_t marker = jpg.marker_order[ss.section_index]; + SerializationStatus status = SerializeSection(marker, &ss, jpg); + if (status == SerializationStatus::ERROR) { + JXL_WARNING("Failed to encode marker 0x%.2x", marker); + ss.stage = SerializationState::ERROR; + break; + } + JXL_QUIET_RETURN_IF_ERROR(maybe_push_output()); + if (status == SerializationStatus::NEEDS_MORE_INPUT) { + return JXL_FAILURE("Incomplete serialization data"); + } else if (status != SerializationStatus::DONE) { + 
JXL_DASSERT(false); + ss.stage = SerializationState::ERROR; + break; + } + ++ss.section_index; + break; + } + + case SerializationState::DONE: + JXL_ASSERT(ss.output_queue.empty()); + return true; + + case SerializationState::ERROR: + return JXL_FAILURE("JPEG serialization error"); + } + } +} + +} // namespace jpeg +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.h new file mode 100644 index 0000000000..28f5141f41 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_data_writer.h @@ -0,0 +1,30 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Functions for writing a JPEGData object into a jpeg byte stream. + +#ifndef LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_ +#define LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_ + +#include +#include + +#include + +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +// Function type used to write len bytes into buf. Returns the number of bytes +// written. +using JPEGOutput = std::function; + +Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out); + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_output_chunk.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_output_chunk.h new file mode 100644 index 0000000000..e003c04952 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_output_chunk.h @@ -0,0 +1,72 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_ +#define LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_ + +#include +#include + +#include +#include +#include + +namespace jxl { +namespace jpeg { + +/** + * A chunk of output data. + * + * Data producer creates OutputChunks and adds them to the end output queue. + * Once control flow leaves the producer code, it is considered that chunk of + * data is final and can not be changed; to underline this fact |next| is a + * const-pointer. + * + * Data consumer removes OutputChunks from the beginning of the output queue. + * It is possible to consume OutputChunks partially, by updating |next| and + * |len|. + * + * There are 2 types of output chunks: + * - owning: actual data is stored in |buffer| field; producer fills data after + * the instance it created; it is legal to reduce |len| to show that not all + * the capacity of |buffer| is used + * - non-owning: represents the data stored (owned) somewhere else + */ +struct OutputChunk { + // Non-owning + template + explicit OutputChunk(Bytes& bytes) : len(bytes.size()) { + // Deal both with const qualifier and data type. + const void* src = bytes.data(); + next = reinterpret_cast(src); + } + + // Non-owning + OutputChunk(const uint8_t* data, size_t size) : next(data), len(size) {} + + // Owning + explicit OutputChunk(size_t size = 0) { + buffer.reset(new std::vector(size)); + next = buffer->data(); + len = size; + } + + // Owning + OutputChunk(std::initializer_list bytes) { + buffer.reset(new std::vector(bytes)); + next = buffer->data(); + len = bytes.size(); + } + + const uint8_t* next; + size_t len; + // TODO(veluca): consider removing the unique_ptr. 
+ std::unique_ptr> buffer; +}; + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_serialization_state.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_serialization_state.h new file mode 100644 index 0000000000..a25c335b59 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/dec_jpeg_serialization_state.h @@ -0,0 +1,95 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_ +#define LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_ + +#include +#include + +#include "lib/jxl/jpeg/dec_jpeg_output_chunk.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +struct HuffmanCodeTable { + int depth[256]; + int code[256]; +}; + +// Handles the packing of bits into output bytes. +struct JpegBitWriter { + bool healthy; + std::deque* output; + OutputChunk chunk; + uint8_t* data; + size_t pos; + uint64_t put_buffer; + int put_bits; +}; + +// Holds data that is buffered between 8x8 blocks in progressive mode. +struct DCTCodingState { + // The run length of end-of-band symbols in a progressive scan. + int eob_run_; + // The huffman table to be used when flushing the state. + const HuffmanCodeTable* cur_ac_huff_; + // The sequence of currently buffered refinement bits for a successive + // approximation scan (one where Ah > 0). 
+ std::vector refinement_bits_; +}; + +struct EncodeScanState { + enum Stage { HEAD, BODY }; + + Stage stage = HEAD; + + int mcu_y; + JpegBitWriter bw; + coeff_t last_dc_coeff[kMaxComponents] = {0}; + int restarts_to_go; + int next_restart_marker; + int block_scan_index; + DCTCodingState coding_state; + size_t extra_zero_runs_pos; + int next_extra_zero_run_index; + size_t next_reset_point_pos; + int next_reset_point; +}; + +struct SerializationState { + enum Stage { + INIT, + SERIALIZE_SECTION, + DONE, + ERROR, + }; + + Stage stage = INIT; + + std::deque output_queue; + + size_t section_index = 0; + int dht_index = 0; + int dqt_index = 0; + int app_index = 0; + int com_index = 0; + int data_index = 0; + int scan_index = 0; + std::vector dc_huff_table; + std::vector ac_huff_table; + const uint8_t* pad_bits = nullptr; + const uint8_t* pad_bits_end = nullptr; + bool seen_dri_marker = false; + bool is_progressive = false; + + EncodeScanState scan_state; +}; + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc new file mode 100644 index 0000000000..079c6efcea --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.cc @@ -0,0 +1,370 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/enc_jpeg_data.h" + +#include +#include + +#include "lib/jxl/jpeg/enc_jpeg_data_reader.h" + +namespace jxl { +namespace jpeg { + +namespace { + +constexpr int BITS_IN_JSAMPLE = 8; +using ByteSpan = Span; + +// TODO(eustas): move to jpeg_data, to use from codec_jpg as well. +// See if there is a canonically chunked ICC profile and mark corresponding +// app-tags with AppMarkerType::kICC. 
+Status DetectIccProfile(JPEGData& jpeg_data) { + JXL_DASSERT(jpeg_data.app_data.size() == jpeg_data.app_marker_type.size()); + size_t num_icc = 0; + size_t num_icc_jpeg = 0; + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + const auto& app = jpeg_data.app_data[i]; + size_t pos = 0; + if (app[pos++] != 0xE2) continue; + // At least APPn + size; otherwise it should be intermarker-data. + JXL_DASSERT(app.size() >= 3); + size_t tag_length = (app[pos] << 8) + app[pos + 1]; + pos += 2; + JXL_DASSERT(app.size() == tag_length + 1); + // Empty payload is 2 bytes for tag length itself + signature + if (tag_length < 2 + sizeof kIccProfileTag) continue; + + if (memcmp(&app[pos], kIccProfileTag, sizeof kIccProfileTag) != 0) continue; + pos += sizeof kIccProfileTag; + uint8_t chunk_id = app[pos++]; + uint8_t num_chunks = app[pos++]; + if (chunk_id != num_icc + 1) continue; + if (num_icc_jpeg == 0) num_icc_jpeg = num_chunks; + if (num_icc_jpeg != num_chunks) continue; + num_icc++; + jpeg_data.app_marker_type[i] = AppMarkerType::kICC; + } + if (num_icc != num_icc_jpeg) { + return JXL_FAILURE("Invalid ICC chunks"); + } + return true; +} + +bool GetMarkerPayload(const uint8_t* data, size_t size, ByteSpan* payload) { + if (size < 3) { + return false; + } + size_t hi = data[1]; + size_t lo = data[2]; + size_t internal_size = (hi << 8u) | lo; + // Second byte of marker is not counted towards size. + if (internal_size != size - 1) { + return false; + } + // cut second marker byte and "length" from payload. 
+ *payload = ByteSpan(data, size); + payload->remove_prefix(3); + return true; +} + +Status DetectBlobs(jpeg::JPEGData& jpeg_data) { + JXL_DASSERT(jpeg_data.app_data.size() == jpeg_data.app_marker_type.size()); + bool have_exif = false, have_xmp = false; + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + auto& marker = jpeg_data.app_data[i]; + if (marker.empty() || marker[0] != kApp1) { + continue; + } + ByteSpan payload; + if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) { + // Something is wrong with this marker; does not care. + continue; + } + if (!have_exif && payload.size() >= sizeof kExifTag && + !memcmp(payload.data(), kExifTag, sizeof kExifTag)) { + jpeg_data.app_marker_type[i] = AppMarkerType::kExif; + have_exif = true; + } + if (!have_xmp && payload.size() >= sizeof kXMPTag && + !memcmp(payload.data(), kXMPTag, sizeof kXMPTag)) { + jpeg_data.app_marker_type[i] = AppMarkerType::kXMP; + have_xmp = true; + } + } + return true; +} + +Status ParseChunkedMarker(const jpeg::JPEGData& src, uint8_t marker_type, + const ByteSpan& tag, PaddedBytes* output, + bool allow_permutations = false) { + output->clear(); + + std::vector chunks; + std::vector presence; + size_t expected_number_of_parts = 0; + bool is_first_chunk = true; + size_t ordinal = 0; + for (const auto& marker : src.app_data) { + if (marker.empty() || marker[0] != marker_type) { + continue; + } + ByteSpan payload; + if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) { + // Something is wrong with this marker; does not care. 
+ continue; + } + if ((payload.size() < tag.size()) || + memcmp(payload.data(), tag.data(), tag.size()) != 0) { + continue; + } + payload.remove_prefix(tag.size()); + if (payload.size() < 2) { + return JXL_FAILURE("Chunk is too small."); + } + uint8_t index = payload[0]; + uint8_t total = payload[1]; + ordinal++; + if (!allow_permutations) { + if (index != ordinal) return JXL_FAILURE("Invalid chunk order."); + } + + payload.remove_prefix(2); + + JXL_RETURN_IF_ERROR(total != 0); + if (is_first_chunk) { + is_first_chunk = false; + expected_number_of_parts = total; + // 1-based indices; 0-th element is added for convenience. + chunks.resize(total + 1); + presence.resize(total + 1); + } else { + JXL_RETURN_IF_ERROR(expected_number_of_parts == total); + } + + if (index == 0 || index > total) { + return JXL_FAILURE("Invalid chunk index."); + } + + if (presence[index]) { + return JXL_FAILURE("Duplicate chunk."); + } + presence[index] = true; + chunks[index] = payload; + } + + for (size_t i = 0; i < expected_number_of_parts; ++i) { + // 0-th element is not used. + size_t index = i + 1; + if (!presence[index]) { + return JXL_FAILURE("Missing chunk."); + } + output->append(chunks[index]); + } + + return true; +} + +Status SetBlobsFromJpegData(const jpeg::JPEGData& jpeg_data, Blobs* blobs) { + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + auto& marker = jpeg_data.app_data[i]; + if (marker.empty() || marker[0] != kApp1) { + continue; + } + ByteSpan payload; + if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) { + // Something is wrong with this marker; does not care. 
+ continue; + } + if (payload.size() >= sizeof kExifTag && + !memcmp(payload.data(), kExifTag, sizeof kExifTag)) { + if (blobs->exif.empty()) { + blobs->exif.resize(payload.size() - sizeof kExifTag); + memcpy(blobs->exif.data(), payload.data() + sizeof kExifTag, + payload.size() - sizeof kExifTag); + } else { + JXL_WARNING( + "ReJPEG: multiple Exif blobs, storing only first one in the JPEG " + "XL container\n"); + } + } + if (payload.size() >= sizeof kXMPTag && + !memcmp(payload.data(), kXMPTag, sizeof kXMPTag)) { + if (blobs->xmp.empty()) { + blobs->xmp.resize(payload.size() - sizeof kXMPTag); + memcpy(blobs->xmp.data(), payload.data() + sizeof kXMPTag, + payload.size() - sizeof kXMPTag); + } else { + JXL_WARNING( + "ReJPEG: multiple XMP blobs, storing only first one in the JPEG " + "XL container\n"); + } + } + } + return true; +} + +} // namespace + +Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg, + ColorEncoding* color_encoding) { + PaddedBytes icc_profile; + if (!ParseChunkedMarker(jpg, kApp2, ByteSpan(kIccProfileTag), &icc_profile)) { + JXL_WARNING("ReJPEG: corrupted ICC profile\n"); + icc_profile.clear(); + } + + if (icc_profile.empty()) { + bool is_gray = (jpg.components.size() == 1); + *color_encoding = ColorEncoding::SRGB(is_gray); + return true; + } + + return color_encoding->SetICC(std::move(icc_profile)); +} + +Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes) { + jpeg_data.app_marker_type.resize(jpeg_data.app_data.size(), + AppMarkerType::kUnknown); + JXL_RETURN_IF_ERROR(DetectIccProfile(jpeg_data)); + JXL_RETURN_IF_ERROR(DetectBlobs(jpeg_data)); + BitWriter writer; + JXL_RETURN_IF_ERROR(Bundle::Write(jpeg_data, &writer, 0, nullptr)); + writer.ZeroPadToByte(); + *bytes = std::move(writer).TakeBytes(); + BrotliEncoderState* brotli_enc = + BrotliEncoderCreateInstance(nullptr, nullptr, nullptr); + BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_QUALITY, 11); + size_t total_data = 0; + for (size_t i = 0; i < 
jpeg_data.app_data.size(); i++) { + if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) { + continue; + } + total_data += jpeg_data.app_data[i].size(); + } + for (size_t i = 0; i < jpeg_data.com_data.size(); i++) { + total_data += jpeg_data.com_data[i].size(); + } + for (size_t i = 0; i < jpeg_data.inter_marker_data.size(); i++) { + total_data += jpeg_data.inter_marker_data[i].size(); + } + total_data += jpeg_data.tail_data.size(); + size_t initial_size = bytes->size(); + size_t brotli_capacity = BrotliEncoderMaxCompressedSize(total_data); + BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_SIZE_HINT, total_data); + bytes->resize(bytes->size() + brotli_capacity); + size_t enc_size = 0; + auto br_append = [&](const std::vector& data, bool last) { + size_t available_in = data.size(); + const uint8_t* in = data.data(); + uint8_t* out = &(*bytes)[initial_size + enc_size]; + do { + JXL_CHECK(BrotliEncoderCompressStream( + brotli_enc, last ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS, + &available_in, &in, &brotli_capacity, &out, &enc_size)); + } while (BrotliEncoderHasMoreOutput(brotli_enc) || available_in > 0); + }; + + for (size_t i = 0; i < jpeg_data.app_data.size(); i++) { + if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) { + continue; + } + br_append(jpeg_data.app_data[i], /*last=*/false); + } + for (size_t i = 0; i < jpeg_data.com_data.size(); i++) { + br_append(jpeg_data.com_data[i], /*last=*/false); + } + for (size_t i = 0; i < jpeg_data.inter_marker_data.size(); i++) { + br_append(jpeg_data.inter_marker_data[i], /*last=*/false); + } + br_append(jpeg_data.tail_data, /*last=*/true); + BrotliEncoderDestroyInstance(brotli_enc); + bytes->resize(initial_size + enc_size); + return true; +} + +Status DecodeImageJPG(const Span bytes, CodecInOut* io) { + io->frames.clear(); + io->frames.reserve(1); + io->frames.emplace_back(&io->metadata.m); + io->Main().jpeg_data = make_unique(); + jpeg::JPEGData* jpeg_data = io->Main().jpeg_data.get(); + 
if (!jpeg::ReadJpeg(bytes.data(), bytes.size(), jpeg::JpegReadMode::kReadAll, + jpeg_data)) { + return JXL_FAILURE("Error reading JPEG"); + } + JXL_RETURN_IF_ERROR( + SetColorEncodingFromJpegData(*jpeg_data, &io->metadata.m.color_encoding)); + JXL_RETURN_IF_ERROR(SetBlobsFromJpegData(*jpeg_data, &io->blobs)); + size_t nbcomp = jpeg_data->components.size(); + if (nbcomp != 1 && nbcomp != 3) { + return JXL_FAILURE("Cannot recompress JPEGs with neither 1 nor 3 channels"); + } + YCbCrChromaSubsampling cs; + if (nbcomp == 3) { + uint8_t hsample[3], vsample[3]; + for (size_t i = 0; i < nbcomp; i++) { + hsample[i] = jpeg_data->components[i].h_samp_factor; + vsample[i] = jpeg_data->components[i].v_samp_factor; + } + JXL_RETURN_IF_ERROR(cs.Set(hsample, vsample)); + } else if (nbcomp == 1) { + uint8_t hsample[3], vsample[3]; + for (size_t i = 0; i < 3; i++) { + hsample[i] = jpeg_data->components[0].h_samp_factor; + vsample[i] = jpeg_data->components[0].v_samp_factor; + } + JXL_RETURN_IF_ERROR(cs.Set(hsample, vsample)); + } + bool is_rgb = false; + { + const auto& markers = jpeg_data->marker_order; + // If there is a JFIF marker, this is YCbCr. Otherwise... + if (std::find(markers.begin(), markers.end(), 0xE0) == markers.end()) { + // Try to find an 'Adobe' marker. + size_t app_markers = 0; + size_t i = 0; + for (; i < markers.size(); i++) { + // This is an APP marker. + if ((markers[i] & 0xF0) == 0xE0) { + JXL_CHECK(app_markers < jpeg_data->app_data.size()); + // APP14 marker + if (markers[i] == 0xEE) { + const auto& data = jpeg_data->app_data[app_markers]; + if (data.size() == 15 && data[3] == 'A' && data[4] == 'd' && + data[5] == 'o' && data[6] == 'b' && data[7] == 'e') { + // 'Adobe' marker. + is_rgb = data[14] == 0; + break; + } + } + app_markers++; + } + } + + if (i == markers.size()) { + // No 'Adobe' marker, guess from component IDs. 
+ is_rgb = nbcomp == 3 && jpeg_data->components[0].id == 'R' && + jpeg_data->components[1].id == 'G' && + jpeg_data->components[2].id == 'B'; + } + } + } + + io->Main().chroma_subsampling = cs; + io->Main().color_transform = + (!is_rgb || nbcomp == 1) ? ColorTransform::kYCbCr : ColorTransform::kNone; + + io->metadata.m.SetIntensityTarget( + io->target_nits != 0 ? io->target_nits : kDefaultIntensityTarget); + io->metadata.m.SetUintSamples(BITS_IN_JSAMPLE); + io->SetFromImage(Image3F(jpeg_data->width, jpeg_data->height), + io->metadata.m.color_encoding); + SetIntensityTarget(io); + return true; +} + +} // namespace jpeg +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.h new file mode 100644 index 0000000000..b80ade776f --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data.h @@ -0,0 +1,28 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_H_ +#define LIB_JXL_JPEG_ENC_JPEG_DATA_H_ + +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { +Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes); + +Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg, + ColorEncoding* color_encoding); + +/** + * Decodes bytes containing JPEG codestream into a CodecInOut as coefficients + * only, for lossless JPEG transcoding. 
+ */ +Status DecodeImageJPG(const Span bytes, CodecInOut* io); +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_ENC_JPEG_DATA_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc new file mode 100644 index 0000000000..6e24557a27 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.cc @@ -0,0 +1,1142 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/enc_jpeg_data_reader.h" + +#include +#include + +#include +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h" +#include "lib/jxl/jpeg/jpeg_data.h" + +// By default only print debug messages when JXL_DEBUG_ON_ERROR is enabled. +#ifndef JXL_DEBUG_JPEG_DATA_READER +#define JXL_DEBUG_JPEG_DATA_READER JXL_DEBUG_ON_ERROR +#endif // JXL_DEBUG_JPEG_DATA_READER + +#define JXL_JPEG_DEBUG(format, ...) \ + JXL_DEBUG(JXL_DEBUG_JPEG_DATA_READER, format, ##__VA_ARGS__) + +namespace jxl { +namespace jpeg { + +namespace { +static const int kBrunsliMaxSampling = 15; +static const size_t kBrunsliMaxNumBlocks = 1ull << 24; + +// Macros for commonly used error conditions. 
+ +#define JXL_JPEG_VERIFY_LEN(n) \ + if (*pos + (n) > len) { \ + JXL_JPEG_DEBUG("Unexpected end of input: pos=%zu need=%d len=%zu", *pos, \ + static_cast(n), len); \ + jpg->error = JPEGReadError::UNEXPECTED_EOF; \ + return false; \ + } + +#define JXL_JPEG_VERIFY_INPUT(var, low, high, code) \ + if ((var) < (low) || (var) > (high)) { \ + JXL_JPEG_DEBUG("Invalid " #var ": %d", static_cast(var)); \ + jpg->error = JPEGReadError::INVALID_##code; \ + return false; \ + } + +#define JXL_JPEG_VERIFY_MARKER_END() \ + if (start_pos + marker_len != *pos) { \ + JXL_JPEG_DEBUG("Invalid marker length: declared=%zu actual=%zu", \ + marker_len, (*pos - start_pos)); \ + jpg->error = JPEGReadError::WRONG_MARKER_SIZE; \ + return false; \ + } + +#define JXL_JPEG_EXPECT_MARKER() \ + if (pos + 2 > len || data[pos] != 0xff) { \ + JXL_JPEG_DEBUG( \ + "Marker byte (0xff) expected, found: 0x%.2x pos=%zu len=%zu", \ + (pos < len ? data[pos] : 0), pos, len); \ + jpg->error = JPEGReadError::MARKER_BYTE_NOT_FOUND; \ + return false; \ + } + +inline int ReadUint8(const uint8_t* data, size_t* pos) { + return data[(*pos)++]; +} + +inline int ReadUint16(const uint8_t* data, size_t* pos) { + int v = (data[*pos] << 8) + data[*pos + 1]; + *pos += 2; + return v; +} + +// Reads the Start of Frame (SOF) marker segment and fills in *jpg with the +// parsed data. 
+bool ProcessSOF(const uint8_t* data, const size_t len, JpegReadMode mode, + size_t* pos, JPEGData* jpg) { + if (jpg->width != 0) { + JXL_JPEG_DEBUG("Duplicate SOF marker."); + jpg->error = JPEGReadError::DUPLICATE_SOF; + return false; + } + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(8); + size_t marker_len = ReadUint16(data, pos); + int precision = ReadUint8(data, pos); + int height = ReadUint16(data, pos); + int width = ReadUint16(data, pos); + int num_components = ReadUint8(data, pos); + JXL_JPEG_VERIFY_INPUT(precision, 8, 8, PRECISION); + JXL_JPEG_VERIFY_INPUT(height, 1, kMaxDimPixels, HEIGHT); + JXL_JPEG_VERIFY_INPUT(width, 1, kMaxDimPixels, WIDTH); + JXL_JPEG_VERIFY_INPUT(num_components, 1, kMaxComponents, NUMCOMP); + JXL_JPEG_VERIFY_LEN(3 * num_components); + jpg->height = height; + jpg->width = width; + jpg->components.resize(num_components); + + // Read sampling factors and quant table index for each component. + std::vector ids_seen(256, false); + int max_h_samp_factor = 1; + int max_v_samp_factor = 1; + for (size_t i = 0; i < jpg->components.size(); ++i) { + const int id = ReadUint8(data, pos); + if (ids_seen[id]) { // (cf. 
section B.2.2, syntax of Ci) + JXL_JPEG_DEBUG("Duplicate ID %d in SOF.", id); + jpg->error = JPEGReadError::DUPLICATE_COMPONENT_ID; + return false; + } + ids_seen[id] = true; + jpg->components[i].id = id; + int factor = ReadUint8(data, pos); + int h_samp_factor = factor >> 4; + int v_samp_factor = factor & 0xf; + JXL_JPEG_VERIFY_INPUT(h_samp_factor, 1, kBrunsliMaxSampling, SAMP_FACTOR); + JXL_JPEG_VERIFY_INPUT(v_samp_factor, 1, kBrunsliMaxSampling, SAMP_FACTOR); + jpg->components[i].h_samp_factor = h_samp_factor; + jpg->components[i].v_samp_factor = v_samp_factor; + jpg->components[i].quant_idx = ReadUint8(data, pos); + max_h_samp_factor = std::max(max_h_samp_factor, h_samp_factor); + max_v_samp_factor = std::max(max_v_samp_factor, v_samp_factor); + } + + // We have checked above that none of the sampling factors are 0, so the max + // sampling factors can not be 0. + int MCU_rows = DivCeil(jpg->height, max_v_samp_factor * 8); + int MCU_cols = DivCeil(jpg->width, max_h_samp_factor * 8); + // Compute the block dimensions for each component. + for (size_t i = 0; i < jpg->components.size(); ++i) { + JPEGComponent* c = &jpg->components[i]; + if (max_h_samp_factor % c->h_samp_factor != 0 || + max_v_samp_factor % c->v_samp_factor != 0) { + JXL_JPEG_DEBUG("Non-integral subsampling ratios."); + jpg->error = JPEGReadError::INVALID_SAMPLING_FACTORS; + return false; + } + c->width_in_blocks = MCU_cols * c->h_samp_factor; + c->height_in_blocks = MCU_rows * c->v_samp_factor; + const uint64_t num_blocks = + static_cast(c->width_in_blocks) * c->height_in_blocks; + if (num_blocks > kBrunsliMaxNumBlocks) { + JXL_JPEG_DEBUG("Image too large."); + jpg->error = JPEGReadError::IMAGE_TOO_LARGE; + return false; + } + if (mode == JpegReadMode::kReadAll) { + c->coeffs.resize(num_blocks * kDCTBlockSize); + } + } + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Reads the Start of Scan (SOS) marker segment and fills in *scan_info with the +// parsed data. 
+bool ProcessSOS(const uint8_t* data, const size_t len, size_t* pos, + JPEGData* jpg) { + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(3); + size_t marker_len = ReadUint16(data, pos); + size_t comps_in_scan = ReadUint8(data, pos); + JXL_JPEG_VERIFY_INPUT(comps_in_scan, 1, jpg->components.size(), + COMPS_IN_SCAN); + + JPEGScanInfo scan_info; + scan_info.num_components = comps_in_scan; + JXL_JPEG_VERIFY_LEN(2 * comps_in_scan); + std::vector ids_seen(256, false); + for (size_t i = 0; i < comps_in_scan; ++i) { + uint32_t id = ReadUint8(data, pos); + if (ids_seen[id]) { // (cf. section B.2.3, regarding CSj) + JXL_JPEG_DEBUG("Duplicate ID %d in SOS.", id); + jpg->error = JPEGReadError::DUPLICATE_COMPONENT_ID; + return false; + } + ids_seen[id] = true; + bool found_index = false; + for (size_t j = 0; j < jpg->components.size(); ++j) { + if (jpg->components[j].id == id) { + scan_info.components[i].comp_idx = j; + found_index = true; + } + } + if (!found_index) { + JXL_JPEG_DEBUG("SOS marker: Could not find component with id %d", id); + jpg->error = JPEGReadError::COMPONENT_NOT_FOUND; + return false; + } + int c = ReadUint8(data, pos); + int dc_tbl_idx = c >> 4; + int ac_tbl_idx = c & 0xf; + JXL_JPEG_VERIFY_INPUT(dc_tbl_idx, 0, 3, HUFFMAN_INDEX); + JXL_JPEG_VERIFY_INPUT(ac_tbl_idx, 0, 3, HUFFMAN_INDEX); + scan_info.components[i].dc_tbl_idx = dc_tbl_idx; + scan_info.components[i].ac_tbl_idx = ac_tbl_idx; + } + JXL_JPEG_VERIFY_LEN(3); + scan_info.Ss = ReadUint8(data, pos); + scan_info.Se = ReadUint8(data, pos); + JXL_JPEG_VERIFY_INPUT(static_cast(scan_info.Ss), 0, 63, START_OF_SCAN); + JXL_JPEG_VERIFY_INPUT(scan_info.Se, scan_info.Ss, 63, END_OF_SCAN); + int c = ReadUint8(data, pos); + scan_info.Ah = c >> 4; + scan_info.Al = c & 0xf; + if (scan_info.Ah != 0 && scan_info.Al != scan_info.Ah - 1) { + // section G.1.1.1.2 : Successive approximation control only improves + // by one bit at a time. But it's not always respected, so we just issue + // a warning. 
+ JXL_WARNING("Invalid progressive parameters: Al=%d Ah=%d", scan_info.Al, + scan_info.Ah); + } + // Check that all the Huffman tables needed for this scan are defined. + for (size_t i = 0; i < comps_in_scan; ++i) { + bool found_dc_table = false; + bool found_ac_table = false; + for (size_t j = 0; j < jpg->huffman_code.size(); ++j) { + uint32_t slot_id = jpg->huffman_code[j].slot_id; + if (slot_id == scan_info.components[i].dc_tbl_idx) { + found_dc_table = true; + } else if (slot_id == scan_info.components[i].ac_tbl_idx + 16) { + found_ac_table = true; + } + } + if (scan_info.Ss == 0 && !found_dc_table) { + JXL_JPEG_DEBUG( + "SOS marker: Could not find DC Huffman table with index %d", + scan_info.components[i].dc_tbl_idx); + jpg->error = JPEGReadError::HUFFMAN_TABLE_NOT_FOUND; + return false; + } + if (scan_info.Se > 0 && !found_ac_table) { + JXL_JPEG_DEBUG( + "SOS marker: Could not find AC Huffman table with index %d", + scan_info.components[i].ac_tbl_idx); + jpg->error = JPEGReadError::HUFFMAN_TABLE_NOT_FOUND; + return false; + } + } + jpg->scan_info.push_back(scan_info); + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Reads the Define Huffman Table (DHT) marker segment and fills in *jpg with +// the parsed data. Builds the Huffman decoding table in either dc_huff_lut or +// ac_huff_lut, depending on the type and solt_id of Huffman code being read. 
+bool ProcessDHT(const uint8_t* data, const size_t len, JpegReadMode mode, + std::vector* dc_huff_lut, + std::vector* ac_huff_lut, size_t* pos, + JPEGData* jpg) { + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(2); + size_t marker_len = ReadUint16(data, pos); + if (marker_len == 2) { + JXL_JPEG_DEBUG("DHT marker: no Huffman table found"); + jpg->error = JPEGReadError::EMPTY_DHT; + return false; + } + while (*pos < start_pos + marker_len) { + JXL_JPEG_VERIFY_LEN(1 + kJpegHuffmanMaxBitLength); + JPEGHuffmanCode huff; + huff.slot_id = ReadUint8(data, pos); + int huffman_index = huff.slot_id; + int is_ac_table = (huff.slot_id & 0x10) != 0; + HuffmanTableEntry* huff_lut; + if (is_ac_table) { + huffman_index -= 0x10; + JXL_JPEG_VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX); + huff_lut = &(*ac_huff_lut)[huffman_index * kJpegHuffmanLutSize]; + } else { + JXL_JPEG_VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX); + huff_lut = &(*dc_huff_lut)[huffman_index * kJpegHuffmanLutSize]; + } + huff.counts[0] = 0; + int total_count = 0; + int space = 1 << kJpegHuffmanMaxBitLength; + int max_depth = 1; + for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) { + int count = ReadUint8(data, pos); + if (count != 0) { + max_depth = i; + } + huff.counts[i] = count; + total_count += count; + space -= count * (1 << (kJpegHuffmanMaxBitLength - i)); + } + if (is_ac_table) { + JXL_JPEG_VERIFY_INPUT(total_count, 0, kJpegHuffmanAlphabetSize, + HUFFMAN_CODE); + } else { + JXL_JPEG_VERIFY_INPUT(total_count, 0, kJpegDCAlphabetSize, HUFFMAN_CODE); + } + JXL_JPEG_VERIFY_LEN(total_count); + std::vector values_seen(256, false); + for (int i = 0; i < total_count; ++i) { + int value = ReadUint8(data, pos); + if (!is_ac_table) { + JXL_JPEG_VERIFY_INPUT(value, 0, kJpegDCAlphabetSize - 1, HUFFMAN_CODE); + } + if (values_seen[value]) { + JXL_JPEG_DEBUG("Duplicate Huffman code value %d", value); + jpg->error = JPEGReadError::INVALID_HUFFMAN_CODE; + return false; + } + values_seen[value] = true; + 
huff.values[i] = value; + } + // Add an invalid symbol that will have the all 1 code. + ++huff.counts[max_depth]; + huff.values[total_count] = kJpegHuffmanAlphabetSize; + space -= (1 << (kJpegHuffmanMaxBitLength - max_depth)); + if (space < 0) { + JXL_JPEG_DEBUG("Invalid Huffman code lengths."); + jpg->error = JPEGReadError::INVALID_HUFFMAN_CODE; + return false; + } else if (space > 0 && huff_lut[0].value != 0xffff) { + // Re-initialize the values to an invalid symbol so that we can recognize + // it when reading the bit stream using a Huffman code with space > 0. + for (int i = 0; i < kJpegHuffmanLutSize; ++i) { + huff_lut[i].bits = 0; + huff_lut[i].value = 0xffff; + } + } + huff.is_last = (*pos == start_pos + marker_len); + if (mode == JpegReadMode::kReadAll) { + BuildJpegHuffmanTable(&huff.counts[0], &huff.values[0], huff_lut); + } + jpg->huffman_code.push_back(huff); + } + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Reads the Define Quantization Table (DQT) marker segment and fills in *jpg +// with the parsed data. 
+bool ProcessDQT(const uint8_t* data, const size_t len, size_t* pos, + JPEGData* jpg) { + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(2); + size_t marker_len = ReadUint16(data, pos); + if (marker_len == 2) { + JXL_JPEG_DEBUG("DQT marker: no quantization table found"); + jpg->error = JPEGReadError::EMPTY_DQT; + return false; + } + while (*pos < start_pos + marker_len && jpg->quant.size() < kMaxQuantTables) { + JXL_JPEG_VERIFY_LEN(1); + int quant_table_index = ReadUint8(data, pos); + int quant_table_precision = quant_table_index >> 4; + JXL_JPEG_VERIFY_INPUT(quant_table_precision, 0, 1, QUANT_TBL_PRECISION); + quant_table_index &= 0xf; + JXL_JPEG_VERIFY_INPUT(quant_table_index, 0, 3, QUANT_TBL_INDEX); + JXL_JPEG_VERIFY_LEN((quant_table_precision + 1) * kDCTBlockSize); + JPEGQuantTable table; + table.index = quant_table_index; + table.precision = quant_table_precision; + for (size_t i = 0; i < kDCTBlockSize; ++i) { + int quant_val = + quant_table_precision ? ReadUint16(data, pos) : ReadUint8(data, pos); + JXL_JPEG_VERIFY_INPUT(quant_val, 1, 65535, QUANT_VAL); + table.values[kJPEGNaturalOrder[i]] = quant_val; + } + table.is_last = (*pos == start_pos + marker_len); + jpg->quant.push_back(table); + } + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Reads the DRI marker and saves the restart interval into *jpg. +bool ProcessDRI(const uint8_t* data, const size_t len, size_t* pos, + bool* found_dri, JPEGData* jpg) { + if (*found_dri) { + JXL_JPEG_DEBUG("Duplicate DRI marker."); + jpg->error = JPEGReadError::DUPLICATE_DRI; + return false; + } + *found_dri = true; + const size_t start_pos = *pos; + JXL_JPEG_VERIFY_LEN(4); + size_t marker_len = ReadUint16(data, pos); + int restart_interval = ReadUint16(data, pos); + jpg->restart_interval = restart_interval; + JXL_JPEG_VERIFY_MARKER_END(); + return true; +} + +// Saves the APP marker segment as a string to *jpg. 
+bool ProcessAPP(const uint8_t* data, const size_t len, size_t* pos, + JPEGData* jpg) { + JXL_JPEG_VERIFY_LEN(2); + size_t marker_len = ReadUint16(data, pos); + JXL_JPEG_VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN); + JXL_JPEG_VERIFY_LEN(marker_len - 2); + JXL_DASSERT(*pos >= 3); + // Save the marker type together with the app data. + const uint8_t* app_str_start = data + *pos - 3; + std::vector app_str(app_str_start, app_str_start + marker_len + 1); + *pos += marker_len - 2; + jpg->app_data.push_back(app_str); + return true; +} + +// Saves the COM marker segment as a string to *jpg. +bool ProcessCOM(const uint8_t* data, const size_t len, size_t* pos, + JPEGData* jpg) { + JXL_JPEG_VERIFY_LEN(2); + size_t marker_len = ReadUint16(data, pos); + JXL_JPEG_VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN); + JXL_JPEG_VERIFY_LEN(marker_len - 2); + const uint8_t* com_str_start = data + *pos - 3; + std::vector com_str(com_str_start, com_str_start + marker_len + 1); + *pos += marker_len - 2; + jpg->com_data.push_back(com_str); + return true; +} + +// Helper structure to read bits from the entropy coded data segment. +struct BitReaderState { + BitReaderState(const uint8_t* data, const size_t len, size_t pos) + : data_(data), len_(len) { + Reset(pos); + } + + void Reset(size_t pos) { + pos_ = pos; + val_ = 0; + bits_left_ = 0; + next_marker_pos_ = len_ - 2; + FillBitWindow(); + } + + // Returns the next byte and skips the 0xff/0x00 escape sequences. + uint8_t GetNextByte() { + if (pos_ >= next_marker_pos_) { + ++pos_; + return 0; + } + uint8_t c = data_[pos_++]; + if (c == 0xff) { + uint8_t escape = data_[pos_]; + if (escape == 0) { + ++pos_; + } else { + // 0xff was followed by a non-zero byte, which means that we found the + // start of the next marker segment. 
+ next_marker_pos_ = pos_ - 1; + } + } + return c; + } + + void FillBitWindow() { + if (bits_left_ <= 16) { + while (bits_left_ <= 56) { + val_ <<= 8; + val_ |= (uint64_t)GetNextByte(); + bits_left_ += 8; + } + } + } + + int ReadBits(int nbits) { + FillBitWindow(); + uint64_t val = (val_ >> (bits_left_ - nbits)) & ((1ULL << nbits) - 1); + bits_left_ -= nbits; + return val; + } + + // Sets *pos to the next stream position where parsing should continue. + // Enqueue the padding bits seen (0 or 1). + // Returns false if there is inconsistent or invalid padding or the stream + // ended too early. + bool FinishStream(JPEGData* jpg, size_t* pos) { + int npadbits = bits_left_ & 7; + if (npadbits > 0) { + uint64_t padmask = (1ULL << npadbits) - 1; + uint64_t padbits = (val_ >> (bits_left_ - npadbits)) & padmask; + if (padbits != padmask) { + jpg->has_zero_padding_bit = true; + } + for (int i = npadbits - 1; i >= 0; --i) { + jpg->padding_bits.push_back((padbits >> i) & 1); + } + } + // Give back some bytes that we did not use. + int unused_bytes_left = bits_left_ >> 3; + while (unused_bytes_left-- > 0) { + --pos_; + // If we give back a 0 byte, we need to check if it was a 0xff/0x00 escape + // sequence, and if yes, we need to give back one more byte. + if (pos_ < next_marker_pos_ && data_[pos_] == 0 && + data_[pos_ - 1] == 0xff) { + --pos_; + } + } + if (pos_ > next_marker_pos_) { + // Data ran out before the scan was complete. + JXL_JPEG_DEBUG("Unexpected end of scan."); + return false; + } + *pos = pos_; + return true; + } + + const uint8_t* data_; + const size_t len_; + size_t pos_; + uint64_t val_; + int bits_left_; + size_t next_marker_pos_; +}; + +// Returns the next Huffman-coded symbol. 
+int ReadSymbol(const HuffmanTableEntry* table, BitReaderState* br) { + int nbits; + br->FillBitWindow(); + int val = (br->val_ >> (br->bits_left_ - 8)) & 0xff; + table += val; + nbits = table->bits - 8; + if (nbits > 0) { + br->bits_left_ -= 8; + table += table->value; + val = (br->val_ >> (br->bits_left_ - nbits)) & ((1 << nbits) - 1); + table += val; + } + br->bits_left_ -= table->bits; + return table->value; +} + +/** + * Returns the DC diff or AC value for extra bits value x and prefix code s. + * + * CCITT Rec. T.81 (1992 E) + * Table F.1 – Difference magnitude categories for DC coding + * SSSS | DIFF values + * ------+-------------------------- + * 0 | 0 + * 1 | –1, 1 + * 2 | –3, –2, 2, 3 + * 3 | –7..–4, 4..7 + * ......|.......................... + * 11 | –2047..–1024, 1024..2047 + * + * CCITT Rec. T.81 (1992 E) + * Table F.2 – Categories assigned to coefficient values + * [ Same as Table F.1, but does not include SSSS equal to 0 and 11] + * + * + * CCITT Rec. T.81 (1992 E) + * F.1.2.1.1 Structure of DC code table + * For each category,... additional bits... appended... to uniquely identify + * which difference... occurred... When DIFF is positive... SSSS... bits of DIFF + * are appended. When DIFF is negative... SSSS... bits of (DIFF – 1) are + * appended... Most significant bit... is 0 for negative differences and 1 for + * positive differences. + * + * In other words the upper half of extra bits range represents DIFF as is. + * The lower half represents the negative DIFFs with an offset. + */ +int HuffExtend(int x, int s) { + JXL_DASSERT(s >= 1); + int half = 1 << (s - 1); + if (x >= half) { + JXL_DASSERT(x < (1 << s)); + return x; + } else { + return x - (1 << s) + 1; + } +} + +// Decodes one 8x8 block of DCT coefficients from the bit stream. 
+bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff, + const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al, + int* eobrun, bool* reset_state, int* num_zero_runs, + BitReaderState* br, JPEGData* jpg, coeff_t* last_dc_coeff, + coeff_t* coeffs) { + // Nowadays multiplication is even faster than variable shift. + int Am = 1 << Al; + bool eobrun_allowed = Ss > 0; + if (Ss == 0) { + int s = ReadSymbol(dc_huff, br); + if (s >= kJpegDCAlphabetSize) { + JXL_JPEG_DEBUG("Invalid Huffman symbol %d for DC coefficient.", s); + jpg->error = JPEGReadError::INVALID_SYMBOL; + return false; + } + int diff = 0; + if (s > 0) { + int bits = br->ReadBits(s); + diff = HuffExtend(bits, s); + } + int coeff = diff + *last_dc_coeff; + const int dc_coeff = coeff * Am; + coeffs[0] = dc_coeff; + // TODO(eustas): is there a more elegant / explicit way to check this? + if (dc_coeff != coeffs[0]) { + JXL_JPEG_DEBUG("Invalid DC coefficient %d", dc_coeff); + jpg->error = JPEGReadError::NON_REPRESENTABLE_DC_COEFF; + return false; + } + *last_dc_coeff = coeff; + ++Ss; + } + if (Ss > Se) { + return true; + } + if (*eobrun > 0) { + --(*eobrun); + return true; + } + *num_zero_runs = 0; + for (int k = Ss; k <= Se; k++) { + int sr = ReadSymbol(ac_huff, br); + if (sr >= kJpegHuffmanAlphabetSize) { + JXL_JPEG_DEBUG("Invalid Huffman symbol %d for AC coefficient %d", sr, k); + jpg->error = JPEGReadError::INVALID_SYMBOL; + return false; + } + int r = sr >> 4; + int s = sr & 15; + if (s > 0) { + k += r; + if (k > Se) { + JXL_JPEG_DEBUG("Out-of-band coefficient %d band was %d-%d", k, Ss, Se); + jpg->error = JPEGReadError::OUT_OF_BAND_COEFF; + return false; + } + if (s + Al >= kJpegDCAlphabetSize) { + JXL_JPEG_DEBUG( + "Out of range AC coefficient value: s = %d Al = %d k = %d", s, Al, + k); + jpg->error = JPEGReadError::NON_REPRESENTABLE_AC_COEFF; + return false; + } + int bits = br->ReadBits(s); + int coeff = HuffExtend(bits, s); + coeffs[kJPEGNaturalOrder[k]] = coeff * Am; + *num_zero_runs = 0; + } else if 
(r == 15) { + k += 15; + ++(*num_zero_runs); + } else { + if (eobrun_allowed && k == Ss && *eobrun == 0) { + // We have two end-of-block runs right after each other, so we signal + // the jpeg encoder to force a state reset at this point. + *reset_state = true; + } + *eobrun = 1 << r; + if (r > 0) { + if (!eobrun_allowed) { + JXL_JPEG_DEBUG("End-of-block run crossing DC coeff."); + jpg->error = JPEGReadError::EOB_RUN_TOO_LONG; + return false; + } + *eobrun += br->ReadBits(r); + } + break; + } + } + --(*eobrun); + return true; +} + +bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al, + int* eobrun, bool* reset_state, BitReaderState* br, + JPEGData* jpg, coeff_t* coeffs) { + // Nowadays multiplication is even faster than variable shift. + int Am = 1 << Al; + bool eobrun_allowed = Ss > 0; + if (Ss == 0) { + int s = br->ReadBits(1); + coeff_t dc_coeff = coeffs[0]; + dc_coeff |= s * Am; + coeffs[0] = dc_coeff; + ++Ss; + } + if (Ss > Se) { + return true; + } + int p1 = Am; + int m1 = -Am; + int k = Ss; + int r; + int s; + bool in_zero_run = false; + if (*eobrun <= 0) { + for (; k <= Se; k++) { + s = ReadSymbol(ac_huff, br); + if (s >= kJpegHuffmanAlphabetSize) { + JXL_JPEG_DEBUG("Invalid Huffman symbol %d for AC coefficient %d", s, k); + jpg->error = JPEGReadError::INVALID_SYMBOL; + return false; + } + r = s >> 4; + s &= 15; + if (s) { + if (s != 1) { + JXL_JPEG_DEBUG("Invalid Huffman symbol %d for AC coefficient %d", s, + k); + jpg->error = JPEGReadError::INVALID_SYMBOL; + return false; + } + s = br->ReadBits(1) ? p1 : m1; + in_zero_run = false; + } else { + if (r != 15) { + if (eobrun_allowed && k == Ss && *eobrun == 0) { + // We have two end-of-block runs right after each other, so we + // signal the jpeg encoder to force a state reset at this point. 
+ *reset_state = true; + } + *eobrun = 1 << r; + if (r > 0) { + if (!eobrun_allowed) { + JXL_JPEG_DEBUG("End-of-block run crossing DC coeff."); + jpg->error = JPEGReadError::EOB_RUN_TOO_LONG; + return false; + } + *eobrun += br->ReadBits(r); + } + break; + } + in_zero_run = true; + } + do { + coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]]; + if (thiscoef != 0) { + if (br->ReadBits(1)) { + if ((thiscoef & p1) == 0) { + if (thiscoef >= 0) { + thiscoef += p1; + } else { + thiscoef += m1; + } + } + } + coeffs[kJPEGNaturalOrder[k]] = thiscoef; + } else { + if (--r < 0) { + break; + } + } + k++; + } while (k <= Se); + if (s) { + if (k > Se) { + JXL_JPEG_DEBUG("Out-of-band coefficient %d band was %d-%d", k, Ss, + Se); + jpg->error = JPEGReadError::OUT_OF_BAND_COEFF; + return false; + } + coeffs[kJPEGNaturalOrder[k]] = s; + } + } + } + if (in_zero_run) { + JXL_JPEG_DEBUG("Extra zero run before end-of-block."); + jpg->error = JPEGReadError::EXTRA_ZERO_RUN; + return false; + } + if (*eobrun > 0) { + for (; k <= Se; k++) { + coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]]; + if (thiscoef != 0) { + if (br->ReadBits(1)) { + if ((thiscoef & p1) == 0) { + if (thiscoef >= 0) { + thiscoef += p1; + } else { + thiscoef += m1; + } + } + } + coeffs[kJPEGNaturalOrder[k]] = thiscoef; + } + } + } + --(*eobrun); + return true; +} + +bool ProcessRestart(const uint8_t* data, const size_t len, + int* next_restart_marker, BitReaderState* br, + JPEGData* jpg) { + size_t pos = 0; + if (!br->FinishStream(jpg, &pos)) { + jpg->error = JPEGReadError::INVALID_SCAN; + return false; + } + int expected_marker = 0xd0 + *next_restart_marker; + JXL_JPEG_EXPECT_MARKER(); + int marker = data[pos + 1]; + if (marker != expected_marker) { + JXL_JPEG_DEBUG("Did not find expected restart marker %d actual %d", + expected_marker, marker); + jpg->error = JPEGReadError::WRONG_RESTART_MARKER; + return false; + } + br->Reset(pos + 2); + *next_restart_marker += 1; + *next_restart_marker &= 0x7; + return true; +} + 
+bool ProcessScan(const uint8_t* data, const size_t len, + const std::vector& dc_huff_lut, + const std::vector& ac_huff_lut, + uint16_t scan_progression[kMaxComponents][kDCTBlockSize], + bool is_progressive, size_t* pos, JPEGData* jpg) { + if (!ProcessSOS(data, len, pos, jpg)) { + return false; + } + JPEGScanInfo* scan_info = &jpg->scan_info.back(); + bool is_interleaved = (scan_info->num_components > 1); + int max_h_samp_factor = 1; + int max_v_samp_factor = 1; + for (size_t i = 0; i < jpg->components.size(); ++i) { + max_h_samp_factor = + std::max(max_h_samp_factor, jpg->components[i].h_samp_factor); + max_v_samp_factor = + std::max(max_v_samp_factor, jpg->components[i].v_samp_factor); + } + + int MCU_rows = DivCeil(jpg->height, max_v_samp_factor * 8); + int MCUs_per_row = DivCeil(jpg->width, max_h_samp_factor * 8); + if (!is_interleaved) { + const JPEGComponent& c = jpg->components[scan_info->components[0].comp_idx]; + MCUs_per_row = DivCeil(jpg->width * c.h_samp_factor, 8 * max_h_samp_factor); + MCU_rows = DivCeil(jpg->height * c.v_samp_factor, 8 * max_v_samp_factor); + } + coeff_t last_dc_coeff[kMaxComponents] = {0}; + BitReaderState br(data, len, *pos); + int restarts_to_go = jpg->restart_interval; + int next_restart_marker = 0; + int eobrun = -1; + int block_scan_index = 0; + const int Al = is_progressive ? scan_info->Al : 0; + const int Ah = is_progressive ? scan_info->Ah : 0; + const int Ss = is_progressive ? scan_info->Ss : 0; + const int Se = is_progressive ? scan_info->Se : 63; + const uint16_t scan_bitmask = Ah == 0 ? 
(0xffff << Al) : (1u << Al); + const uint16_t refinement_bitmask = (1 << Al) - 1; + for (size_t i = 0; i < scan_info->num_components; ++i) { + int comp_idx = scan_info->components[i].comp_idx; + for (int k = Ss; k <= Se; ++k) { + if (scan_progression[comp_idx][k] & scan_bitmask) { + JXL_JPEG_DEBUG( + "Overlapping scans: component=%d k=%d prev_mask: %u cur_mask %u", + comp_idx, k, scan_progression[i][k], scan_bitmask); + jpg->error = JPEGReadError::OVERLAPPING_SCANS; + return false; + } + if (scan_progression[comp_idx][k] & refinement_bitmask) { + JXL_JPEG_DEBUG( + "Invalid scan order, a more refined scan was already done: " + "component=%d k=%d prev_mask=%u cur_mask=%u", + comp_idx, k, scan_progression[i][k], scan_bitmask); + jpg->error = JPEGReadError::INVALID_SCAN_ORDER; + return false; + } + scan_progression[comp_idx][k] |= scan_bitmask; + } + } + if (Al > 10) { + JXL_JPEG_DEBUG("Scan parameter Al=%d is not supported.", Al); + jpg->error = JPEGReadError::NON_REPRESENTABLE_AC_COEFF; + return false; + } + for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) { + for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) { + // Handle the restart intervals. + if (jpg->restart_interval > 0) { + if (restarts_to_go == 0) { + if (ProcessRestart(data, len, &next_restart_marker, &br, jpg)) { + restarts_to_go = jpg->restart_interval; + memset(static_cast(last_dc_coeff), 0, sizeof(last_dc_coeff)); + if (eobrun > 0) { + JXL_JPEG_DEBUG("End-of-block run too long."); + jpg->error = JPEGReadError::EOB_RUN_TOO_LONG; + return false; + } + eobrun = -1; // fresh start + } else { + return false; + } + } + --restarts_to_go; + } + // Decode one MCU. 
+ for (size_t i = 0; i < scan_info->num_components; ++i) { + JPEGComponentScanInfo* si = &scan_info->components[i]; + JPEGComponent* c = &jpg->components[si->comp_idx]; + const HuffmanTableEntry* dc_lut = + &dc_huff_lut[si->dc_tbl_idx * kJpegHuffmanLutSize]; + const HuffmanTableEntry* ac_lut = + &ac_huff_lut[si->ac_tbl_idx * kJpegHuffmanLutSize]; + int nblocks_y = is_interleaved ? c->v_samp_factor : 1; + int nblocks_x = is_interleaved ? c->h_samp_factor : 1; + for (int iy = 0; iy < nblocks_y; ++iy) { + for (int ix = 0; ix < nblocks_x; ++ix) { + int block_y = mcu_y * nblocks_y + iy; + int block_x = mcu_x * nblocks_x + ix; + int block_idx = block_y * c->width_in_blocks + block_x; + bool reset_state = false; + int num_zero_runs = 0; + coeff_t* coeffs = &c->coeffs[block_idx * kDCTBlockSize]; + if (Ah == 0) { + if (!DecodeDCTBlock(dc_lut, ac_lut, Ss, Se, Al, &eobrun, + &reset_state, &num_zero_runs, &br, jpg, + &last_dc_coeff[si->comp_idx], coeffs)) { + return false; + } + } else { + if (!RefineDCTBlock(ac_lut, Ss, Se, Al, &eobrun, &reset_state, + &br, jpg, coeffs)) { + return false; + } + } + if (reset_state) { + scan_info->reset_points.emplace_back(block_scan_index); + } + if (num_zero_runs > 0) { + JPEGScanInfo::ExtraZeroRunInfo info; + info.block_idx = block_scan_index; + info.num_extra_zero_runs = num_zero_runs; + scan_info->extra_zero_runs.push_back(info); + } + ++block_scan_index; + } + } + } + } + } + if (eobrun > 0) { + JXL_JPEG_DEBUG("End-of-block run too long."); + jpg->error = JPEGReadError::EOB_RUN_TOO_LONG; + return false; + } + if (!br.FinishStream(jpg, pos)) { + jpg->error = JPEGReadError::INVALID_SCAN; + return false; + } + if (*pos > len) { + JXL_JPEG_DEBUG("Unexpected end of file during scan. pos=%zu len=%zu", *pos, + len); + jpg->error = JPEGReadError::UNEXPECTED_EOF; + return false; + } + return true; +} + +// Changes the quant_idx field of the components to refer to the index of the +// quant table in the jpg->quant array. 
+bool FixupIndexes(JPEGData* jpg) { + for (size_t i = 0; i < jpg->components.size(); ++i) { + JPEGComponent* c = &jpg->components[i]; + bool found_index = false; + for (size_t j = 0; j < jpg->quant.size(); ++j) { + if (jpg->quant[j].index == c->quant_idx) { + c->quant_idx = j; + found_index = true; + break; + } + } + if (!found_index) { + JXL_JPEG_DEBUG("Quantization table with index %u not found", + c->quant_idx); + jpg->error = JPEGReadError::QUANT_TABLE_NOT_FOUND; + return false; + } + } + return true; +} + +size_t FindNextMarker(const uint8_t* data, const size_t len, size_t pos) { + // kIsValidMarker[i] == 1 means (0xc0 + i) is a valid marker. + static const uint8_t kIsValidMarker[] = { + 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + }; + size_t num_skipped = 0; + while (pos + 1 < len && (data[pos] != 0xff || data[pos + 1] < 0xc0 || + !kIsValidMarker[data[pos + 1] - 0xc0])) { + ++pos; + ++num_skipped; + } + return num_skipped; +} + +} // namespace + +bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode, + JPEGData* jpg) { + size_t pos = 0; + // Check SOI marker. + JXL_JPEG_EXPECT_MARKER(); + int marker = data[pos + 1]; + pos += 2; + if (marker != 0xd8) { + JXL_JPEG_DEBUG("Did not find expected SOI marker, actual=%d", marker); + jpg->error = JPEGReadError::SOI_NOT_FOUND; + return false; + } + int lut_size = kMaxHuffmanTables * kJpegHuffmanLutSize; + std::vector dc_huff_lut(lut_size); + std::vector ac_huff_lut(lut_size); + bool found_sof = false; + bool found_dri = false; + uint16_t scan_progression[kMaxComponents][kDCTBlockSize] = {{0}}; + + jpg->padding_bits.resize(0); + bool is_progressive = false; // default + do { + // Read next marker. + size_t num_skipped = FindNextMarker(data, len, pos); + if (num_skipped > 0) { + // Add a fake marker to indicate arbitrary in-between-markers data. 
+ jpg->marker_order.push_back(0xff); + jpg->inter_marker_data.emplace_back(data + pos, data + pos + num_skipped); + pos += num_skipped; + } + JXL_JPEG_EXPECT_MARKER(); + marker = data[pos + 1]; + pos += 2; + bool ok = true; + switch (marker) { + case 0xc0: + case 0xc1: + case 0xc2: + is_progressive = (marker == 0xc2); + ok = ProcessSOF(data, len, mode, &pos, jpg); + found_sof = true; + break; + case 0xc4: + ok = ProcessDHT(data, len, mode, &dc_huff_lut, &ac_huff_lut, &pos, jpg); + break; + case 0xd0: + case 0xd1: + case 0xd2: + case 0xd3: + case 0xd4: + case 0xd5: + case 0xd6: + case 0xd7: + // RST markers do not have any data. + break; + case 0xd9: + // Found end marker. + break; + case 0xda: + if (mode == JpegReadMode::kReadAll) { + ok = ProcessScan(data, len, dc_huff_lut, ac_huff_lut, + scan_progression, is_progressive, &pos, jpg); + } + break; + case 0xdb: + ok = ProcessDQT(data, len, &pos, jpg); + break; + case 0xdd: + ok = ProcessDRI(data, len, &pos, &found_dri, jpg); + break; + case 0xe0: + case 0xe1: + case 0xe2: + case 0xe3: + case 0xe4: + case 0xe5: + case 0xe6: + case 0xe7: + case 0xe8: + case 0xe9: + case 0xea: + case 0xeb: + case 0xec: + case 0xed: + case 0xee: + case 0xef: + if (mode != JpegReadMode::kReadTables) { + ok = ProcessAPP(data, len, &pos, jpg); + } + break; + case 0xfe: + if (mode != JpegReadMode::kReadTables) { + ok = ProcessCOM(data, len, &pos, jpg); + } + break; + default: + JXL_JPEG_DEBUG("Unsupported marker: %d pos=%zu len=%zu", marker, pos, + len); + jpg->error = JPEGReadError::UNSUPPORTED_MARKER; + ok = false; + break; + } + if (!ok) { + return false; + } + jpg->marker_order.push_back(marker); + if (mode == JpegReadMode::kReadHeader && found_sof) { + break; + } + } while (marker != 0xd9); + + if (!found_sof) { + JXL_JPEG_DEBUG("Missing SOF marker."); + jpg->error = JPEGReadError::SOF_NOT_FOUND; + return false; + } + + // Supplemental checks. 
+ if (mode == JpegReadMode::kReadAll) { + if (pos < len) { + jpg->tail_data = std::vector(data + pos, data + len); + } + if (!FixupIndexes(jpg)) { + return false; + } + if (jpg->huffman_code.empty()) { + // Section B.2.4.2: "If a table has never been defined for a particular + // destination, then when this destination is specified in a scan header, + // the results are unpredictable." + JXL_JPEG_DEBUG("Need at least one Huffman code table."); + jpg->error = JPEGReadError::HUFFMAN_TABLE_ERROR; + return false; + } + if (jpg->huffman_code.size() >= kMaxDHTMarkers) { + JXL_JPEG_DEBUG("Too many Huffman tables."); + jpg->error = JPEGReadError::HUFFMAN_TABLE_ERROR; + return false; + } + } + return true; +} + +} // namespace jpeg +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.h new file mode 100644 index 0000000000..3fad820e9d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_data_reader.h @@ -0,0 +1,36 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Functions for reading a jpeg byte stream into a JPEGData object. + +#ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_ +#define LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_ + +#include +#include + +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +enum class JpegReadMode { + kReadHeader, // only basic headers + kReadTables, // headers and tables (quant, Huffman, ...) + kReadAll, // everything +}; + +// Parses the JPEG stream contained in data[*pos ... len) and fills in *jpg with +// the parsed information. +// If mode is kReadHeader, it fills in only the image dimensions in *jpg. +// Returns false if the data is not valid JPEG, or if it contains an unsupported +// JPEG feature. 
+bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode, + JPEGData* jpg); + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc new file mode 100644 index 0000000000..38282e640a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc @@ -0,0 +1,103 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h" + +#include "lib/jxl/jpeg/jpeg_data.h" + +namespace jxl { +namespace jpeg { + +// Returns the table width of the next 2nd level table, count is the histogram +// of bit lengths for the remaining symbols, len is the code length of the next +// processed symbol. +static inline int NextTableBitSize(const int* count, int len) { + int left = 1 << (len - kJpegHuffmanRootTableBits); + while (len < static_cast(kJpegHuffmanMaxBitLength)) { + left -= count[len]; + if (left <= 0) break; + ++len; + left <<= 1; + } + return len - kJpegHuffmanRootTableBits; +} + +void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols, + HuffmanTableEntry* lut) { + HuffmanTableEntry code; // current table entry + HuffmanTableEntry* table; // next available space in table + int len; // current code length + int idx; // symbol index + int key; // prefix code + int reps; // number of replicate key values in current table + int low; // low bits for current root entry + int table_bits; // key length of current table + int table_size; // size of current table + + // Make a local copy of the input bit length histogram. 
+ int tmp_count[kJpegHuffmanMaxBitLength + 1] = {0}; + int total_count = 0; + for (len = 1; len <= static_cast(kJpegHuffmanMaxBitLength); ++len) { + tmp_count[len] = count[len]; + total_count += tmp_count[len]; + } + + table = lut; + table_bits = kJpegHuffmanRootTableBits; + table_size = 1 << table_bits; + + // Special case code with only one value. + if (total_count == 1) { + code.bits = 0; + code.value = symbols[0]; + for (key = 0; key < table_size; ++key) { + table[key] = code; + } + return; + } + + // Fill in root table. + key = 0; + idx = 0; + for (len = 1; len <= kJpegHuffmanRootTableBits; ++len) { + for (; tmp_count[len] > 0; --tmp_count[len]) { + code.bits = len; + code.value = symbols[idx++]; + reps = 1 << (kJpegHuffmanRootTableBits - len); + while (reps--) { + table[key++] = code; + } + } + } + + // Fill in 2nd level tables and add pointers to root table. + table += table_size; + table_size = 0; + low = 0; + for (len = kJpegHuffmanRootTableBits + 1; + len <= static_cast(kJpegHuffmanMaxBitLength); ++len) { + for (; tmp_count[len] > 0; --tmp_count[len]) { + // Start a new sub-table if the previous one is full. + if (low >= table_size) { + table += table_size; + table_bits = NextTableBitSize(tmp_count, len); + table_size = 1 << table_bits; + low = 0; + lut[key].bits = table_bits + kJpegHuffmanRootTableBits; + lut[key].value = (table - lut) - key; + ++key; + } + code.bits = len - kJpegHuffmanRootTableBits; + code.value = symbols[idx++]; + reps = 1 << (table_bits - code.bits); + while (reps--) { + table[low++] = code; + } + } + } +} + +} // namespace jpeg +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.h new file mode 100644 index 0000000000..b8a60e4107 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/enc_jpeg_huffman_decode.h @@ -0,0 +1,41 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Utility function for building a Huffman lookup table for the jpeg decoder. + +#ifndef LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_ +#define LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_ + +#include + +namespace jxl { +namespace jpeg { + +constexpr int kJpegHuffmanRootTableBits = 8; +// Maximum huffman lookup table size. +// According to zlib/examples/enough.c, 758 entries are always enough for +// an alphabet of 257 symbols (256 + 1 special symbol for the all 1s code) and +// max bit length 16 if the root table has 8 bits. +constexpr int kJpegHuffmanLutSize = 758; + +struct HuffmanTableEntry { + // Initialize the value to an invalid symbol so that we can recognize it + // when reading the bit stream using a Huffman code with space > 0. + HuffmanTableEntry() : bits(0), value(0xffff) {} + + uint8_t bits; // number of bits used for this symbol + uint16_t value; // symbol value or table offset +}; + +// Builds jpeg-style Huffman lookup table from the given symbols. +// The symbols are in order of increasing bit lengths. The number of symbols +// with bit length n is given in counts[n] for each n >= 1. +void BuildJpegHuffmanTable(const uint32_t* counts, const uint32_t* symbols, + HuffmanTableEntry* lut); + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc new file mode 100644 index 0000000000..42e5a4921c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.cc @@ -0,0 +1,448 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/jpeg/jpeg_data.h" + +#include "lib/jxl/base/status.h" + +namespace jxl { +namespace jpeg { + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + +namespace { +enum JPEGComponentType : uint32_t { + kGray = 0, + kYCbCr = 1, + kRGB = 2, + kCustom = 3, +}; + +struct JPEGInfo { + size_t num_app_markers = 0; + size_t num_com_markers = 0; + size_t num_scans = 0; + size_t num_intermarker = 0; + bool has_dri = false; +}; + +Status VisitMarker(uint8_t* marker, Visitor* visitor, JPEGInfo* info) { + uint32_t marker32 = *marker - 0xc0; + JXL_RETURN_IF_ERROR(visitor->Bits(6, 0x00, &marker32)); + *marker = marker32 + 0xc0; + if ((*marker & 0xf0) == 0xe0) { + info->num_app_markers++; + } + if (*marker == 0xfe) { + info->num_com_markers++; + } + if (*marker == 0xda) { + info->num_scans++; + } + // We use a fake 0xff marker to signal intermarker data. + if (*marker == 0xff) { + info->num_intermarker++; + } + if (*marker == 0xdd) { + info->has_dri = true; + } + return true; +} + +} // namespace + +Status JPEGData::VisitFields(Visitor* visitor) { + bool is_gray = components.size() == 1; + JXL_RETURN_IF_ERROR(visitor->Bool(false, &is_gray)); + if (visitor->IsReading()) { + components.resize(is_gray ? 1 : 3); + } + JPEGInfo info; + if (visitor->IsReading()) { + uint8_t marker = 0xc0; + do { + JXL_RETURN_IF_ERROR(VisitMarker(&marker, visitor, &info)); + marker_order.push_back(marker); + if (marker_order.size() > 16384) { + return JXL_FAILURE("Too many markers: %zu\n", marker_order.size()); + } + } while (marker != 0xd9); + } else { + if (marker_order.size() > 16384) { + return JXL_FAILURE("Too many markers: %zu\n", marker_order.size()); + } + for (size_t i = 0; i < marker_order.size(); i++) { + JXL_RETURN_IF_ERROR(VisitMarker(&marker_order[i], visitor, &info)); + } + if (!marker_order.empty()) { + // Last marker should always be EOI marker. + JXL_CHECK(marker_order.back() == 0xd9); + } + } + + // Size of the APP and COM markers. 
+ if (visitor->IsReading()) { + app_data.resize(info.num_app_markers); + app_marker_type.resize(info.num_app_markers); + com_data.resize(info.num_com_markers); + scan_info.resize(info.num_scans); + } + JXL_ASSERT(app_data.size() == info.num_app_markers); + JXL_ASSERT(app_marker_type.size() == info.num_app_markers); + JXL_ASSERT(com_data.size() == info.num_com_markers); + JXL_ASSERT(scan_info.size() == info.num_scans); + for (size_t i = 0; i < app_data.size(); i++) { + auto& app = app_data[i]; + // Encodes up to 8 different values. + JXL_RETURN_IF_ERROR( + visitor->U32(Val(0), Val(1), BitsOffset(1, 2), BitsOffset(2, 4), 0, + reinterpret_cast(&app_marker_type[i]))); + if (app_marker_type[i] != AppMarkerType::kUnknown && + app_marker_type[i] != AppMarkerType::kICC && + app_marker_type[i] != AppMarkerType::kExif && + app_marker_type[i] != AppMarkerType::kXMP) { + return JXL_FAILURE("Unknown app marker type %u", + static_cast(app_marker_type[i])); + } + uint32_t len = app.size() - 1; + JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len)); + if (visitor->IsReading()) app.resize(len + 1); + if (app.size() < 3) { + return JXL_FAILURE("Invalid marker size: %zu\n", app.size()); + } + } + for (auto& com : com_data) { + uint32_t len = com.size() - 1; + JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len)); + if (visitor->IsReading()) com.resize(len + 1); + if (com.size() < 3) { + return JXL_FAILURE("Invalid marker size: %zu\n", com.size()); + } + } + + uint32_t num_quant_tables = quant.size(); + JXL_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), Val(4), 2, &num_quant_tables)); + if (num_quant_tables == 4) { + return JXL_FAILURE("Invalid number of quant tables"); + } + if (visitor->IsReading()) { + quant.resize(num_quant_tables); + } + for (size_t i = 0; i < num_quant_tables; i++) { + if (quant[i].precision > 1) { + return JXL_FAILURE( + "Quant tables with more than 16 bits are not supported"); + } + JXL_RETURN_IF_ERROR(visitor->Bits(1, 0, &quant[i].precision)); + 
JXL_RETURN_IF_ERROR(visitor->Bits(2, i, &quant[i].index)); + JXL_RETURN_IF_ERROR(visitor->Bool(true, &quant[i].is_last)); + } + + JPEGComponentType component_type = + components.size() == 1 && components[0].id == 1 + ? JPEGComponentType::kGray + : components.size() == 3 && components[0].id == 1 && + components[1].id == 2 && components[2].id == 3 + ? JPEGComponentType::kYCbCr + : components.size() == 3 && components[0].id == 'R' && + components[1].id == 'G' && components[2].id == 'B' + ? JPEGComponentType::kRGB + : JPEGComponentType::kCustom; + JXL_RETURN_IF_ERROR( + visitor->Bits(2, JPEGComponentType::kYCbCr, + reinterpret_cast(&component_type))); + uint32_t num_components; + if (component_type == JPEGComponentType::kGray) { + num_components = 1; + } else if (component_type != JPEGComponentType::kCustom) { + num_components = 3; + } else { + num_components = components.size(); + JXL_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), Val(4), 3, &num_components)); + if (num_components != 1 && num_components != 3) { + return JXL_FAILURE("Invalid number of components: %u", num_components); + } + } + if (visitor->IsReading()) { + components.resize(num_components); + } + if (component_type == JPEGComponentType::kCustom) { + for (size_t i = 0; i < components.size(); i++) { + JXL_RETURN_IF_ERROR(visitor->Bits(8, 0, &components[i].id)); + } + } else if (component_type == JPEGComponentType::kGray) { + components[0].id = 1; + } else if (component_type == JPEGComponentType::kRGB) { + components[0].id = 'R'; + components[1].id = 'G'; + components[2].id = 'B'; + } else { + components[0].id = 1; + components[1].id = 2; + components[2].id = 3; + } + size_t used_tables = 0; + for (size_t i = 0; i < components.size(); i++) { + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &components[i].quant_idx)); + if (components[i].quant_idx >= quant.size()) { + return JXL_FAILURE("Invalid quant table for component %zu: %u\n", i, + components[i].quant_idx); + } + used_tables |= 1U << 
components[i].quant_idx; + } + if (used_tables + 1 != 1U << quant.size()) { + return JXL_FAILURE( + "Not all quant tables are used (%zu tables, %zx used table mask)", + quant.size(), used_tables); + } + + uint32_t num_huff = huffman_code.size(); + JXL_RETURN_IF_ERROR(visitor->U32(Val(4), BitsOffset(3, 2), BitsOffset(4, 10), + BitsOffset(6, 26), 4, &num_huff)); + if (visitor->IsReading()) { + huffman_code.resize(num_huff); + } + for (JPEGHuffmanCode& hc : huffman_code) { + bool is_ac = hc.slot_id >> 4; + uint32_t id = hc.slot_id & 0xF; + JXL_RETURN_IF_ERROR(visitor->Bool(false, &is_ac)); + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &id)); + hc.slot_id = (static_cast(is_ac) << 4) | id; + JXL_RETURN_IF_ERROR(visitor->Bool(true, &hc.is_last)); + size_t num_symbols = 0; + for (size_t i = 0; i <= 16; i++) { + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(3, 2), + Bits(8), 0, &hc.counts[i])); + num_symbols += hc.counts[i]; + } + if (num_symbols < 1) { + // Actually, at least 2 symbols are required, since one of them is EOI. + return JXL_FAILURE("Empty Huffman table"); + } + if (num_symbols > hc.values.size()) { + return JXL_FAILURE("Huffman code too large (%zu)", num_symbols); + } + // Presence flags for 4 * 64 + 1 values. + uint64_t value_slots[5] = {}; + for (size_t i = 0; i < num_symbols; i++) { + // Goes up to 256, included. Might have the same symbol appear twice... + JXL_RETURN_IF_ERROR(visitor->U32(Bits(2), BitsOffset(2, 4), + BitsOffset(4, 8), BitsOffset(8, 1), 0, + &hc.values[i])); + value_slots[hc.values[i] >> 6] |= (uint64_t)1 << (hc.values[i] & 0x3F); + } + if (hc.values[num_symbols - 1] != kJpegHuffmanAlphabetSize) { + return JXL_FAILURE("Missing EOI symbol"); + } + // Last element, denoting EOI, have to be 1 after the loop. 
+ JXL_ASSERT(value_slots[4] == 1); + size_t num_values = 1; + for (size_t i = 0; i < 4; ++i) num_values += hwy::PopCount(value_slots[i]); + if (num_values != num_symbols) { + return JXL_FAILURE("Duplicate Huffman symbols"); + } + if (!is_ac) { + bool only_dc = ((value_slots[0] >> kJpegDCAlphabetSize) | value_slots[1] | + value_slots[2] | value_slots[3]) == 0; + if (!only_dc) return JXL_FAILURE("Huffman symbols out of DC range"); + } + } + + for (auto& scan : scan_info) { + JXL_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), Val(4), 1, &scan.num_components)); + if (scan.num_components >= 4) { + return JXL_FAILURE("Invalid number of components in SOS marker"); + } + JXL_RETURN_IF_ERROR(visitor->Bits(6, 0, &scan.Ss)); + JXL_RETURN_IF_ERROR(visitor->Bits(6, 63, &scan.Se)); + JXL_RETURN_IF_ERROR(visitor->Bits(4, 0, &scan.Al)); + JXL_RETURN_IF_ERROR(visitor->Bits(4, 0, &scan.Ah)); + for (size_t i = 0; i < scan.num_components; i++) { + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].comp_idx)); + if (scan.components[i].comp_idx >= components.size()) { + return JXL_FAILURE("Invalid component idx in SOS marker"); + } + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].ac_tbl_idx)); + JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].dc_tbl_idx)); + } + // TODO(veluca): actually set and use this value. + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), Val(2), BitsOffset(3, 3), + kMaxNumPasses - 1, + &scan.last_needed_pass)); + } + + // From here on, this is data that is not strictly necessary to get a valid + // JPEG, but necessary for bit-exact JPEG reconstruction. 
+ if (info.has_dri) { + JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &restart_interval)); + } + + uint64_t padding_spot_limit = scan_info.size(); + + for (auto& scan : scan_info) { + uint32_t num_reset_points = scan.reset_points.size(); + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4), + BitsOffset(16, 20), 0, &num_reset_points)); + if (visitor->IsReading()) { + scan.reset_points.resize(num_reset_points); + } + int last_block_idx = -1; + for (auto& block_idx : scan.reset_points) { + block_idx -= last_block_idx + 1; + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(3, 1), + BitsOffset(5, 9), BitsOffset(28, 41), 0, + &block_idx)); + block_idx += last_block_idx + 1; + if (static_cast(block_idx) < last_block_idx + 1) { + return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx, + last_block_idx); + } + // TODO(eustas): better upper boundary could be given at this point; also + // it could be applied during reset_points reading. + if (block_idx > (1u << 30)) { + // At most 8K x 8K x num_channels blocks are expected. That is, + // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane + // image. 
+ return JXL_FAILURE("Invalid block ID: %u", block_idx); + } + last_block_idx = block_idx; + } + + uint32_t num_extra_zero_runs = scan.extra_zero_runs.size(); + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4), + BitsOffset(16, 20), 0, + &num_extra_zero_runs)); + if (visitor->IsReading()) { + scan.extra_zero_runs.resize(num_extra_zero_runs); + } + last_block_idx = -1; + for (size_t i = 0; i < scan.extra_zero_runs.size(); ++i) { + uint32_t& block_idx = scan.extra_zero_runs[i].block_idx; + JXL_RETURN_IF_ERROR(visitor->U32( + Val(1), BitsOffset(2, 2), BitsOffset(4, 5), BitsOffset(8, 20), 1, + &scan.extra_zero_runs[i].num_extra_zero_runs)); + block_idx -= last_block_idx + 1; + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(3, 1), + BitsOffset(5, 9), BitsOffset(28, 41), 0, + &block_idx)); + block_idx += last_block_idx + 1; + if (static_cast(block_idx) < last_block_idx + 1) { + return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx, + last_block_idx); + } + if (block_idx > (1u << 30)) { + // At most 8K x 8K x num_channels blocks are expected. That is, + // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane + // image. + return JXL_FAILURE("Invalid block ID: %u", block_idx); + } + last_block_idx = block_idx; + } + + if (restart_interval > 0) { + int MCUs_per_row = 0; + int MCU_rows = 0; + CalculateMcuSize(scan, &MCUs_per_row, &MCU_rows); + padding_spot_limit += DivCeil(MCU_rows * MCUs_per_row, restart_interval); + } + } + std::vector inter_marker_data_sizes; + inter_marker_data_sizes.reserve(info.num_intermarker); + for (size_t i = 0; i < info.num_intermarker; ++i) { + uint32_t len = visitor->IsReading() ? 
0 : inter_marker_data[i].size(); + JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len)); + if (visitor->IsReading()) inter_marker_data_sizes.emplace_back(len); + } + uint32_t tail_data_len = tail_data.size(); + JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(8, 1), + BitsOffset(16, 257), BitsOffset(22, 65793), + 0, &tail_data_len)); + + JXL_RETURN_IF_ERROR(visitor->Bool(false, &has_zero_padding_bit)); + if (has_zero_padding_bit) { + uint32_t nbit = padding_bits.size(); + JXL_RETURN_IF_ERROR(visitor->Bits(24, 0, &nbit)); + if (nbit > 7 * padding_spot_limit) { + return JXL_FAILURE("Number of padding bits does not correspond to image"); + } + // TODO(eustas): check that that much bits of input are available. + if (visitor->IsReading()) { + padding_bits.resize(nbit); + } + // TODO(eustas): read in (8-64?) bit groups to reduce overhead. + for (uint8_t& bit : padding_bits) { + bool bbit = bit; + JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit)); + bit = bbit; + } + } + + // Apply postponed actions. + if (visitor->IsReading()) { + tail_data.resize(tail_data_len); + JXL_ASSERT(inter_marker_data_sizes.size() == info.num_intermarker); + inter_marker_data.reserve(info.num_intermarker); + for (size_t i = 0; i < info.num_intermarker; ++i) { + inter_marker_data.emplace_back(inter_marker_data_sizes[i]); + } + } + + return true; +} + +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +void JPEGData::CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row, + int* MCU_rows) const { + const bool is_interleaved = (scan.num_components > 1); + const JPEGComponent& base_component = components[scan.components[0].comp_idx]; + // h_group / v_group act as numerators for converting number of blocks to + // number of MCU. In interleaved mode it is 1, so MCU is represented with + // max_*_samp_factor blocks. In non-interleaved mode we choose numerator to + // be the samping factor, consequently MCU is always represented with single + // block. + const int h_group = is_interleaved ? 
1 : base_component.h_samp_factor; + const int v_group = is_interleaved ? 1 : base_component.v_samp_factor; + int max_h_samp_factor = 1; + int max_v_samp_factor = 1; + for (const auto& c : components) { + max_h_samp_factor = std::max(c.h_samp_factor, max_h_samp_factor); + max_v_samp_factor = std::max(c.v_samp_factor, max_v_samp_factor); + } + *MCUs_per_row = DivCeil(width * h_group, 8 * max_h_samp_factor); + *MCU_rows = DivCeil(height * v_group, 8 * max_v_samp_factor); +} + +#if JPEGXL_ENABLE_TRANSCODE_JPEG + +Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data) { + size_t icc_pos = 0; + for (size_t i = 0; i < jpeg_data->app_data.size(); i++) { + if (jpeg_data->app_marker_type[i] != jpeg::AppMarkerType::kICC) { + continue; + } + size_t len = jpeg_data->app_data[i].size() - 17; + if (icc_pos + len > icc.size()) { + return JXL_FAILURE( + "ICC length is less than APP markers: requested %zu more bytes, " + "%zu available", + len, icc.size() - icc_pos); + } + memcpy(&jpeg_data->app_data[i][17], icc.data() + icc_pos, len); + icc_pos += len; + } + if (icc_pos != icc.size() && icc_pos != 0) { + return JXL_FAILURE("ICC length is more than APP markers"); + } + return true; +} + +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +} // namespace jpeg +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.h new file mode 100644 index 0000000000..6b7cb02aad --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jpeg/jpeg_data.h @@ -0,0 +1,267 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Data structures that represent the non-pixel contents of a jpeg file. 
+ +#ifndef LIB_JXL_JPEG_JPEG_DATA_H_ +#define LIB_JXL_JPEG_JPEG_DATA_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/common.h" // JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/fields.h" + +namespace jxl { +namespace jpeg { + +constexpr int kMaxComponents = 4; +constexpr int kMaxQuantTables = 4; +constexpr int kMaxHuffmanTables = 4; +constexpr size_t kJpegHuffmanMaxBitLength = 16; +constexpr int kJpegHuffmanAlphabetSize = 256; +constexpr int kJpegDCAlphabetSize = 12; +constexpr int kMaxDHTMarkers = 512; +constexpr int kMaxDimPixels = 65535; +constexpr uint8_t kApp1 = 0xE1; +constexpr uint8_t kApp2 = 0xE2; +const uint8_t kIccProfileTag[12] = "ICC_PROFILE"; +const uint8_t kExifTag[6] = "Exif\0"; +const uint8_t kXMPTag[29] = "http://ns.adobe.com/xap/1.0/"; + +/* clang-format off */ +constexpr uint32_t kJPEGNaturalOrder[80] = { + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63, + // extra entries for safety in decoder + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63 +}; + +constexpr uint32_t kJPEGZigZagOrder[64] = { + 0, 1, 5, 6, 14, 15, 27, 28, + 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, + 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, + 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, + 35, 36, 48, 49, 57, 58, 62, 63 +}; +/* clang-format on */ + +enum struct JPEGReadError { + OK = 0, + SOI_NOT_FOUND, + SOF_NOT_FOUND, + UNEXPECTED_EOF, + MARKER_BYTE_NOT_FOUND, + UNSUPPORTED_MARKER, + WRONG_MARKER_SIZE, + INVALID_PRECISION, + INVALID_WIDTH, + INVALID_HEIGHT, + INVALID_NUMCOMP, + INVALID_SAMP_FACTOR, + INVALID_START_OF_SCAN, + INVALID_END_OF_SCAN, + INVALID_SCAN_BIT_POSITION, + INVALID_COMPS_IN_SCAN, + INVALID_HUFFMAN_INDEX, + INVALID_QUANT_TBL_INDEX, + INVALID_QUANT_VAL, 
+ INVALID_MARKER_LEN, + INVALID_SAMPLING_FACTORS, + INVALID_HUFFMAN_CODE, + INVALID_SYMBOL, + NON_REPRESENTABLE_DC_COEFF, + NON_REPRESENTABLE_AC_COEFF, + INVALID_SCAN, + OVERLAPPING_SCANS, + INVALID_SCAN_ORDER, + EXTRA_ZERO_RUN, + DUPLICATE_DRI, + DUPLICATE_SOF, + WRONG_RESTART_MARKER, + DUPLICATE_COMPONENT_ID, + COMPONENT_NOT_FOUND, + HUFFMAN_TABLE_NOT_FOUND, + HUFFMAN_TABLE_ERROR, + QUANT_TABLE_NOT_FOUND, + EMPTY_DHT, + EMPTY_DQT, + OUT_OF_BAND_COEFF, + EOB_RUN_TOO_LONG, + IMAGE_TOO_LARGE, + INVALID_QUANT_TBL_PRECISION, +}; + +// Quantization values for an 8x8 pixel block. +struct JPEGQuantTable { + std::array values; + uint32_t precision = 0; + // The index of this quantization table as it was parsed from the input JPEG. + // Each DQT marker segment contains an 'index' field, and we save this index + // here. Valid values are 0 to 3. + uint32_t index = 0; + // Set to true if this table is the last one within its marker segment. + bool is_last = true; +}; + +// Huffman code and decoding lookup table used for DC and AC coefficients. +struct JPEGHuffmanCode { + // Bit length histogram. + std::array counts = {}; + // Symbol values sorted by increasing bit lengths. + std::array values = {}; + // The index of the Huffman code in the current set of Huffman codes. For AC + // component Huffman codes, 0x10 is added to the index. + int slot_id = 0; + // Set to true if this Huffman code is the last one within its marker segment. + bool is_last = true; +}; + +// Huffman table indexes used for one component of one scan. +struct JPEGComponentScanInfo { + uint32_t comp_idx; + uint32_t dc_tbl_idx; + uint32_t ac_tbl_idx; +}; + +// Contains information that is used in one scan. +struct JPEGScanInfo { + // Parameters used for progressive scans (named the same way as in the spec): + // Ss : Start of spectral band in zig-zag sequence. + // Se : End of spectral band in zig-zag sequence. + // Ah : Successive approximation bit position, high. 
+ // Al : Successive approximation bit position, low. + uint32_t Ss; + uint32_t Se; + uint32_t Ah; + uint32_t Al; + uint32_t num_components = 0; + std::array components; + // Last codestream pass that is needed to write this scan. + uint32_t last_needed_pass = 0; + + // Extra information required for bit-precise JPEG file reconstruction. + + // Set of block indexes where the JPEG encoder has to flush the end-of-block + // runs and refinement bits. + std::vector reset_points; + // The number of extra zero runs (Huffman symbol 0xf0) before the end of + // block (if nonzero), indexed by block index. + // All of these symbols can be omitted without changing the pixel values, but + // some jpeg encoders put these at the end of blocks. + typedef struct { + uint32_t block_idx; + uint32_t num_extra_zero_runs; + } ExtraZeroRunInfo; + std::vector extra_zero_runs; +}; + +typedef int16_t coeff_t; + +// Represents one component of a jpeg file. +struct JPEGComponent { + JPEGComponent() + : id(0), + h_samp_factor(1), + v_samp_factor(1), + quant_idx(0), + width_in_blocks(0), + height_in_blocks(0) {} + + // One-byte id of the component. + uint32_t id; + // Horizontal and vertical sampling factors. + // In interleaved mode, each minimal coded unit (MCU) has + // h_samp_factor x v_samp_factor DCT blocks from this component. + int h_samp_factor; + int v_samp_factor; + // The index of the quantization table used for this component. + uint32_t quant_idx; + // The dimensions of the component measured in 8x8 blocks. + uint32_t width_in_blocks; + uint32_t height_in_blocks; + // The DCT coefficients of this component, laid out block-by-block, divided + // through the quantization matrix values. + std::vector coeffs; +}; + +enum class AppMarkerType : uint32_t { + kUnknown = 0, + kICC = 1, + kExif = 2, + kXMP = 3, +}; + +// Represents a parsed jpeg file. 
+struct JPEGData : public Fields { + JPEGData() + : width(0), + height(0), + restart_interval(0), + error(JPEGReadError::OK), + has_zero_padding_bit(false) {} + + const char* Name() const override { return "JPEGData"; } +#if JPEGXL_ENABLE_TRANSCODE_JPEG + // Doesn't serialize everything - skips brotli-encoded data and what is + // already encoded in the codestream. + Status VisitFields(Visitor* visitor) override; +#else + Status VisitFields(Visitor* /* visitor */) override { + JXL_ABORT("JPEG transcoding support not enabled"); + } +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + + void CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row, + int* MCU_rows) const; + + int width; + int height; + uint32_t restart_interval; + std::vector> app_data; + std::vector app_marker_type; + std::vector> com_data; + std::vector quant; + std::vector huffman_code; + std::vector components; + std::vector scan_info; + std::vector marker_order; + std::vector> inter_marker_data; + std::vector tail_data; + JPEGReadError error; + + // Extra information required for bit-precise JPEG file reconstruction. + + bool has_zero_padding_bit; + std::vector padding_bits; +}; + +#if JPEGXL_ENABLE_TRANSCODE_JPEG +// Set ICC profile in jpeg_data. 
+Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data); +#else +static JXL_INLINE Status SetJPEGDataFromICC(const PaddedBytes& /* icc */, + jpeg::JPEGData* /* jpeg_data */) { + JXL_ABORT("JPEG transcoding support not enabled"); +} +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +} // namespace jpeg +} // namespace jxl + +#endif // LIB_JXL_JPEG_JPEG_DATA_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl.syms b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl.syms new file mode 100644 index 0000000000..0f398d7151 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl.syms @@ -0,0 +1,5 @@ +{ + extern "C" { + jpegxl_*; + }; +}; diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl.version b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl.version new file mode 100644 index 0000000000..e0ed12be25 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl.version @@ -0,0 +1,7 @@ +JXL_0 { + global: + Jxl*; + + local: + *; +}; diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc new file mode 100644 index 0000000000..78c7d8d8e8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_decode.cc @@ -0,0 +1,2217 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "jxl/decode.h" + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_external_image.h" +#include "lib/jxl/dec_frame.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/dec_reconstruct.h" +#include "lib/jxl/decode_to_jpeg.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/icc_codec.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/memory_manager_internal.h" +#include "lib/jxl/toc.h" + +#ifndef JPEGXL_MAJOR_VERSION +#define JPEGXL_MAJOR_VERSION 0 +#define JPEGXL_MINOR_VERSION 5 +#define JPEGXL_PATCH_VERSION 0 +#endif + +namespace { + +// If set (by fuzzer) then some operations will fail, if those would require +// allocating large objects. Actual memory usage might be two orders of +// magnitude bigger. +// TODO(eustas): this is a poor-mans replacement for memory-manager approach; +// remove, once memory-manager actually works. +size_t memory_limit_base_ = 0; +size_t cpu_limit_base_ = 0; +size_t used_cpu_base_ = 0; + +bool CheckSizeLimit(size_t xsize, size_t ysize) { + if (!memory_limit_base_) return true; + if (xsize == 0 || ysize == 0) return true; + size_t num_pixels = xsize * ysize; + if (num_pixels / xsize != ysize) return false; // overflow + if (num_pixels > memory_limit_base_) return false; + return true; +} + +// Checks if a + b > size, taking possible integer overflow into account. +bool OutOfBounds(size_t a, size_t b, size_t size) { + size_t pos = a + b; + if (pos > size) return true; + if (pos < a) return true; // overflow happened + return false; +} + +// Checks if a + b + c > size, taking possible integer overflow into account. 
+// Returns true if a + b + c wraps around size_t, or if that sum is larger
+// than size. A sum exactly equal to size is considered in bounds.
+bool OutOfBounds(size_t a, size_t b, size_t c, size_t size) {
+  size_t pos = a + b;
+  if (pos < b) return true;  // overflow happened
+  pos += c;
+  if (pos < c) return true;  // overflow happened
+  if (pos > size) return true;
+  return false;
+}
+
+// Returns true if a + b + c wraps around size_t, false otherwise.
+bool SumOverflows(size_t a, size_t b, size_t c) {
+  size_t sum = a + b;
+  if (sum < b) return true;
+  sum += c;
+  if (sum < c) return true;
+  return false;
+}
+
+// Returns the initial value for the basic-info size hint: the container
+// header size plus the worst-case codestream basic info size (48 + 50 bytes).
+JXL_INLINE size_t InitialBasicInfoSizeHint() {
+  // Amount of bytes before the start of the codestream in the container format,
+  // assuming that the codestream is the first box after the signature and
+  // filetype boxes. 12 bytes signature box + 20 bytes filetype box + 16 bytes
+  // codestream box length + name + optional XLBox length.
+  const size_t container_header_size = 48;
+
+  // Worst-case amount of bytes for basic info of the JPEG XL codestream header,
+  // that is all information up to and including extra_channel_bits. Up to
+  // around 2 bytes signature + 8 bytes SizeHeader + 31 bytes ColorEncoding + 4
+  // bytes rest of ImageMetadata + 5 bytes part of ImageMetadata2.
+  // TODO(lode): recompute and update this value when alpha_bits is moved to
+  // extra channels info.
+  const size_t max_codestream_basic_info_size = 50;
+
+  return container_header_size + max_codestream_basic_info_size;
+}
+
+// Debug-printing failure macro similar to JXL_FAILURE, but for the status code
+// JXL_DEC_ERROR
+// Both variants are comma expressions whose value is JXL_DEC_ERROR, so the
+// macro can be used directly in a return statement. With JXL_CRASH_ON_ERROR
+// defined the message is always printed and ::jxl::Abort() is called; without
+// it the message is only printed when JXL_DEBUG_ON_ERROR is truthy.
+#ifdef JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(format, ...)                                           \
+  (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort(), JXL_DEC_ERROR)
+#else  // JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(format, ...)                                             \
+  (((JXL_DEBUG_ON_ERROR) &&                                                    \
+    ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \
+   JXL_DEC_ERROR)
+#endif  // JXL_CRASH_ON_ERROR
+
+// Identity overload: a JxlDecoderStatus is passed through unchanged.
+JxlDecoderStatus ConvertStatus(JxlDecoderStatus status) { return status; }
+
+// Maps a jxl::Status to the API status: JXL_DEC_SUCCESS when the status
+// evaluates to true, JXL_DEC_ERROR otherwise.
+JxlDecoderStatus ConvertStatus(jxl::Status status) {
+  return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR;
+}
+
+// Checks for a JPEG XL signature starting at buf[*pos], where len is the total
+// amount of bytes in buf. On a match *pos is advanced past the signature (2
+// bytes for a bare codestream, 12 bytes for the container) and
+// JXL_SIG_CODESTREAM or JXL_SIG_CONTAINER is returned. Otherwise *pos is left
+// unchanged and the result is JXL_SIG_NOT_ENOUGH_BYTES when more bytes are
+// needed to decide, or JXL_SIG_INVALID when the bytes cannot be a signature.
+JxlSignature ReadSignature(const uint8_t* buf, size_t len, size_t* pos) {
+  if (*pos >= len) return JXL_SIG_NOT_ENOUGH_BYTES;
+
+  // From here on buf and len describe only the bytes starting at *pos.
+  buf += *pos;
+  len -= *pos;
+
+  // JPEG XL codestream: 0xff 0x0a
+  if (len >= 1 && buf[0] == 0xff) {
+    if (len < 2) {
+      return JXL_SIG_NOT_ENOUGH_BYTES;
+    } else if (buf[1] == jxl::kCodestreamMarker) {
+      *pos += 2;
+      return JXL_SIG_CODESTREAM;
+    } else {
+      return JXL_SIG_INVALID;
+    }
+  }
+
+  // JPEG XL container
+  if (len >= 1 && buf[0] == 0) {
+    if (len < 12) {
+      return JXL_SIG_NOT_ENOUGH_BYTES;
+    } else if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0xC && buf[4] == 'J' &&
+               buf[5] == 'X' && buf[6] == 'L' && buf[7] == ' ' &&
+               buf[8] == 0xD && buf[9] == 0xA && buf[10] == 0x87 &&
+               buf[11] == 0xA) {
+      *pos += 12;
+      return JXL_SIG_CONTAINER;
+    } else {
+      return JXL_SIG_INVALID;
+    }
+  }
+
+  return JXL_SIG_INVALID;
+}
+
+}  // namespace
+
+// Returns the library version packed into one integer as
+// major * 1000000 + minor * 1000 + patch.
+uint32_t JxlDecoderVersion(void) {
+  return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 +
+         JPEGXL_PATCH_VERSION;
+}
+
+// Classifies the first len bytes of buf by running ReadSignature from
+// position 0; the resulting position is discarded.
+JxlSignature JxlSignatureCheck(const uint8_t* buf, size_t len) {
+  size_t pos = 0;
+  return ReadSignature(buf, len, &pos);
+}
+
+namespace {
+
+// Returns the number of bits in one sample of the given data type, or 0 if
+// data_type is not one of the handled enumerators.
+size_t BitsPerChannel(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_BOOLEAN:
+      return 1;
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_UINT32:
+      return 32;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    case JXL_TYPE_FLOAT16:
+      return 16;
+      // No default, give compiler error if new type not handled.
+  }
+  return 0;  // Indicate invalid data type.
+}
+
+// Overall life cycle of a decoder object.
+enum class DecoderStage : uint32_t {
+  kInited,    // Decoder created, no JxlDecoderProcessInput called yet
+  kStarted,   // Running JxlDecoderProcessInput calls
+  kFinished,  // Everything done, nothing left to process
+  kError,     // Error occurred, decoder object no longer usable
+};
+
+// Which part of the current frame has to be processed next.
+enum class FrameStage : uint32_t {
+  kHeader,      // Must parse frame header. dec->frame_start must be set up
+                // correctly already.
+  kTOC,         // Must parse TOC
+  kFull,        // Must parse full pixels
+  kFullOutput,  // Must output full pixels
+};
+
+// Manages the sections for the FrameDecoder based on input bytes received.
+struct Sections {
+  // sections_begin = position in the frame where the sections begin, after
+  // the frame header and TOC, so sections_begin = sum of frame header size and
+  // TOC size.
+  Sections(jxl::FrameDecoder* frame_dec, size_t frame_size,
+           size_t sections_begin)
+      : frame_dec_(frame_dec),
+        frame_size_(frame_size),
+        sections_begin_(sections_begin) {}
+
+  // Not copyable or movable: the object owns raw BitReader pointers that are
+  // deleted in CloseInput().
+  Sections(const Sections&) = delete;
+  Sections& operator=(const Sections&) = delete;
+  Sections(Sections&&) = delete;
+  Sections& operator=(Sections&&) = delete;
+
+  ~Sections() {
+    // Avoid memory leaks if the JXL decoder quits early and doesn't end up
+    // calling CloseInput().
+    CloseInput();
+  }
+
+  // frame_dec_ must have been Inited already, but not yet done ProcessSections.
+  // Sizes section_received to the number of sections (all zero) and returns
+  // an error if any section does not fit within frame_size_.
+  JxlDecoderStatus Init() {
+    section_received.resize(frame_dec_->NumSections(), 0);
+
+    const auto& offsets = frame_dec_->SectionOffsets();
+    const auto& sizes = frame_dec_->SectionSizes();
+
+    // Ensure none of the sums of section offset and size overflow.
+    for (size_t i = 0; i < frame_dec_->NumSections(); i++) {
+      if (OutOfBounds(sections_begin_, offsets[i], sizes[i], frame_size_)) {
+        return JXL_API_ERROR("section out of bounds");
+      }
+    }
+
+    return JXL_DEC_SUCCESS;
+  }
+
+  // Sets the input data for the frame. The frame pointer must point to the
+  // beginning of the frame, size is the amount of bytes gotten so far and
+  // should increase with next calls until the full frame is loaded.
+  // TODO(lode): allow caller to provide only later chunks of memory when
+  // earlier sections are fully processed already.
+  void SetInput(const uint8_t* frame, size_t size) {
+    const auto& offsets = frame_dec_->SectionOffsets();
+    const auto& sizes = frame_dec_->SectionSizes();
+
+    // Register every section that is now completely contained in the first
+    // `size` bytes and was not registered by an earlier call.
+    for (size_t i = 0; i < frame_dec_->NumSections(); i++) {
+      if (section_received[i]) continue;
+      if (!OutOfBounds(sections_begin_, offsets[i], sizes[i], size)) {
+        section_received[i] = 1;
+        section_info.emplace_back(jxl::FrameDecoder::SectionInfo{nullptr, i});
+        section_status.emplace_back();
+      }
+    }
+    // Reset all the bitreaders, because the address of the frame pointer may
+    // change, even if it always represents the same frame start.
+    for (size_t i = 0; i < section_info.size(); i++) {
+      size_t id = section_info[i].id;
+      // A reader left over from a previous SetInput() must have been released
+      // through CloseInput() before this is called again.
+      JXL_ASSERT(section_info[i].br == nullptr);
+      // NOTE(review): the element type after `jxl::Span` appears to be missing
+      // in this copy - confirm against the upstream file.
+      section_info[i].br = new jxl::BitReader(jxl::Span(
+          frame + sections_begin_ + offsets[id], sizes[id]));
+    }
+  }
+
+  // Closes and deletes every bit reader created by SetInput(). Returns an
+  // error if any of them read past the end of its section, success otherwise.
+  JxlDecoderStatus CloseInput() {
+    bool out_of_bounds = false;
+    for (size_t i = 0; i < section_info.size(); i++) {
+      if (!section_info[i].br) continue;
+      if (!section_info[i].br->AllReadsWithinBounds()) {
+        // Mark out of bounds section, but keep closing and deleting the next
+        // ones as well.
+        out_of_bounds = true;
+      }
+      JXL_ASSERT(section_info[i].br->Close());
+      delete section_info[i].br;
+      section_info[i].br = nullptr;
+    }
+    if (out_of_bounds) {
+      // If any bit reader indicates out of bounds, it's an error, not just
+      // needing more input, since we ensure only bit readers containing
+      // a complete section are provided to the FrameDecoder.
+      return JXL_API_ERROR("frame out of bounds");
+    }
+    return JXL_DEC_SUCCESS;
+  }
+
+  // Not managed by us.
+  jxl::FrameDecoder* frame_dec_;
+
+  // Total size of the frame in bytes, used as the bound in Init().
+  size_t frame_size_;
+  // Offset in the frame at which the sections begin.
+  size_t sections_begin_;
+
+  // NOTE(review): the element types of the three vectors below appear to be
+  // missing in this copy - confirm against the upstream file.
+  // section_info and section_status get one entry per section registered by
+  // SetInput(); section_received has one flag per section of the frame.
+  std::vector section_info;
+  std::vector section_status;
+  std::vector section_received;
+};
+
+/*
+Given list of frame references to storage slots, and storage slots in which this
+frame is saved, computes which frames are required to decode the frame at the
+given index and any frames after it.
The frames on which this depends are
+returned as a vector of their indices, in no particular order. The given index
+must be smaller than saved_as.size(), and references.size() must equal
+saved_as.size(). Any frames beyond saved_as and references are considered
+unknown future frames and must be treated as if something depends on them.
+Both preconditions are enforced with JXL_ASSERT. The frame at index itself is
+not part of the returned vector.
+NOTE(review): the template argument lists in the signature and on the local
+containers below appear to be missing in this copy - confirm against the
+upstream file.
+*/
+std::vector GetFrameDependencies(size_t index,
+                                 const std::vector& saved_as,
+                                 const std::vector& references) {
+  JXL_ASSERT(references.size() == saved_as.size());
+  JXL_ASSERT(index < references.size());
+
+  std::vector result;
+
+  constexpr size_t kNumStorage = 8;
+
+  // value which indicates nothing is stored in this storage slot
+  const size_t invalid = references.size();
+  // for each of the 8 storage slots, a vector that translates frame index to
+  // frame stored in this storage slot at this point, that is, the last
+  // frame that was stored in this slot before or at this index.
+  std::array, kNumStorage> storage;
+  for (size_t s = 0; s < kNumStorage; ++s) {
+    storage[s].resize(saved_as.size());
+    int mask = 1 << s;
+    size_t id = invalid;
+    for (size_t i = 0; i < saved_as.size(); ++i) {
+      if (saved_as[i] & mask) {
+        id = i;
+      }
+      storage[s][i] = id;
+    }
+  }
+
+  // Visited flags. index + 1 entries are enough: every valid id looked up
+  // below comes from storage[s][i] with i <= index, which is never above i.
+  std::vector seen(index + 1, 0);
+  std::vector stack;
+  stack.push_back(index);
+  seen[index] = 1;
+
+  // For frames after index, assume they can depend on any of the 8 storage
+  // slots, so push the frame for each stored reference to the stack and result.
+  // All frames after index are treated as having unknown references and with
+  // the possibility that there are more frames after the last known.
+  // TODO(lode): take values of saved_as and references after index, and a
+  // input flag indicating if they are all frames of the image, to further
+  // optimize this.
+  for (size_t s = 0; s < kNumStorage; ++s) {
+    size_t frame_ref = storage[s][index];
+    if (frame_ref == invalid) continue;
+    if (seen[frame_ref]) continue;
+    stack.push_back(frame_ref);
+    seen[frame_ref] = 1;
+    result.push_back(frame_ref);
+  }
+
+  // Depth-first walk: for each frame on the stack, follow the slots it
+  // references to the frames that occupied those slots just before it.
+  while (!stack.empty()) {
+    size_t frame_index = stack.back();
+    stack.pop_back();
+    if (frame_index == 0) continue;  // first frame cannot have references
+    for (size_t s = 0; s < kNumStorage; ++s) {
+      int mask = 1 << s;
+      if (!(references[frame_index] & mask)) continue;
+      size_t frame_ref = storage[s][frame_index - 1];
+      if (frame_ref == invalid) continue;
+      if (seen[frame_ref]) continue;
+      stack.push_back(frame_ref);
+      seen[frame_ref] = 1;
+      result.push_back(frame_ref);
+    }
+  }
+
+  return result;
+}
+
+}  // namespace
+
+// Complete state of one decoder instance. Most fields have no default member
+// initializer; JxlDecoderReset assigns them.
+// NOTE(review): presumably this is the type the public JxlDecoder name refers
+// to - the typedef is not visible here, confirm. The template arguments of the
+// std::unique_ptr / std::vector members also appear to be missing in this
+// copy; confirm against the upstream file.
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct JxlDecoderStruct {
+  JxlDecoderStruct() = default;
+
+  // Set by JxlDecoderCreate, used by JxlDecoderDestroy to free this object.
+  JxlMemoryManager memory_manager;
+  // Created by JxlDecoderSetParallelRunner, or lazily with a default runner.
+  std::unique_ptr thread_pool;
+
+  // Life cycle stage; option setters require DecoderStage::kInited.
+  DecoderStage stage;
+
+  // Status of progression, internal.
+  bool got_signature;
+  bool first_codestream_seen;
+  // Indicates we know that we've seen the last codestream, however this is not
+  // guaranteed to be true for the last box because a jxl file may have multiple
+  // "jxlp" boxes and it is possible (and permitted) that the last one is not a
+  // final box that uses size 0 to indicate the end.
+  bool last_codestream_seen;
+  bool got_basic_info;
+  size_t header_except_icc_bits = 0;  // To skip everything before ICC.
+  bool got_all_headers;               // Codestream metadata headers.
+  bool post_headers;                  // Already decoding pixels.
+  jxl::ICCReader icc_reader;
+
+  // This means either we actually got the preview image, or determined we
+  // cannot get it or there is none.
+  bool got_preview_image;
+
+  // Position of next_in in the original file including box format if present
+  // (as opposed to position in the codestream)
+  size_t file_pos;
+  size_t box_begin;
+  size_t box_end;
+  bool skip_box;
+  // Begin and end of the content of the current codestream box. This could be
+  // a partial codestream box.
+  // codestream_begin 0 is used to indicate the begin is not yet known.
+  // codestream_end 0 is used to indicate uncapped (until end of file, for the
+  // last box if this box doesn't indicate its actual size).
+  // Not used if the file is a direct codestream.
+  size_t codestream_begin;
+  size_t codestream_end;
+
+  // Settings
+  bool keep_orientation;
+
+  // Bitfield, for which informative events (JXL_DEC_BASIC_INFO, etc...) the
+  // decoder returns a status. By default, do not return for any of the events,
+  // only return when the decoder cannot continue because it needs more input or
+  // output data.
+  // orig_events_wanted keeps the value that was subscribed, so that
+  // JxlDecoderRewind can restore events_wanted after bits were cleared.
+  int events_wanted;
+  int orig_events_wanted;
+
+  // Fields for reading the basic info from the header.
+  size_t basic_info_size_hint;
+  bool have_container;
+
+  // Whether the preview out buffer was set. It is possible for the buffer to
+  // be nullptr and buffer_set to be true, indicating it was deliberately
+  // set to nullptr.
+  bool preview_out_buffer_set;
+  // Idem for the image buffer.
+  bool image_out_buffer_set;
+
+  // Owned by the caller, buffers for DC image and full resolution images
+  void* preview_out_buffer;
+  void* image_out_buffer;
+  JxlImageOutCallback image_out_callback;
+  void* image_out_opaque;
+
+  size_t preview_out_size;
+  size_t image_out_size;
+
+  // TODO(lode): merge these?
+  JxlPixelFormat preview_out_format;
+  JxlPixelFormat image_out_format;
+
+  jxl::CodecMetadata metadata;
+  std::unique_ptr ib;
+  // ColorEncoding to use for xyb encoded image with ICC profile.
+  jxl::ColorEncoding default_enc;
+
+  std::unique_ptr passes_state;
+  std::unique_ptr frame_dec;
+  std::unique_ptr sections;
+  // The FrameDecoder is initialized, and not yet finalized
+  bool frame_dec_in_progress;
+
+  // headers and TOC for the current frame. When got_toc is true, this is
+  // always the frame header of the last frame of the current still series,
+  // that is, the displayed frame.
+  std::unique_ptr frame_header;
+
+  // Start of the current frame being processed, as offset from the beginning of
+  // the codestream.
+  size_t frame_start;
+  size_t frame_size;
+  FrameStage frame_stage;
+  // The currently processed frame is the last of the current composite still,
+  // and so must be returned as pixels
+  bool is_last_of_still;
+  // The currently processed frame is the last of the codestream
+  bool is_last_total;
+  // How many frames to skip.
+  size_t skip_frames;
+  // Skipping the current frame. May be false if skip_frames was just set to
+  // a positive value while already processing a current frame, then
+  // skipping_frame will be enabled only for the next frame.
+  bool skipping_frame;
+
+  // Amount of internal frames and external frames started. External frames are
+  // user-visible frames, internal frames includes all external frames and
+  // also invisible frames such as patches, blending-only and dc_level frames.
+  size_t internal_frames;
+  size_t external_frames;
+
+  // For each internal frame, which storage locations it references, and which
+  // storage locations it is stored in, using the bit mask as defined in
+  // FrameDecoder::References and FrameDecoder::SaveAs.
+  std::vector frame_references;
+  std::vector frame_saved_as;
+
+  // Translates external frame index to internal frame index. The external
+  // index is the index of user-visible frames. The internal index can be larger
+  // since non-visible frames (such as frames with patches, ...) are included.
+  std::vector frame_external_to_internal;
+
+  // Whether the frame with internal index is required to decode the frame
+  // being skipped to or any frames after that. If no skipping is active,
+  // this vector is ignored. If the current internal frame index is beyond this
+  // vector, it must be treated as a required frame.
+  std::vector frame_required;
+
+  // Codestream input data is stored here, when the decoder takes in and stores
+  // the user input bytes. If the decoder does not do that (e.g. in one-shot
+  // case), this field is unused.
+  // TODO(lode): avoid needing this field once the C++ decoder doesn't need
+  // all bytes at once, to save memory. Find alternative to std::vector doubling
+  // strategy to prevent some memory usage.
+  std::vector codestream;
+
+  jxl::JxlToJpegDecoder jpeg_decoder;
+
+  // Position in the actual codestream, which codestream.begin() points to.
+  // Non-zero once earlier parts of the codestream vector have been erased.
+  size_t codestream_pos;
+
+  // Statistics which CodecInOut can keep
+  uint64_t dec_pixels;
+
+  // NOTE(review): presumably the not yet consumed user input and its size in
+  // bytes - they are only reset in the visible code, confirm against the
+  // functions that set the input.
+  const uint8_t* next_in;
+  size_t avail_in;
+};
+
+// TODO(zond): Make this depend on the data loaded into the decoder.
+JxlDecoderStatus JxlDecoderDefaultPixelFormat(const JxlDecoder* dec, + JxlPixelFormat* format) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + *format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0}; + return JXL_DEC_SUCCESS; +} + +void JxlDecoderReset(JxlDecoder* dec) { + dec->thread_pool.reset(); + dec->stage = DecoderStage::kInited; + dec->got_signature = false; + dec->first_codestream_seen = false; + dec->last_codestream_seen = false; + dec->got_basic_info = false; + dec->header_except_icc_bits = 0; + dec->got_all_headers = false; + dec->post_headers = false; + dec->icc_reader.Reset(); + dec->got_preview_image = false; + dec->file_pos = 0; + dec->box_begin = 0; + dec->box_end = 0; + dec->skip_box = false; + dec->codestream_pos = 0; + dec->codestream_begin = 0; + dec->codestream_end = 0; + dec->keep_orientation = false; + dec->events_wanted = 0; + dec->orig_events_wanted = 0; + dec->basic_info_size_hint = InitialBasicInfoSizeHint(); + dec->have_container = 0; + dec->preview_out_buffer_set = false; + dec->image_out_buffer_set = false; + dec->preview_out_buffer = nullptr; + dec->image_out_buffer = nullptr; + dec->image_out_callback = nullptr; + dec->image_out_opaque = nullptr; + dec->preview_out_size = 0; + dec->image_out_size = 0; + dec->dec_pixels = 0; + dec->next_in = 0; + dec->avail_in = 0; + + dec->passes_state.reset(nullptr); + dec->frame_dec.reset(nullptr); + dec->sections.reset(nullptr); + dec->frame_dec_in_progress = false; + + dec->ib.reset(); + dec->metadata = jxl::CodecMetadata(); + dec->frame_header.reset(new jxl::FrameHeader(&dec->metadata)); + dec->codestream.clear(); + + dec->frame_stage = FrameStage::kHeader; + dec->frame_start = 0; + dec->frame_size = 0; + dec->is_last_of_still = false; + dec->is_last_total = false; + dec->skip_frames = 0; + dec->skipping_frame = false; + dec->internal_frames = 0; + dec->external_frames = 0; + dec->frame_references.clear(); + dec->frame_saved_as.clear(); + dec->frame_external_to_internal.clear(); + 
dec->frame_required.clear(); +} + +JxlDecoder* JxlDecoderCreate(const JxlMemoryManager* memory_manager) { + JxlMemoryManager local_memory_manager; + if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) + return nullptr; + + void* alloc = + jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlDecoder)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + JxlDecoder* dec = new (alloc) JxlDecoder(); + dec->memory_manager = local_memory_manager; + + JxlDecoderReset(dec); + + return dec; +} + +void JxlDecoderDestroy(JxlDecoder* dec) { + if (dec) { + // Call destructor directly since custom free function is used. + dec->~JxlDecoder(); + jxl::MemoryManagerFree(&dec->memory_manager, dec); + } +} + +void JxlDecoderRewind(JxlDecoder* dec) { + int keep_orientation = dec->keep_orientation; + int events_wanted = dec->orig_events_wanted; + std::vector frame_references; + std::vector frame_saved_as; + std::vector frame_external_to_internal; + std::vector frame_required; + frame_references.swap(dec->frame_references); + frame_saved_as.swap(dec->frame_saved_as); + frame_external_to_internal.swap(dec->frame_external_to_internal); + frame_required.swap(dec->frame_required); + + JxlDecoderReset(dec); + dec->keep_orientation = keep_orientation; + dec->events_wanted = events_wanted; + dec->orig_events_wanted = events_wanted; + frame_references.swap(dec->frame_references); + frame_saved_as.swap(dec->frame_saved_as); + frame_external_to_internal.swap(dec->frame_external_to_internal); + frame_required.swap(dec->frame_required); +} + +void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount) { + // Increment amount, rather than set it: making the amount smaller is + // impossible because the decoder may already have skipped frames required to + // decode earlier frames, and making the amount larger compared to an existing + // amount is impossible because if JxlDecoderSkipFrames is called in the + // middle of already skipping frames, the 
user cannot know how many frames + // have already been skipped internally so far so an absolute value cannot + // be defined. + dec->skip_frames += amount; + + dec->frame_required.clear(); + size_t next_frame = dec->external_frames + dec->skip_frames; + + // A frame that has been seen before a rewind + if (next_frame < dec->frame_external_to_internal.size()) { + size_t internal_index = dec->frame_external_to_internal[next_frame]; + if (internal_index < dec->frame_saved_as.size()) { + std::vector deps = GetFrameDependencies( + internal_index, dec->frame_saved_as, dec->frame_references); + + dec->frame_required.resize(internal_index + 1, 0); + for (size_t i = 0; i < deps.size(); i++) { + JXL_ASSERT(deps[i] < dec->frame_required.size()); + dec->frame_required[deps[i]] = 1; + } + } + } +} + +JXL_EXPORT JxlDecoderStatus +JxlDecoderSetParallelRunner(JxlDecoder* dec, JxlParallelRunner parallel_runner, + void* parallel_runner_opaque) { + if (dec->thread_pool) return JXL_API_ERROR("parallel runner already set"); + dec->thread_pool.reset( + new jxl::ThreadPool(parallel_runner, parallel_runner_opaque)); + return JXL_DEC_SUCCESS; +} + +size_t JxlDecoderSizeHintBasicInfo(const JxlDecoder* dec) { + if (dec->got_basic_info) return 0; + return dec->basic_info_size_hint; +} + +JxlDecoderStatus JxlDecoderSubscribeEvents(JxlDecoder* dec, int events_wanted) { + if (dec->stage != DecoderStage::kInited) { + return JXL_DEC_ERROR; // Cannot subscribe to events after having started. + } + if (events_wanted & 63) { + return JXL_DEC_ERROR; // Can only subscribe to informative events. 
+ } + dec->events_wanted = events_wanted; + dec->orig_events_wanted = events_wanted; + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetKeepOrientation(JxlDecoder* dec, + JXL_BOOL keep_orientation) { + if (dec->stage != DecoderStage::kInited) { + return JXL_API_ERROR("Must set keep_orientation option before starting"); + } + dec->keep_orientation = !!keep_orientation; + return JXL_DEC_SUCCESS; +} + +namespace jxl { +namespace { + +template +bool CanRead(Span data, BitReader* reader, T* JXL_RESTRICT t) { + // Use a copy of the bit reader because CanRead advances bits. + BitReader reader2(data); + reader2.SkipBits(reader->TotalBitsConsumed()); + bool result = Bundle::CanRead(&reader2, t); + JXL_ASSERT(reader2.Close()); + return result; +} + +// Returns JXL_DEC_SUCCESS if the full bundle was successfully read, status +// indicating either error or need more input otherwise. +template +JxlDecoderStatus ReadBundle(Span data, BitReader* reader, + T* JXL_RESTRICT t) { + if (!CanRead(data, reader, t)) { + return JXL_DEC_NEED_MORE_INPUT; + } + if (!Bundle::Read(reader, t)) { + return JXL_DEC_ERROR; + } + return JXL_DEC_SUCCESS; +} + +#define JXL_API_RETURN_IF_ERROR(expr) \ + { \ + JxlDecoderStatus status_ = ConvertStatus(expr); \ + if (status_ != JXL_DEC_SUCCESS) return status_; \ + } + +std::unique_ptr> GetBitReader( + Span span) { + BitReader* reader = new BitReader(span); + return std::unique_ptr>( + reader, [](BitReader* reader) { + // We can't allow Close to abort the program if the reader is out of + // bounds, or all return paths in the code, even those that already + // return failure, would have to manually call AllReadsWithinBounds(). + // Invalid JXL codestream should not cause program to quit. 
+ (void)reader->AllReadsWithinBounds(); + (void)reader->Close(); + delete reader; + }); +} + +JxlDecoderStatus JxlDecoderReadBasicInfo(JxlDecoder* dec, const uint8_t* in, + size_t size) { + size_t pos = 0; + + // Check and skip the codestream signature + JxlSignature signature = ReadSignature(in, size, &pos); + if (signature == JXL_SIG_NOT_ENOUGH_BYTES) { + return JXL_DEC_NEED_MORE_INPUT; + } + if (signature == JXL_SIG_CONTAINER) { + // There is a container signature where we expect a codestream, container + // is handled at a higher level already. + return JXL_API_ERROR("invalid: nested container"); + } + if (signature != JXL_SIG_CODESTREAM) { + return JXL_API_ERROR("invalid signature"); + } + + Span span(in + pos, size - pos); + auto reader = GetBitReader(span); + JXL_API_RETURN_IF_ERROR(ReadBundle(span, reader.get(), &dec->metadata.size)); + + dec->metadata.m.nonserialized_only_parse_basic_info = true; + JXL_API_RETURN_IF_ERROR(ReadBundle(span, reader.get(), &dec->metadata.m)); + dec->metadata.m.nonserialized_only_parse_basic_info = false; + dec->got_basic_info = true; + dec->basic_info_size_hint = 0; + + if (!CheckSizeLimit(dec->metadata.size.xsize(), dec->metadata.size.ysize())) { + return JXL_API_ERROR("image is too large"); + } + + return JXL_DEC_SUCCESS; +} + +// Reads all codestream headers (but not frame headers) +JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec, const uint8_t* in, + size_t size) { + size_t pos = 0; + + // Check and skip the codestream signature + JxlSignature signature = ReadSignature(in, size, &pos); + if (signature == JXL_SIG_CONTAINER) { + return JXL_API_ERROR("invalid: nested container"); + } + if (signature != JXL_SIG_CODESTREAM) { + return JXL_API_ERROR("invalid signature"); + } + + Span span(in + pos, size - pos); + auto reader = GetBitReader(span); + + if (dec->header_except_icc_bits != 0) { + // Headers were decoded already. 
+ reader->SkipBits(dec->header_except_icc_bits); + } else { + SizeHeader dummy_size_header; + JXL_API_RETURN_IF_ERROR(ReadBundle(span, reader.get(), &dummy_size_header)); + + // We already decoded the metadata to dec->metadata.m, no reason to + // overwrite it, use a dummy metadata instead. + ImageMetadata dummy_metadata; + JXL_API_RETURN_IF_ERROR(ReadBundle(span, reader.get(), &dummy_metadata)); + + JXL_API_RETURN_IF_ERROR( + ReadBundle(span, reader.get(), &dec->metadata.transform_data)); + } + + dec->header_except_icc_bits = reader->TotalBitsConsumed(); + + if (dec->metadata.m.color_encoding.WantICC()) { + jxl::Status status = dec->icc_reader.Init(reader.get(), memory_limit_base_); + // Always check AllReadsWithinBounds, not all the C++ decoder implementation + // handles reader out of bounds correctly yet (e.g. context map). Not + // checking AllReadsWithinBounds can cause reader->Close() to trigger an + // assert, but we don't want library to quit program for invalid codestream. + if (!reader->AllReadsWithinBounds()) { + return JXL_DEC_NEED_MORE_INPUT; + } + if (!status) { + if (status.code() == StatusCode::kNotEnoughBytes) { + return JXL_DEC_NEED_MORE_INPUT; + } + // Other non-successful status is an error + return JXL_DEC_ERROR; + } + PaddedBytes icc; + status = dec->icc_reader.Process(reader.get(), &icc); + if (!status) { + if (status.code() == StatusCode::kNotEnoughBytes) { + return JXL_DEC_NEED_MORE_INPUT; + } + // Other non-successful status is an error + return JXL_DEC_ERROR; + } + if (!dec->metadata.m.color_encoding.SetICCRaw(std::move(icc))) { + return JXL_DEC_ERROR; + } + } + + dec->got_all_headers = true; + JXL_API_RETURN_IF_ERROR(reader->JumpToByteBoundary()); + + dec->frame_start = pos + reader->TotalBitsConsumed() / jxl::kBitsPerByte; + + if (!dec->passes_state) { + dec->passes_state.reset(new jxl::PassesDecoderState()); + } + + dec->default_enc = + ColorEncoding::LinearSRGB(dec->metadata.m.color_encoding.IsGray()); + + 
JXL_API_RETURN_IF_ERROR(dec->passes_state->output_encoding_info.Set( + dec->metadata, dec->default_enc)); + + return JXL_DEC_SUCCESS; +} + +static size_t GetStride(const JxlDecoder* dec, const JxlPixelFormat& format, + const jxl::ImageBundle* frame = nullptr) { + size_t xsize = dec->metadata.xsize(); + if (!dec->keep_orientation && dec->metadata.m.orientation > 4) { + xsize = dec->metadata.ysize(); + } + if (frame) { + xsize = dec->keep_orientation ? frame->xsize() : frame->oriented_xsize(); + } + size_t stride = xsize * (BitsPerChannel(format.data_type) * + format.num_channels / jxl::kBitsPerByte); + if (format.align > 1) { + stride = jxl::DivCeil(stride, format.align) * format.align; + } + return stride; +} + +static JxlDecoderStatus ConvertImageInternal(const JxlDecoder* dec, + const jxl::ImageBundle& frame, + const JxlPixelFormat& format, + void* out_image, size_t out_size, + JxlImageOutCallback out_callback, + void* out_opaque) { + // TODO(lode): handle mismatch of RGB/grayscale color profiles and pixel data + // color/grayscale format + const auto& metadata = dec->metadata.m; + + const size_t stride = GetStride(dec, format, &frame); + + bool float_format = format.data_type == JXL_TYPE_FLOAT || + format.data_type == JXL_TYPE_FLOAT16; + + jxl::Orientation undo_orientation = dec->keep_orientation + ? jxl::Orientation::kIdentity + : metadata.GetOrientation(); + JXL_DASSERT(!dec->frame_dec || !dec->frame_dec->HasRGBBuffer()); + jxl::Status status = jxl::ConvertToExternal( + frame, BitsPerChannel(format.data_type), float_format, + format.num_channels, format.endianness, stride, dec->thread_pool.get(), + out_image, out_size, /*out_callback=*/out_callback, + /*out_opaque=*/out_opaque, undo_orientation); + + return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR; +} + +// Parses the FrameHeader and the total frame_size, given the initial bytes +// of the frame up to and including the TOC. 
+// TODO(lode): merge this with FrameDecoder +JxlDecoderStatus ParseFrameHeader(jxl::FrameHeader* frame_header, + const uint8_t* in, size_t size, size_t pos, + bool is_preview, size_t* frame_size, + int* saved_as) { + if (pos >= size) { + return JXL_DEC_NEED_MORE_INPUT; + } + Span span(in + pos, size - pos); + auto reader = GetBitReader(span); + + frame_header->nonserialized_is_preview = is_preview; + jxl::Status status = DecodeFrameHeader(reader.get(), frame_header); + jxl::FrameDimensions frame_dim = frame_header->ToFrameDimensions(); + if (!CheckSizeLimit(frame_dim.xsize_upsampled_padded, + frame_dim.ysize_upsampled_padded)) { + return JXL_API_ERROR("frame is too large"); + } + + if (status.code() == StatusCode::kNotEnoughBytes) { + // TODO(lode): prevent asking for way too much input bytes in case of + // invalid header that the decoder thinks is a very long user extension + // instead. Example: fields can currently print something like this: + // "../lib/jxl/fields.cc:416: Skipping 71467322-bit extension(s)" + // Maybe fields.cc should return error in the above case rather than + // print a message. + return JXL_DEC_NEED_MORE_INPUT; + } else if (!status) { + return JXL_API_ERROR("invalid frame header"); + } + + // Read TOC. + uint64_t groups_total_size; + const bool has_ac_global = true; + const size_t toc_entries = + NumTocEntries(frame_dim.num_groups, frame_dim.num_dc_groups, + frame_header->passes.num_passes, has_ac_global); + + std::vector group_offsets; + std::vector group_sizes; + status = ReadGroupOffsets(toc_entries, reader.get(), &group_offsets, + &group_sizes, &groups_total_size); + + // TODO(lode): we're actually relying on AllReadsWithinBounds() here + // instead of on status.code(), change the internal TOC C++ code to + // correctly set the status.code() instead so we can rely on that one. 
+ if (!reader->AllReadsWithinBounds() || + status.code() == StatusCode::kNotEnoughBytes) { + return JXL_DEC_NEED_MORE_INPUT; + } else if (!status) { + return JXL_API_ERROR("invalid toc entries"); + } + + JXL_DASSERT((reader->TotalBitsConsumed() % kBitsPerByte) == 0); + JXL_API_RETURN_IF_ERROR(reader->JumpToByteBoundary()); + size_t header_size = (reader->TotalBitsConsumed() >> 3); + *frame_size = header_size + groups_total_size; + + if (saved_as != nullptr) { + *saved_as = FrameDecoder::SavedAs(*frame_header); + } + + return JXL_DEC_SUCCESS; +} + +// TODO(eustas): no CodecInOut -> no image size reinforcement -> possible OOM. +JxlDecoderStatus JxlDecoderProcessInternal(JxlDecoder* dec, const uint8_t* in, + size_t size) { + // If no parallel runner is set, use the default + // TODO(lode): move this initialization to an appropriate location once the + // runner is used to decode pixels. + if (!dec->thread_pool) { + dec->thread_pool.reset(new jxl::ThreadPool(nullptr, nullptr)); + } + + // No matter what events are wanted, the basic info is always required. + if (!dec->got_basic_info) { + JxlDecoderStatus status = JxlDecoderReadBasicInfo(dec, in, size); + if (status != JXL_DEC_SUCCESS) return status; + } + + if (dec->events_wanted & JXL_DEC_BASIC_INFO) { + dec->events_wanted &= ~JXL_DEC_BASIC_INFO; + return JXL_DEC_BASIC_INFO; + } + + if (!dec->got_all_headers) { + JxlDecoderStatus status = JxlDecoderReadAllHeaders(dec, in, size); + if (status != JXL_DEC_SUCCESS) return status; + } + + if (dec->events_wanted & JXL_DEC_EXTENSIONS) { + dec->events_wanted &= ~JXL_DEC_EXTENSIONS; + if (dec->metadata.m.extensions != 0) { + return JXL_DEC_EXTENSIONS; + } + } + + if (dec->events_wanted & JXL_DEC_COLOR_ENCODING) { + dec->events_wanted &= ~JXL_DEC_COLOR_ENCODING; + return JXL_DEC_COLOR_ENCODING; + } + + dec->post_headers = true; + + // Decode to pixels, only if required for the events the user wants. 
+ if (!dec->got_preview_image) { + // Parse the preview, or at least its TOC to be able to skip the frame, if + // any frame or image decoding is desired. + bool parse_preview = + (dec->events_wanted & + (JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE)); + + if (!dec->metadata.m.have_preview) { + // There is no preview, mark this as done and go to next step + dec->got_preview_image = true; + } else if (!parse_preview) { + // No preview parsing needed, mark this step as done + dec->got_preview_image = true; + } else { + // Want to decode the preview, not just skip the frame + bool want_preview = (dec->events_wanted & JXL_DEC_PREVIEW_IMAGE); + size_t frame_size; + size_t pos = dec->frame_start; + dec->frame_header.reset(new FrameHeader(&dec->metadata)); + JxlDecoderStatus status = ParseFrameHeader(dec->frame_header.get(), in, + size, pos, true, &frame_size, + /*saved_as=*/nullptr); + if (status != JXL_DEC_SUCCESS) return status; + if (OutOfBounds(pos, frame_size, size)) { + return JXL_DEC_NEED_MORE_INPUT; + } + + if (want_preview && !dec->preview_out_buffer_set) { + return JXL_DEC_NEED_PREVIEW_OUT_BUFFER; + } + + jxl::Span compressed(in + dec->frame_start, + size - dec->frame_start); + auto reader = GetBitReader(compressed); + jxl::DecompressParams dparams; + dparams.preview = want_preview ? jxl::Override::kOn : jxl::Override::kOff; + jxl::ImageBundle ib(&dec->metadata.m); + PassesDecoderState preview_dec_state; + JXL_API_RETURN_IF_ERROR(preview_dec_state.output_encoding_info.Set( + dec->metadata, + ColorEncoding::LinearSRGB(dec->metadata.m.color_encoding.IsGray()))); + if (!DecodeFrame(dparams, &preview_dec_state, dec->thread_pool.get(), + reader.get(), &ib, dec->metadata, + /*constraints=*/nullptr, + /*is_preview=*/true)) { + return JXL_API_ERROR("decoding preview failed"); + } + + // Set frame_start to the first non-preview frame. 
+ dec->frame_start += DivCeil(reader->TotalBitsConsumed(), kBitsPerByte); + dec->got_preview_image = true; + + if (want_preview) { + if (dec->preview_out_buffer) { + JxlDecoderStatus status = ConvertImageInternal( + dec, ib, dec->preview_out_format, dec->preview_out_buffer, + dec->preview_out_size, /*out_callback=*/nullptr, + /*out_opaque=*/nullptr); + if (status != JXL_DEC_SUCCESS) return status; + } + return JXL_DEC_PREVIEW_IMAGE; + } + } + } + + // Handle frames + for (;;) { + if (!(dec->events_wanted & (JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME))) { + break; + } + if (dec->frame_stage == FrameStage::kHeader && dec->is_last_total) { + break; + } + + if (dec->frame_stage == FrameStage::kHeader) { + size_t pos = dec->frame_start - dec->codestream_pos; + if (pos >= size) { + return JXL_DEC_NEED_MORE_INPUT; + } + dec->frame_header.reset(new FrameHeader(&dec->metadata)); + int saved_as = 0; + JxlDecoderStatus status = + ParseFrameHeader(dec->frame_header.get(), in, size, pos, + /*is_preview=*/false, &dec->frame_size, &saved_as); + if (status != JXL_DEC_SUCCESS) return status; + + // is last in entire codestream + dec->is_last_total = dec->frame_header->is_last; + // is last of current still + dec->is_last_of_still = + dec->is_last_total || dec->frame_header->animation_frame.duration > 0; + + const size_t internal_frame_index = dec->internal_frames; + const size_t external_frame_index = dec->external_frames; + if (dec->is_last_of_still) dec->external_frames++; + dec->internal_frames++; + + dec->frame_stage = FrameStage::kTOC; + + if (dec->skip_frames > 0) { + dec->skipping_frame = true; + if (dec->is_last_of_still) { + dec->skip_frames--; + } + } else { + dec->skipping_frame = false; + } + + if (external_frame_index >= dec->frame_external_to_internal.size()) { + dec->frame_external_to_internal.push_back(internal_frame_index); + JXL_ASSERT(dec->frame_external_to_internal.size() == + external_frame_index + 1); + } + + if (internal_frame_index >= dec->frame_saved_as.size()) { + 
dec->frame_saved_as.push_back(saved_as); + JXL_ASSERT(dec->frame_saved_as.size() == internal_frame_index + 1); + + // add the value 0xff (which means all references) to new slots: we only + // know the references of the frame at FinalizeFrame, and fill in the + // correct values there. As long as this information is not known, the + // worst case where the frame depends on all storage slots is assumed. + dec->frame_references.push_back(0xff); + JXL_ASSERT(dec->frame_references.size() == internal_frame_index + 1); + } + + if (dec->skipping_frame) { + // Whether this frame could be referenced by any future frame: either + // because it's a frame saved for blending or patches, or because it's + // a DC frame. + bool referenceable = + dec->frame_header->CanBeReferenced() || + dec->frame_header->frame_type == FrameType::kDCFrame; + if (internal_frame_index < dec->frame_required.size() && + !dec->frame_required[internal_frame_index]) { + referenceable = false; + } + if (!referenceable) { + // Skip all decoding for this frame, since the user is skipping this + // frame and no future frames can reference it. + dec->frame_stage = FrameStage::kHeader; + dec->frame_start += dec->frame_size; + continue; + } + } + + if ((dec->events_wanted & JXL_DEC_FRAME) && dec->is_last_of_still) { + // Only return this for the last of a series of stills: patches frames + // etc... before this one do not contain the correct information such + // as animation timing, ... 
+ if (!dec->skipping_frame) { + return JXL_DEC_FRAME; + } + } + } + + if (dec->frame_stage == FrameStage::kTOC) { + size_t pos = dec->frame_start - dec->codestream_pos; + if (pos >= size) { + return JXL_DEC_NEED_MORE_INPUT; + } + Span span(in + pos, size - pos); + auto reader = GetBitReader(span); + + if (!dec->passes_state) { + dec->passes_state.reset(new jxl::PassesDecoderState()); + } + if (!dec->ib) { + dec->ib.reset(new jxl::ImageBundle(&dec->metadata.m)); + } + + dec->frame_dec.reset(new FrameDecoder( + dec->passes_state.get(), dec->metadata, dec->thread_pool.get())); + + // If JPEG reconstruction is wanted and possible, set the jpeg_data of + // the ImageBundle. + if (!dec->jpeg_decoder.SetImageBundleJpegData(dec->ib.get())) + return JXL_DEC_ERROR; + + jxl::Status status = dec->frame_dec->InitFrame( + reader.get(), dec->ib.get(), /*is_preview=*/false, + /*allow_partial_frames=*/false, /*allow_partial_dc_global=*/false); + if (!status) JXL_API_RETURN_IF_ERROR(status); + + size_t sections_begin = + DivCeil(reader->TotalBitsConsumed(), kBitsPerByte); + + dec->sections.reset( + new Sections(dec->frame_dec.get(), dec->frame_size, sections_begin)); + JXL_API_RETURN_IF_ERROR(dec->sections->Init()); + + // If we don't need pixels, we can skip actually decoding the frames + // (kFull / kFullOut). By not updating frame_stage, none of + // these stages will execute, and the loop will continue from the next + // frame. 
+ if (dec->events_wanted & JXL_DEC_FULL_IMAGE) { + dec->frame_dec_in_progress = true; + dec->frame_stage = FrameStage::kFull; + } + } + + bool return_full_image = false; + + if (dec->frame_stage == FrameStage::kFull) { + if (dec->events_wanted & JXL_DEC_FULL_IMAGE) { + if (!dec->image_out_buffer_set && (!dec->jpeg_decoder.IsOutputSet() || + dec->ib->jpeg_data == nullptr) && + dec->is_last_of_still) { + // TODO(lode): remove the dec->is_last_of_still condition if the + // frame decoder needs the image buffer as working space for decoding + // non-visible or blending frames too + if (!dec->skipping_frame) { + return JXL_DEC_NEED_IMAGE_OUT_BUFFER; + } + } + } + + if (dec->image_out_buffer_set && !!dec->image_out_buffer && + dec->image_out_format.data_type == JXL_TYPE_UINT8 && + dec->image_out_format.num_channels >= 3) { + bool is_rgba = dec->image_out_format.num_channels == 4; + dec->frame_dec->MaybeSetRGB8OutputBuffer( + reinterpret_cast(dec->image_out_buffer), + GetStride(dec, dec->image_out_format), is_rgba, + !dec->keep_orientation); + } + + const bool little_endian = + dec->image_out_format.endianness == JXL_LITTLE_ENDIAN || + (dec->image_out_format.endianness == JXL_NATIVE_ENDIAN && + IsLittleEndian()); + bool swap_endianness = little_endian != IsLittleEndian(); + + // TODO(lode): Support more formats than just native endian float32 for + // the low-memory callback path + if (dec->image_out_buffer_set && !!dec->image_out_callback && + dec->image_out_format.data_type == JXL_TYPE_FLOAT && + dec->image_out_format.num_channels >= 3 && !swap_endianness && + dec->frame_dec_in_progress) { + bool is_rgba = dec->image_out_format.num_channels == 4; + dec->frame_dec->MaybeSetFloatCallback( + [dec](const float* pixels, size_t x, size_t y, size_t num_pixels) { + dec->image_out_callback(dec->image_out_opaque, x, y, num_pixels, + pixels); + }, + is_rgba, !dec->keep_orientation); + } + + size_t pos = dec->frame_start - dec->codestream_pos; + if (pos >= size) { + return 
JXL_DEC_NEED_MORE_INPUT; + } + dec->sections->SetInput(in + pos, size - pos); + + if (cpu_limit_base_ != 0) { + FrameDimensions frame_dim = dec->frame_header->ToFrameDimensions(); + // No overflow, checked in ParseHeader. + size_t num_pixels = frame_dim.xsize * frame_dim.ysize; + if (used_cpu_base_ + num_pixels < used_cpu_base_) { + return JXL_API_ERROR("used too much CPU"); + } + used_cpu_base_ += num_pixels; + if (used_cpu_base_ > cpu_limit_base_) { + return JXL_API_ERROR("used too much CPU"); + } + } + + jxl::Status status = + dec->frame_dec->ProcessSections(dec->sections->section_info.data(), + dec->sections->section_info.size(), + dec->sections->section_status.data()); + JXL_API_RETURN_IF_ERROR(dec->sections->CloseInput()); + if (status.IsFatalError()) { + return JXL_API_ERROR("decoding frame failed"); + } + + // TODO(lode): allow next_in to move forward if sections from the + // beginning of the stream have been processed + + if (status.code() == StatusCode::kNotEnoughBytes || + dec->sections->section_info.size() < dec->frame_dec->NumSections()) { + // Not all sections have been processed yet + return JXL_DEC_NEED_MORE_INPUT; + } + + size_t internal_index = dec->internal_frames - 1; + JXL_ASSERT(dec->frame_references.size() > internal_index); + // Always fill this in, even if it was already written, it could be that + // this frame was skipped before and set to 255, while only now we know + // the true value. 
+ dec->frame_references[internal_index] = dec->frame_dec->References(); + if (!dec->frame_dec->FinalizeFrame()) { + return JXL_API_ERROR("decoding frame failed"); + } + dec->frame_dec_in_progress = false; + dec->frame_stage = FrameStage::kFullOutput; + } + + if (dec->frame_stage == FrameStage::kFullOutput) { + if (dec->is_last_of_still) { + if (dec->events_wanted & JXL_DEC_FULL_IMAGE) { + dec->events_wanted &= ~JXL_DEC_FULL_IMAGE; + return_full_image = true; + } + + // Frame finished, restore the events_wanted with the per-frame events + // from orig_events_wanted, in case there is a next frame. + dec->events_wanted |= + (dec->orig_events_wanted & (JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME)); + + // If no output buffer was set, we merely return the JXL_DEC_FULL_IMAGE + // status without outputting pixels. + if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) { + JxlDecoderStatus status = + dec->jpeg_decoder.WriteOutput(*dec->ib->jpeg_data); + if (status != JXL_DEC_SUCCESS) return status; + } else if (return_full_image && dec->image_out_buffer_set) { + if (!dec->frame_dec->HasRGBBuffer()) { + // Copy pixels if desired. + JxlDecoderStatus status = ConvertImageInternal( + dec, *dec->ib, dec->image_out_format, dec->image_out_buffer, + dec->image_out_size, dec->image_out_callback, + dec->image_out_opaque); + if (status != JXL_DEC_SUCCESS) return status; + } + dec->image_out_buffer_set = false; + } + } + } + + // The pixels have been output or are not needed, do not keep them in + // memory here. + dec->ib.reset(); + dec->frame_stage = FrameStage::kHeader; + dec->frame_start += dec->frame_size; + if (return_full_image && !dec->skipping_frame) { + return JXL_DEC_FULL_IMAGE; + } + } + + dec->stage = DecoderStage::kFinished; + // Return success, this means there is nothing more to do. 
+ return JXL_DEC_SUCCESS; +} + +} // namespace +} // namespace jxl + +JxlDecoderStatus JxlDecoderSetInput(JxlDecoder* dec, const uint8_t* data, + size_t size) { + if (dec->next_in) return JXL_DEC_ERROR; + + dec->next_in = data; + dec->avail_in = size; + return JXL_DEC_SUCCESS; +} + +size_t JxlDecoderReleaseInput(JxlDecoder* dec) { + size_t result = dec->avail_in; + dec->next_in = nullptr; + dec->avail_in = 0; + return result; +} + +JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec, uint8_t* data, + size_t size) { + return dec->jpeg_decoder.SetOutputBuffer(data, size); +} + +size_t JxlDecoderReleaseJPEGBuffer(JxlDecoder* dec) { + return dec->jpeg_decoder.ReleaseOutputBuffer(); +} + +JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) { + const uint8_t** next_in = &dec->next_in; + size_t* avail_in = &dec->avail_in; + if (dec->stage == DecoderStage::kInited) { + dec->stage = DecoderStage::kStarted; + } + if (dec->stage == DecoderStage::kError) { + return JXL_API_ERROR( + "Cannot keep using decoder after it encountered an error, use " + "JxlDecoderReset to reset it"); + } + if (dec->stage == DecoderStage::kFinished) { + return JXL_API_ERROR( + "Cannot keep using decoder after it finished, use JxlDecoderReset to " + "reset it"); + } + + if (!dec->got_signature) { + JxlSignature sig = JxlSignatureCheck(*next_in, *avail_in); + if (sig == JXL_SIG_INVALID) return JXL_API_ERROR("invalid signature"); + if (sig == JXL_SIG_NOT_ENOUGH_BYTES) return JXL_DEC_NEED_MORE_INPUT; + + dec->got_signature = true; + + if (sig == JXL_SIG_CONTAINER) { + dec->have_container = 1; + } + } + + // Available codestream bytes, may differ from *avail_in if there is another + // box behind the current position, in the dec->have_container case. 
+ size_t csize = *avail_in; + + if (dec->have_container) { + /* + Process bytes as follows: + *) find the box(es) containing the codestream + *) support codestream split over multiple partial boxes + *) avoid copying bytes to the codestream vector if the decoding will be + one-shot, when the user already provided everything contiguously in + memory + *) copy to codestream vector, and update next_in so user can delete the data + on their side, once we know it's not oneshot. This relieves the user from + continuing to store the data. + *) also copy to codestream if one-shot but the codestream is split across + multiple boxes: this copying can be avoided in the future if the C++ + decoder is updated for streaming, but for now it requires all consecutive + data at once. + */ + + if (dec->skip_box) { + // Amount of remaining bytes in the box that is being skipped. + size_t remaining = dec->box_end - dec->file_pos; + if (*avail_in < remaining) { + // Don't have the full box yet, skip all we have so far + dec->file_pos += *avail_in; + *next_in += *avail_in; + *avail_in -= *avail_in; + return JXL_DEC_NEED_MORE_INPUT; + } else { + // Full box available, skip all its remaining bytes + dec->file_pos += remaining; + *next_in += remaining; + *avail_in -= remaining; + dec->skip_box = false; + } + } + + if (dec->first_codestream_seen && !dec->last_codestream_seen && + dec->codestream_end != 0 && dec->file_pos < dec->codestream_end && + dec->file_pos + *avail_in >= dec->codestream_end && + !dec->codestream.empty()) { + // dec->file_pos in a codestream, not in surrounding box format bytes, but + // the end of the current codestream part is in the current input, and + // boxes that can contain a next part of the codestream could be present. + // Therefore, store the known codestream part, and ensure processing of + // boxes below will trigger. This is only done if + // !dec->codestream.empty(), that is, we're already streaming. 
+ + // Size of the codestream, excluding potential boxes that come after it. + csize = *avail_in; + if (dec->codestream_end && csize > dec->codestream_end - dec->file_pos) { + csize = dec->codestream_end - dec->file_pos; + } + dec->codestream.insert(dec->codestream.end(), *next_in, *next_in + csize); + dec->file_pos += csize; + *next_in += csize; + *avail_in -= csize; + } + + if (dec->jpeg_decoder.IsParsingBox()) { + // We are inside a JPEG reconstruction box. + JxlDecoderStatus recon_result = + dec->jpeg_decoder.Process(next_in, avail_in); + if (recon_result == JXL_DEC_JPEG_RECONSTRUCTION) { + // If successful JPEG reconstruction, return the success if the user + // cares about it, otherwise continue. + if (dec->events_wanted & recon_result) { + dec->events_wanted &= ~recon_result; + return recon_result; + } + } else { + // If anything else, return the result. + return recon_result; + } + } + + if (!dec->last_codestream_seen && + (dec->codestream_begin == 0 || + (dec->codestream_end != 0 && dec->file_pos >= dec->codestream_end))) { + size_t pos = 0; + // after this for loop, either we should be in a part of the data that is + // codestream (not boxes), or have returned that we need more input. + for (;;) { + const uint8_t* in = *next_in; + size_t size = *avail_in; + if (size == pos) { + // If the remaining size is 0, we are exactly after a full box. We + // can't know for sure if this is the last box or not since more bytes + // can follow, but do not return NEED_MORE_INPUT, instead break and + // let the codestream-handling code determine if we need more. + break; + } + if (OutOfBounds(pos, 8, size)) { + dec->basic_info_size_hint = + InitialBasicInfoSizeHint() + pos + 8 - dec->file_pos; + return JXL_DEC_NEED_MORE_INPUT; + } + size_t box_start = pos; + // Box size, including this header itself. 
+ uint64_t box_size = LoadBE32(in + pos); + char type[5] = {0}; + memcpy(type, in + pos + 4, 4); + pos += 8; + if (box_size == 1) { + if (OutOfBounds(pos, 8, size)) return JXL_DEC_NEED_MORE_INPUT; + box_size = LoadBE64(in + pos); + pos += 8; + } + size_t header_size = pos - box_start; + if (box_size > 0 && box_size < header_size) { + return JXL_API_ERROR("invalid box size"); + } + if (SumOverflows(dec->file_pos, pos, box_size)) { + return JXL_API_ERROR("Box size overflow"); + } + size_t contents_size = + (box_size == 0) ? 0 : (box_size - pos + box_start); + + dec->box_begin = box_start; + dec->box_end = dec->file_pos + box_start + box_size; + if (strcmp(type, "jxlc") == 0 || strcmp(type, "jxlp") == 0) { + size_t codestream_size = contents_size; + // Whether this is the last codestream box, either when it is a jxlc + // box, or when it is a jxlp box that has the final bit set. + // The codestream is either contained within a single jxlc box, or + // within one or more jxlp boxes. The final jxlp box is marked as last + // by setting the high bit of its 4-byte box-index value. + bool last_codestream = false; + if (strcmp(type, "jxlp") == 0) { + if (OutOfBounds(pos, 4, size)) return JXL_DEC_NEED_MORE_INPUT; + if (box_size != 0 && contents_size < 4) { + return JXL_API_ERROR("jxlp box too small to contain index"); + } + codestream_size -= 4; + size_t jxlp_index = LoadBE32(in + pos); + pos += 4; + // The high bit of jxlp_index indicates whether this is the last + // jxlp box. 
+ if (jxlp_index & 0x80000000) last_codestream = true; + } else if (strcmp(type, "jxlc") == 0) { + last_codestream = true; + } + if (!last_codestream && box_size == 0) { + return JXL_API_ERROR( + "final box has unbounded size, but is a non-final codestream " + "box"); + } + dec->first_codestream_seen = true; + if (last_codestream) dec->last_codestream_seen = true; + if (dec->codestream_begin != 0 && dec->codestream.empty()) { + // We've already seen a codestream part, so it's a stream spanning + // multiple boxes. + // We have no choice but to copy contents to the codestream + // vector to make it a contiguous stream for the C++ decoder. + // This appends the previous codestream box that we had seen to + // dec->codestream. + if (dec->codestream_begin < dec->file_pos) { + return JXL_API_ERROR("earlier codestream box out of range"); + } + size_t begin = dec->codestream_begin - dec->file_pos; + size_t end = dec->codestream_end - dec->file_pos; + JXL_ASSERT(end <= *avail_in); + dec->codestream.insert(dec->codestream.end(), *next_in + begin, + *next_in + end); + } + dec->codestream_begin = dec->file_pos + pos; + dec->codestream_end = + (box_size == 0) ? 0 : (dec->codestream_begin + codestream_size); + size_t avail_codestream_size = + (box_size == 0) + ? (size - pos) + : std::min(size - pos, box_size - pos + box_start); + // If already appending codestream, append what we have here too + if (!dec->codestream.empty()) { + size_t begin = pos; + size_t end = + std::min(*avail_in, begin + avail_codestream_size); + dec->codestream.insert(dec->codestream.end(), *next_in + begin, + *next_in + end); + pos += (end - begin); + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + pos = 0; + // TODO(lode): check if this should break always instead, and + // process what we have of the codestream so far, to support + // progressive decoding, and get events such as basic info faster. 
+ // The user could have given 1.5 boxes here, and the first one could + // contain useful parts of codestream that can already be processed. + // Similar to several other exact avail_size checks. This may not + // need to be changed here, but instead at the point in this for + // loop where it returns "NEED_MORE_INPUT", it could instead break + // and allow decoding what we have of the codestream so far. + if (*avail_in == 0) break; + } else { + // skip only the header, so next_in points to the start of this new + // codestream part, for the one-shot case where user data is not + // (yet) copied to dec->codestream. + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + pos = 0; + // Update pos to be after the box contents with codestream + if (avail_codestream_size == *avail_in) { + break; // the rest is codestream, this loop is done + } + pos += avail_codestream_size; + } + } else if ((JPEGXL_ENABLE_TRANSCODE_JPEG) && + (dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) && + strcmp(type, "jbrd") == 0) { + // This is a new JPEG reconstruction metadata box. + dec->jpeg_decoder.StartBox(box_size, contents_size); + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + pos = 0; + JxlDecoderStatus recon_result = + dec->jpeg_decoder.Process(next_in, avail_in); + if (recon_result == JXL_DEC_JPEG_RECONSTRUCTION) { + // If successful JPEG reconstruction, return the success if the user + // cares about it, otherwise continue. + if (dec->events_wanted & recon_result) { + dec->events_wanted &= ~recon_result; + return recon_result; + } + } else { + // If anything else, return the result. + return recon_result; + } + } else { + if (box_size == 0) { + // Final box with unknown size, but it's not a codestream box, so + // nothing more to do. 
+ if (!dec->first_codestream_seen) { + return JXL_API_ERROR("didn't find any codestream box"); + } + break; + } + if (OutOfBounds(pos, contents_size, size)) { + dec->skip_box = true; + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + // Indicate how many more bytes needed starting from *next_in. + dec->basic_info_size_hint = InitialBasicInfoSizeHint() + pos + + contents_size - dec->file_pos; + return JXL_DEC_NEED_MORE_INPUT; + } + pos += contents_size; + if (!(dec->codestream.empty() && dec->first_codestream_seen)) { + // Last box no longer needed since we have copied the codestream + // buffer, remove from input so user can release memory. + dec->file_pos += pos; + *next_in += pos; + *avail_in -= pos; + pos = 0; + } + } + } + } + + // Size of the codestream, excluding potential boxes that come after it. + csize = *avail_in; + if (dec->codestream_end && csize > dec->codestream_end - dec->file_pos) { + csize = dec->codestream_end - dec->file_pos; + } + } + + // Whether we are taking the input directly from the user (oneshot case, + // without copying bytes), or appending parts of input to dec->codestream + // (streaming) + bool detected_streaming = !dec->codestream.empty(); + JxlDecoderStatus result; + JXL_DASSERT(csize <= *avail_in); + + if (detected_streaming) { + dec->codestream.insert(dec->codestream.end(), *next_in, *next_in + csize); + dec->file_pos += csize; + *next_in += csize; + *avail_in -= csize; + result = jxl::JxlDecoderProcessInternal(dec, dec->codestream.data(), + dec->codestream.size()); + } else { + // No data copied to codestream buffer yet, the user input may contain the + // full codestream. + result = jxl::JxlDecoderProcessInternal(dec, *next_in, csize); + // Copy the user's input bytes to the codestream once we are able to and + // it is needed. Before we got the basic info, we're still parsing the box + // format instead. 
If the result is not JXL_DEC_NEED_MORE_INPUT, then + // there is no reason yet to copy since the user may have a full buffer + // allowing one-shot. Once JXL_DEC_NEED_MORE_INPUT occurred at least once, + // start copying over the codestream bytes and allow user to free them + // instead. Next call, detected_streaming will be true. + if (dec->got_basic_info && result == JXL_DEC_NEED_MORE_INPUT) { + dec->codestream.insert(dec->codestream.end(), *next_in, *next_in + csize); + dec->file_pos += csize; + *next_in += csize; + *avail_in -= csize; + } + } + + return result; +} + +JxlDecoderStatus JxlDecoderGetBasicInfo(const JxlDecoder* dec, + JxlBasicInfo* info) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + if (info) { + const jxl::ImageMetadata& meta = dec->metadata.m; + + info->have_container = dec->have_container; + info->xsize = dec->metadata.size.xsize(); + info->ysize = dec->metadata.size.ysize(); + info->uses_original_profile = !meta.xyb_encoded; + + info->bits_per_sample = meta.bit_depth.bits_per_sample; + info->exponent_bits_per_sample = meta.bit_depth.exponent_bits_per_sample; + + info->have_preview = meta.have_preview; + info->have_animation = meta.have_animation; + // TODO(janwas): intrinsic_size + info->orientation = static_cast(meta.orientation); + + if (!dec->keep_orientation) { + if (info->orientation >= JXL_ORIENT_TRANSPOSE) { + std::swap(info->xsize, info->ysize); + } + info->orientation = JXL_ORIENT_IDENTITY; + } + + info->intensity_target = meta.IntensityTarget(); + info->min_nits = meta.tone_mapping.min_nits; + info->relative_to_max_display = meta.tone_mapping.relative_to_max_display; + info->linear_below = meta.tone_mapping.linear_below; + + const jxl::ExtraChannelInfo* alpha = meta.Find(jxl::ExtraChannel::kAlpha); + if (alpha != nullptr) { + info->alpha_bits = alpha->bit_depth.bits_per_sample; + info->alpha_exponent_bits = alpha->bit_depth.exponent_bits_per_sample; + info->alpha_premultiplied = alpha->alpha_associated; + } else { 
+ info->alpha_bits = 0; + info->alpha_exponent_bits = 0; + info->alpha_premultiplied = 0; + } + + info->num_color_channels = + meta.color_encoding.GetColorSpace() == jxl::ColorSpace::kGray ? 1 : 3; + + info->num_extra_channels = meta.num_extra_channels; + + if (info->have_preview) { + info->preview.xsize = dec->metadata.m.preview_size.xsize(); + info->preview.ysize = dec->metadata.m.preview_size.ysize(); + } + + if (info->have_animation) { + info->animation.tps_numerator = dec->metadata.m.animation.tps_numerator; + info->animation.tps_denominator = + dec->metadata.m.animation.tps_denominator; + info->animation.num_loops = dec->metadata.m.animation.num_loops; + info->animation.have_timecodes = dec->metadata.m.animation.have_timecodes; + } + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetExtraChannelInfo(const JxlDecoder* dec, + size_t index, + JxlExtraChannelInfo* info) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + const std::vector& channels = + dec->metadata.m.extra_channel_info; + + if (index >= channels.size()) return JXL_DEC_ERROR; // out of bounds + const jxl::ExtraChannelInfo& channel = channels[index]; + + info->type = static_cast(channel.type); + info->bits_per_sample = channel.bit_depth.bits_per_sample; + info->exponent_bits_per_sample = + channel.bit_depth.floating_point_sample + ? 
channel.bit_depth.exponent_bits_per_sample + : 0; + info->dim_shift = channel.dim_shift; + info->name_length = channel.name.size(); + info->alpha_associated = channel.alpha_associated; + info->spot_color[0] = channel.spot_color[0]; + info->spot_color[1] = channel.spot_color[1]; + info->spot_color[2] = channel.spot_color[2]; + info->spot_color[3] = channel.spot_color[3]; + info->cfa_channel = channel.cfa_channel; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetExtraChannelName(const JxlDecoder* dec, + size_t index, char* name, + size_t size) { + if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT; + + const std::vector& channels = + dec->metadata.m.extra_channel_info; + + if (index >= channels.size()) return JXL_DEC_ERROR; // out of bounds + const jxl::ExtraChannelInfo& channel = channels[index]; + + // Also need null-termination character + if (channel.name.size() + 1 > size) return JXL_DEC_ERROR; + + memcpy(name, channel.name.c_str(), channel.name.size() + 1); + + return JXL_DEC_SUCCESS; +} + +namespace { + +// Gets the jxl::ColorEncoding for the desired target, and checks errors. +// Returns the object regardless of whether the actual color space is in ICC, +// but ensures that if the color encoding is not the encoding from the +// codestream header metadata, it cannot require ICC profile. 
+JxlDecoderStatus GetColorEncodingForTarget( + const JxlDecoder* dec, const JxlPixelFormat* format, + JxlColorProfileTarget target, const jxl::ColorEncoding** encoding) { + if (!dec->got_all_headers) return JXL_DEC_NEED_MORE_INPUT; + *encoding = nullptr; + if (target == JXL_COLOR_PROFILE_TARGET_DATA && dec->metadata.m.xyb_encoded) { + *encoding = &dec->passes_state->output_encoding_info.color_encoding; + } else { + *encoding = &dec->metadata.m.color_encoding; + } + return JXL_DEC_SUCCESS; +} +} // namespace + +JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile( + const JxlDecoder* dec, const JxlPixelFormat* format, + JxlColorProfileTarget target, JxlColorEncoding* color_encoding) { + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + JxlDecoderStatus status = + GetColorEncodingForTarget(dec, format, target, &jxl_color_encoding); + if (status) return status; + + if (jxl_color_encoding->WantICC()) + return JXL_DEC_ERROR; // Indicate no encoded profile available. + + if (color_encoding) { + ConvertInternalToExternalColorEncoding(*jxl_color_encoding, color_encoding); + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetICCProfileSize(const JxlDecoder* dec, + const JxlPixelFormat* format, + JxlColorProfileTarget target, + size_t* size) { + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + JxlDecoderStatus status = + GetColorEncodingForTarget(dec, format, target, &jxl_color_encoding); + if (status != JXL_DEC_SUCCESS) return status; + + if (jxl_color_encoding->WantICC()) { + jxl::ColorSpace color_space = + dec->metadata.m.color_encoding.GetColorSpace(); + if (color_space == jxl::ColorSpace::kUnknown || + color_space == jxl::ColorSpace::kXYB) { + // This indicates there's no ICC profile available + // TODO(lode): for the XYB case, do we want to craft an ICC profile that + // represents XYB as an RGB profile? It may be possible, but not with + // only 1D transfer functions. 
+ return JXL_DEC_ERROR; + } + } + + if (size) { + *size = jxl_color_encoding->ICC().size(); + } + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetColorAsICCProfile(const JxlDecoder* dec, + const JxlPixelFormat* format, + JxlColorProfileTarget target, + uint8_t* icc_profile, + size_t size) { + size_t wanted_size; + // This also checks the NEED_MORE_INPUT and the unknown/xyb cases + JxlDecoderStatus status = + JxlDecoderGetICCProfileSize(dec, format, target, &wanted_size); + if (status != JXL_DEC_SUCCESS) return status; + if (size < wanted_size) return JXL_API_ERROR("ICC profile output too small"); + + const jxl::ColorEncoding* jxl_color_encoding = nullptr; + status = GetColorEncodingForTarget(dec, format, target, &jxl_color_encoding); + if (status != JXL_DEC_SUCCESS) return status; + + memcpy(icc_profile, jxl_color_encoding->ICC().data(), + jxl_color_encoding->ICC().size()); + + return JXL_DEC_SUCCESS; +} + +namespace { + +// Returns the amount of bits needed for getting memory buffer size, and does +// all error checking required for size checking and format validity. +JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec, + const JxlPixelFormat* format, size_t* bits) { + if (!dec->got_basic_info) { + // Don't know image dimensions yet, cannot check for valid size. 
+ return JXL_DEC_NEED_MORE_INPUT; + } + if (format->num_channels > 4) { + return JXL_API_ERROR("More than 4 channels not supported"); + } + if (format->num_channels < 3 && !dec->metadata.m.color_encoding.IsGray()) { + return JXL_API_ERROR("Grayscale output not possible for color image"); + } + if (format->data_type == JXL_TYPE_BOOLEAN) { + return JXL_API_ERROR("Boolean data type not yet supported"); + } + if (format->data_type == JXL_TYPE_UINT32) { + return JXL_API_ERROR("uint32 data type not yet supported"); + } + + *bits = BitsPerChannel(format->data_type); + + if (*bits == 0) { + return JXL_API_ERROR("Invalid data type"); + } + + return JXL_DEC_SUCCESS; +} +} // namespace + +JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) { + if (!dec->image_out_buffer) return JXL_DEC_ERROR; + if (!dec->sections || dec->sections->section_info.empty()) { + return JXL_DEC_ERROR; + } + if (!dec->frame_dec || !dec->frame_dec_in_progress) { + return JXL_DEC_ERROR; + } + if (!dec->frame_dec->HasDecodedDC()) { + // FrameDecoder::Fush currently requires DC to have been decoded already + // to work correctly. + return JXL_DEC_ERROR; + } + if (dec->frame_header->encoding != jxl::FrameEncoding::kVarDCT) { + // Flushing does not yet work correctly if the frame uses modular encoding. + return JXL_DEC_ERROR; + } + if (dec->metadata.m.num_extra_channels > 0) { + // Flushing does not yet work correctly if there are extra channels, which + // use modular + return JXL_DEC_ERROR; + } + + if (!dec->frame_dec->Flush()) { + return JXL_DEC_ERROR; + } + + if (dec->frame_dec->HasRGBBuffer()) { + return JXL_DEC_SUCCESS; + } + + // Temporarily shrink `dec->ib` to the actual size of the full image to call + // ConvertImageInternal. 
+ size_t xsize = dec->ib->xsize(); + size_t ysize = dec->ib->ysize(); + dec->ib->ShrinkTo(dec->metadata.size.xsize(), dec->metadata.size.ysize()); + JxlDecoderStatus status = jxl::ConvertImageInternal( + dec, *dec->ib, dec->image_out_format, dec->image_out_buffer, + dec->image_out_size, + /*out_callback=*/nullptr, /*out_opaque=*/nullptr); + dec->ib->ShrinkTo(xsize, ysize); + if (status != JXL_DEC_SUCCESS) return status; + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) { + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + + size_t xsize = dec->metadata.oriented_preview_xsize(dec->keep_orientation); + size_t ysize = dec->metadata.oriented_preview_ysize(dec->keep_orientation); + + size_t row_size = + jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte); + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * ysize; + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreviewOutBuffer( + JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size) { + if (!dec->got_basic_info || !dec->metadata.m.have_preview || + !(dec->orig_events_wanted & JXL_DEC_PREVIEW_IMAGE)) { + return JXL_API_ERROR("No preview out buffer needed at this time"); + } + + size_t min_size; + // This also checks whether the format is valid and supported and basic info + // is available. 
+ JxlDecoderStatus status = + JxlDecoderPreviewOutBufferSize(dec, format, &min_size); + if (status != JXL_DEC_SUCCESS) return status; + + if (size < min_size) return JXL_DEC_ERROR; + + dec->preview_out_buffer_set = true; + dec->preview_out_buffer = buffer; + dec->preview_out_size = size; + dec->preview_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderDCOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) { + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + + size_t xsize = jxl::DivCeil( + dec->metadata.oriented_xsize(dec->keep_orientation), jxl::kBlockDim); + size_t ysize = jxl::DivCeil( + dec->metadata.oriented_ysize(dec->keep_orientation), jxl::kBlockDim); + + size_t row_size = + jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte); + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * ysize; + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderSetDCOutBuffer( + JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size) { + // No buffer set: this feature is deprecated + return JXL_DEC_SUCCESS; +} + +JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize( + const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) { + size_t bits; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits); + if (status != JXL_DEC_SUCCESS) return status; + + size_t row_size = + jxl::DivCeil(dec->metadata.oriented_xsize(dec->keep_orientation) * + format->num_channels * bits, + jxl::kBitsPerByte); + if (format->align > 1) { + row_size = jxl::DivCeil(row_size, format->align) * format->align; + } + *size = row_size * dec->metadata.oriented_ysize(dec->keep_orientation); + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetImageOutBuffer(JxlDecoder* dec, + const JxlPixelFormat* format, + void* 
buffer, size_t size) { + if (!dec->got_basic_info || !(dec->orig_events_wanted & JXL_DEC_FULL_IMAGE)) { + return JXL_API_ERROR("No image out buffer needed at this time"); + } + if (dec->image_out_buffer_set && !!dec->image_out_callback) { + return JXL_API_ERROR( + "Cannot change from image out callback to image out buffer"); + } + size_t min_size; + // This also checks whether the format is valid and supported and basic info + // is available. + JxlDecoderStatus status = + JxlDecoderImageOutBufferSize(dec, format, &min_size); + if (status != JXL_DEC_SUCCESS) return status; + + if (size < min_size) return JXL_DEC_ERROR; + + dec->image_out_buffer_set = true; + dec->image_out_buffer = buffer; + dec->image_out_size = size; + dec->image_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetImageOutCallback(JxlDecoder* dec, + const JxlPixelFormat* format, + JxlImageOutCallback callback, + void* opaque) { + if (dec->image_out_buffer_set && !!dec->image_out_buffer) { + return JXL_API_ERROR( + "Cannot change from image out buffer to image out callback"); + } + + // Perform error checking for invalid format. 
+ size_t bits_dummy; + JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits_dummy); + if (status != JXL_DEC_SUCCESS) return status; + + dec->image_out_buffer_set = true; + dec->image_out_callback = callback; + dec->image_out_opaque = opaque; + dec->image_out_format = *format; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec, + JxlFrameHeader* header) { + if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) { + return JXL_API_ERROR("no frame header available"); + } + const auto& metadata = dec->metadata.m; + if (metadata.have_animation) { + header->duration = dec->frame_header->animation_frame.duration; + if (metadata.animation.have_timecodes) { + header->timecode = dec->frame_header->animation_frame.timecode; + } + } + header->name_length = dec->frame_header->name.size(); + header->is_last = dec->frame_header->is_last; + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec, char* name, + size_t size) { + if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) { + return JXL_API_ERROR("no frame header available"); + } + if (size < dec->frame_header->name.size() + 1) { + return JXL_API_ERROR("too small frame name output buffer"); + } + memcpy(name, dec->frame_header->name.c_str(), + dec->frame_header->name.size() + 1); + + return JXL_DEC_SUCCESS; +} + +JxlDecoderStatus JxlDecoderSetPreferredColorProfile( + JxlDecoder* dec, const JxlColorEncoding* color_encoding) { + if (!dec->got_all_headers) { + return JXL_API_ERROR("color info not yet available"); + } + if (dec->post_headers) { + return JXL_API_ERROR("too late to set the color encoding"); + } + if (dec->metadata.m.color_encoding.IsGray() != + (color_encoding->color_space == JXL_COLOR_SPACE_GRAY)) { + return JXL_API_ERROR("grayscale mismatch"); + } + if (color_encoding->color_space == JXL_COLOR_SPACE_UNKNOWN || + color_encoding->color_space == JXL_COLOR_SPACE_XYB) { + return JXL_API_ERROR("only 
RGB or grayscale output supported"); + } + + JXL_API_RETURN_IF_ERROR(ConvertExternalToInternalColorEncoding( + *color_encoding, &dec->default_enc)); + JXL_API_RETURN_IF_ERROR(dec->passes_state->output_encoding_info.Set( + dec->metadata, dec->default_enc)); + return JXL_DEC_SUCCESS; +} + +// This function is "package-private". It is only used by fuzzer to avoid +// running cases that are too memory / CPU hungry. Limitations are applied +// at mid-level API. In the future high-level API would also include the +// means of limiting / throttling memory / CPU usage. +void SetDecoderMemoryLimitBase_(size_t memory_limit_base) { + memory_limit_base_ = memory_limit_base; + // Allow 5 x max_image_size processing units; every frame is accounted + // as W x H CPU processing units, so there could be numerous small frames + // or few larger ones. + cpu_limit_base_ = 5 * memory_limit_base; +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc new file mode 100644 index 0000000000..f4e94d1412 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_encode.cc @@ -0,0 +1,471 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "jxl/encode.h" + +#include +#include + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_icc_codec.h" +#include "lib/jxl/encode_internal.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" + +#define JPEGXL_MAJOR_VERSION 0 +#define JPEGXL_MINOR_VERSION 5 +#define JPEGXL_PATCH_VERSION 0 + +// Debug-printing failure macro similar to JXL_FAILURE, but for the status code +// JXL_ENC_ERROR +#ifdef JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(format, ...) 
\ + (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort(), JXL_ENC_ERROR) +#else // JXL_CRASH_ON_ERROR +#define JXL_API_ERROR(format, ...) \ + (((JXL_DEBUG_ON_ERROR) && \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \ + JXL_ENC_ERROR) +#endif // JXL_CRASH_ON_ERROR + +namespace jxl {} // namespace jxl + +uint32_t JxlEncoderVersion(void) { + return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 + + JPEGXL_PATCH_VERSION; +} + +JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() { + jxl::MemoryManagerUniquePtr input_frame = + std::move(input_frame_queue[0]); + input_frame_queue.erase(input_frame_queue.begin()); + + // TODO(zond): If the frame queue is empty and the input_closed is true, + // then mark this frame as the last. + + jxl::BitWriter writer; + + if (!wrote_bytes) { + if (use_container) { + output_byte_queue.insert( + output_byte_queue.end(), jxl::kContainerHeader, + jxl::kContainerHeader + sizeof(jxl::kContainerHeader)); + if (store_jpeg_metadata && jpeg_metadata.size() > 0) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_metadata.size(), + false, &output_byte_queue); + output_byte_queue.insert(output_byte_queue.end(), jpeg_metadata.begin(), + jpeg_metadata.end()); + } + } + if (!WriteHeaders(&metadata, &writer, nullptr)) { + return JXL_ENC_ERROR; + } + // Only send ICC (at least several hundred bytes) if fields aren't enough. + if (metadata.m.color_encoding.WantICC()) { + if (!jxl::WriteICC(metadata.m.color_encoding.ICC(), &writer, + jxl::kLayerHeader, nullptr)) { + return JXL_ENC_ERROR; + } + } + + // TODO(lode): preview should be added here if a preview image is added + + // Each frame should start on byte boundaries. + writer.ZeroPadToByte(); + } + + // TODO(zond): Handle progressive mode like EncodeFile does it. 
+ // TODO(zond): Handle animation like EncodeFile does it, by checking if + // JxlEncoderCloseInput has been called and if the frame queue is + // empty (to see if it's the last animation frame). + + if (metadata.m.xyb_encoded) { + input_frame->option_values.cparams.color_transform = + jxl::ColorTransform::kXYB; + } else { + // TODO(zond): Figure out when to use kYCbCr instead. + input_frame->option_values.cparams.color_transform = + jxl::ColorTransform::kNone; + } + + jxl::PassesEncoderState enc_state; + if (!jxl::EncodeFrame(input_frame->option_values.cparams, jxl::FrameInfo{}, + &metadata, input_frame->frame, &enc_state, + thread_pool.get(), &writer, + /*aux_out=*/nullptr)) { + return JXL_ENC_ERROR; + } + + jxl::PaddedBytes bytes = std::move(writer).TakeBytes(); + + if (use_container && !wrote_bytes) { + if (input_closed && input_frame_queue.empty()) { + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), bytes.size(), + /*unbounded=*/false, &output_byte_queue); + } else { + jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), 0, /*unbounded=*/true, + &output_byte_queue); + } + } + + output_byte_queue.insert(output_byte_queue.end(), bytes.data(), + bytes.data() + bytes.size()); + wrote_bytes = true; + + last_used_cparams = input_frame->option_values.cparams; + + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetColorEncoding(JxlEncoder* enc, + const JxlColorEncoding* color) { + if (enc->color_encoding_set) { + // Already set + return JXL_ENC_ERROR; + } + if (!jxl::ConvertExternalToInternalColorEncoding( + *color, &enc->metadata.m.color_encoding)) { + return JXL_ENC_ERROR; + } + enc->color_encoding_set = true; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc, + const uint8_t* icc_profile, + size_t size) { + if (enc->color_encoding_set) { + // Already set + return JXL_ENC_ERROR; + } + jxl::PaddedBytes icc; + icc.assign(icc_profile, icc_profile + size); + if (!enc->metadata.m.color_encoding.SetICCRaw(std::move(icc))) { + 
return JXL_ENC_ERROR; + } + enc->color_encoding_set = true; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc, + const JxlBasicInfo* info) { + if (!enc->metadata.size.Set(info->xsize, info->ysize)) { + return JXL_ENC_ERROR; + } + if (info->exponent_bits_per_sample) { + if (info->exponent_bits_per_sample != 8) return JXL_ENC_NOT_SUPPORTED; + if (info->bits_per_sample == 32) { + enc->metadata.m.SetFloat32Samples(); + } else { + return JXL_ENC_NOT_SUPPORTED; + } + } else { + switch (info->bits_per_sample) { + case 32: + case 16: + case 8: + enc->metadata.m.SetUintSamples(info->bits_per_sample); + break; + default: + return JXL_ENC_ERROR; + break; + } + } + if (info->alpha_bits > 0 && info->alpha_exponent_bits > 0) { + return JXL_ENC_NOT_SUPPORTED; + } + switch (info->alpha_bits) { + case 0: + break; + case 32: + case 16: + enc->metadata.m.SetAlphaBits(16); + break; + case 8: + enc->metadata.m.SetAlphaBits(info->alpha_bits); + break; + default: + return JXL_ENC_ERROR; + break; + } + enc->metadata.m.xyb_encoded = !info->uses_original_profile; + enc->basic_info_set = true; + return JXL_ENC_SUCCESS; +} + +JxlEncoderOptions* JxlEncoderOptionsCreate(JxlEncoder* enc, + const JxlEncoderOptions* source) { + auto opts = + jxl::MemoryManagerMakeUnique(&enc->memory_manager); + if (!opts) return nullptr; + opts->enc = enc; + if (source != nullptr) { + opts->values = source->values; + } else { + opts->values.lossless = false; + } + JxlEncoderOptions* ret = opts.get(); + enc->encoder_options.emplace_back(std::move(opts)); + return ret; +} + +JxlEncoderStatus JxlEncoderOptionsSetLossless(JxlEncoderOptions* options, + const JXL_BOOL lossless) { + options->values.lossless = lossless; + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderOptionsSetEffort(JxlEncoderOptions* options, + const int effort) { + if (effort < 3 || effort > 9) { + return JXL_ENC_ERROR; + } + options->values.cparams.speed_tier = static_cast(10 - effort); + return 
JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderOptionsSetDistance(JxlEncoderOptions* options, + float distance) { + if (distance < 0 || distance > 15) { + return JXL_ENC_ERROR; + } + options->values.cparams.butteraugli_distance = distance; + return JXL_ENC_SUCCESS; +} + +JxlEncoder* JxlEncoderCreate(const JxlMemoryManager* memory_manager) { + JxlMemoryManager local_memory_manager; + if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) { + return nullptr; + } + + void* alloc = + jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlEncoder)); + if (!alloc) return nullptr; + JxlEncoder* enc = new (alloc) JxlEncoder(); + enc->memory_manager = local_memory_manager; + + return enc; +} + +void JxlEncoderReset(JxlEncoder* enc) { + enc->thread_pool.reset(); + enc->input_frame_queue.clear(); + enc->encoder_options.clear(); + enc->output_byte_queue.clear(); + enc->wrote_bytes = false; + enc->metadata = jxl::CodecMetadata(); + enc->last_used_cparams = jxl::CompressParams(); + enc->input_closed = false; + enc->basic_info_set = false; + enc->color_encoding_set = false; +} + +void JxlEncoderDestroy(JxlEncoder* enc) { + if (enc) { + // Call destructor directly since custom free function is used. 
+ enc->~JxlEncoder(); + jxl::MemoryManagerFree(&enc->memory_manager, enc); + } +} + +JxlEncoderStatus JxlEncoderUseContainer(JxlEncoder* enc, + JXL_BOOL use_container) { + enc->use_container = static_cast(use_container); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderStoreJPEGMetadata(JxlEncoder* enc, + JXL_BOOL store_jpeg_metadata) { + enc->store_jpeg_metadata = static_cast(store_jpeg_metadata); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderSetParallelRunner(JxlEncoder* enc, + JxlParallelRunner parallel_runner, + void* parallel_runner_opaque) { + if (enc->thread_pool) return JXL_API_ERROR("parallel runner already set"); + enc->thread_pool = jxl::MemoryManagerMakeUnique( + &enc->memory_manager, parallel_runner, parallel_runner_opaque); + if (!enc->thread_pool) { + return JXL_ENC_ERROR; + } + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderAddJPEGFrame(const JxlEncoderOptions* options, + const uint8_t* buffer, size_t size) { + if (options->enc->input_closed) { + return JXL_ENC_ERROR; + } + + jxl::CodecInOut io; + if (!jxl::jpeg::DecodeImageJPG(jxl::Span(buffer, size), &io)) { + return JXL_ENC_ERROR; + } + + if (!options->enc->color_encoding_set) { + if (!SetColorEncodingFromJpegData( + *io.Main().jpeg_data, &options->enc->metadata.m.color_encoding)) { + return JXL_ENC_ERROR; + } + } + + if (!options->enc->basic_info_set) { + JxlBasicInfo basic_info; + basic_info.exponent_bits_per_sample = 0; + basic_info.bits_per_sample = 8; + basic_info.alpha_bits = 0; + basic_info.alpha_exponent_bits = 0; + basic_info.xsize = io.Main().jpeg_data->width; + basic_info.ysize = io.Main().jpeg_data->height; + basic_info.uses_original_profile = true; + if (JxlEncoderSetBasicInfo(options->enc, &basic_info) != JXL_ENC_SUCCESS) { + return JXL_ENC_ERROR; + } + } + + if (options->enc->metadata.m.xyb_encoded) { + // Can't XYB encode a lossless JPEG. 
+ return JXL_ENC_ERROR; + } + + if (options->enc->store_jpeg_metadata) { + jxl::jpeg::JPEGData data_in = *io.Main().jpeg_data; + jxl::PaddedBytes jpeg_data; + if (!EncodeJPEGData(data_in, &jpeg_data)) { + return JXL_ENC_ERROR; + } + options->enc->jpeg_metadata = std::vector( + jpeg_data.data(), jpeg_data.data() + jpeg_data.size()); + } + + auto queued_frame = jxl::MemoryManagerMakeUnique( + &options->enc->memory_manager, + // JxlEncoderQueuedFrame is a struct with no constructors, so we use the + // default move constructor there. + jxl::JxlEncoderQueuedFrame{options->values, + jxl::ImageBundle(&options->enc->metadata.m)}); + if (!queued_frame) { + return JXL_ENC_ERROR; + } + queued_frame->frame.SetFromImage(std::move(*io.Main().color()), + io.Main().c_current()); + queued_frame->frame.jpeg_data = std::move(io.Main().jpeg_data); + queued_frame->frame.color_transform = io.Main().color_transform; + queued_frame->frame.chroma_subsampling = io.Main().chroma_subsampling; + + if (options->values.lossless) { + queued_frame->option_values.cparams.SetLossless(); + } + + options->enc->input_frame_queue.emplace_back(std::move(queued_frame)); + return JXL_ENC_SUCCESS; +} + +JxlEncoderStatus JxlEncoderAddImageFrame(const JxlEncoderOptions* options, + const JxlPixelFormat* pixel_format, + const void* buffer, size_t size) { + if (!options->enc->basic_info_set || !options->enc->color_encoding_set) { + return JXL_ENC_ERROR; + } + + if (options->enc->input_closed) { + return JXL_ENC_ERROR; + } + + auto queued_frame = jxl::MemoryManagerMakeUnique( + &options->enc->memory_manager, + // JxlEncoderQueuedFrame is a struct with no constructors, so we use the + // default move constructor there. 
+ jxl::JxlEncoderQueuedFrame{options->values, + jxl::ImageBundle(&options->enc->metadata.m)}); + if (!queued_frame) { + return JXL_ENC_ERROR; + } + + if (pixel_format->data_type == JXL_TYPE_FLOAT16) { + // float16 is currently only supported in the decoder + return JXL_ENC_ERROR; + } + + jxl::ColorEncoding c_current; + if (options->enc->metadata.m.xyb_encoded) { + if (pixel_format->data_type == JXL_TYPE_FLOAT) { + c_current = + jxl::ColorEncoding::LinearSRGB(pixel_format->num_channels < 3); + } else { + c_current = jxl::ColorEncoding::SRGB(pixel_format->num_channels < 3); + } + } else { + c_current = options->enc->metadata.m.color_encoding; + } + + if (!jxl::BufferToImageBundle(*pixel_format, options->enc->metadata.xsize(), + options->enc->metadata.ysize(), buffer, size, + options->enc->thread_pool.get(), c_current, + &(queued_frame->frame))) { + return JXL_ENC_ERROR; + } + + if (options->values.lossless) { + queued_frame->option_values.cparams.SetLossless(); + } + + options->enc->input_frame_queue.emplace_back(std::move(queued_frame)); + return JXL_ENC_SUCCESS; +} + +void JxlEncoderCloseInput(JxlEncoder* enc) { enc->input_closed = true; } + +JxlEncoderStatus JxlEncoderProcessOutput(JxlEncoder* enc, uint8_t** next_out, + size_t* avail_out) { + while (*avail_out > 0 && + (!enc->output_byte_queue.empty() || !enc->input_frame_queue.empty())) { + if (!enc->output_byte_queue.empty()) { + size_t to_copy = std::min(*avail_out, enc->output_byte_queue.size()); + memcpy(static_cast(*next_out), enc->output_byte_queue.data(), + to_copy); + *next_out += to_copy; + *avail_out -= to_copy; + enc->output_byte_queue.erase(enc->output_byte_queue.begin(), + enc->output_byte_queue.begin() + to_copy); + } else if (!enc->input_frame_queue.empty()) { + if (enc->RefillOutputByteQueue() != JXL_ENC_SUCCESS) { + return JXL_ENC_ERROR; + } + } + } + + if (!enc->output_byte_queue.empty() || !enc->input_frame_queue.empty()) { + return JXL_ENC_NEED_MORE_OUTPUT; + } + return JXL_ENC_SUCCESS; +} + 
+JxlEncoderStatus JxlEncoderOptionsSetDecodingSpeed(JxlEncoderOptions* options, + int tier) { + if (tier < 0 || tier > 4) { + return JXL_ENC_ERROR; + } + options->values.cparams.decoding_speed_tier = tier; + return JXL_ENC_SUCCESS; +} + +void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding, + JXL_BOOL is_gray) { + ConvertInternalToExternalColorEncoding(jxl::ColorEncoding::SRGB(is_gray), + color_encoding); +} + +void JxlColorEncodingSetToLinearSRGB(JxlColorEncoding* color_encoding, + JXL_BOOL is_gray) { + ConvertInternalToExternalColorEncoding( + jxl::ColorEncoding::LinearSRGB(is_gray), color_encoding); +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_inspection.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_inspection.h new file mode 100644 index 0000000000..0b70a58523 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_inspection.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_JXL_INSPECTION_H_ +#define LIB_JXL_JXL_INSPECTION_H_ + +#include + +#include "lib/jxl/image.h" + +namespace jxl { +// Type of the inspection-callback which, if enabled, will be called on various +// intermediate data during image processing, allowing inspection access. +// +// Returns false if processing can be stopped at that point, true otherwise. +// This is only advisory - it is always OK to just continue processing. 
+using InspectorImage3F = std::function; +} // namespace jxl + +#endif // LIB_JXL_JXL_INSPECTION_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_osx.syms b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_osx.syms new file mode 100644 index 0000000000..96bc568025 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_osx.syms @@ -0,0 +1 @@ +_Jxl* diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_test.cc new file mode 100644 index 0000000000..139e7cffde --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/jxl_test.cc @@ -0,0 +1,1628 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/extras/codec_jpg.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/dec_file.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/jpeg/enc_jpeg_data.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" +#include "tools/box/box.h" + +namespace jxl { +namespace { +using test::Roundtrip; + +#define JXL_TEST_NL 0 // Disabled in code + +void CreateImage1x1(CodecInOut* io) { + 
Image3F image(1, 1); + ZeroFillImage(&image); + io->metadata.m.SetUintSamples(8); + io->metadata.m.color_encoding = ColorEncoding::SRGB(); + io->SetFromImage(std::move(image), io->metadata.m.color_encoding); +} + +TEST(JxlTest, HeaderSize) { + CodecInOut io; + CreateImage1x1(&io); + + CompressParams cparams; + cparams.butteraugli_distance = 1.5; + DecompressParams dparams; + ThreadPool* pool = nullptr; + + { + CodecInOut io2; + AuxOut aux_out; + Roundtrip(&io, cparams, dparams, pool, &io2, &aux_out); + EXPECT_LE(aux_out.layers[kLayerHeader].total_bits, 34); + } + + { + CodecInOut io2; + io.metadata.m.SetAlphaBits(8); + ImageF alpha(1, 1); + alpha.Row(0)[0] = 1; + io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false); + AuxOut aux_out; + Roundtrip(&io, cparams, dparams, pool, &io2, &aux_out); + EXPECT_LE(aux_out.layers[kLayerHeader].total_bits, 57); + } +} + +TEST(JxlTest, RoundtripSinglePixel) { + CodecInOut io; + CreateImage1x1(&io); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + ThreadPool* pool = nullptr; + CodecInOut io2; + Roundtrip(&io, cparams, dparams, pool, &io2); +} + +// Changing serialized signature causes Decode to fail. 
+#ifndef JXL_CRASH_ON_ERROR +TEST(JxlTest, RoundtripMarker) { + CodecInOut io; + CreateImage1x1(&io); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + AuxOut* aux_out = nullptr; + ThreadPool* pool = nullptr; + + PassesEncoderState enc_state; + for (size_t i = 0; i < 2; ++i) { + PaddedBytes compressed; + EXPECT_TRUE( + EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + compressed[i] ^= 0xFF; + CodecInOut io2; + EXPECT_FALSE(DecodeFile(dparams, compressed, &io2, pool)); + } +} +#endif + +TEST(JxlTest, RoundtripTinyFast) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(32, 32); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.butteraugli_distance = 4.0f; + DecompressParams dparams; + + CodecInOut io2; + const size_t enc_bytes = Roundtrip(&io, cparams, dparams, pool, &io2); + printf("32x32 image size %zu bytes\n", enc_bytes); +} + +TEST(JxlTest, RoundtripSmallD1) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + CodecInOut io_out; + size_t compressed_size; + + { + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + + compressed_size = Roundtrip(&io, cparams, dparams, pool, &io_out); + EXPECT_LE(compressed_size, 1000); + EXPECT_LE(ButteraugliDistance(io, io_out, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.5); + } + + { + // And then, with a lower intensity target than the default, the bitrate + // should be smaller. 
+ CodecInOut io_dim; + io_dim.target_nits = 100; + ASSERT_TRUE(SetFromBytes(Span(orig), &io_dim, pool)); + io_dim.ShrinkTo(io_dim.xsize() / 8, io_dim.ysize() / 8); + EXPECT_LT(Roundtrip(&io_dim, cparams, dparams, pool, &io_out), + compressed_size); + EXPECT_LE(ButteraugliDistance(io_dim, io_out, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.5); + EXPECT_EQ(io_dim.metadata.m.IntensityTarget(), + io_out.metadata.m.IntensityTarget()); + } +} + +TEST(JxlTest, RoundtripOtherTransforms) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/64px/a2d1un_nkitzmiller_srgb8.png"); + std::unique_ptr io = jxl::make_unique(); + ASSERT_TRUE(SetFromBytes(Span(orig), io.get(), pool)); + + CompressParams cparams; + // Slow modes access linear image for adaptive quant search + cparams.speed_tier = SpeedTier::kKitten; + cparams.color_transform = ColorTransform::kNone; + cparams.butteraugli_distance = 5.0f; + DecompressParams dparams; + + std::unique_ptr io2 = jxl::make_unique(); + const size_t compressed_size = + Roundtrip(io.get(), cparams, dparams, pool, io2.get()); + EXPECT_LE(compressed_size, 23000); + EXPECT_LE(ButteraugliDistance(*io, *io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 6); + + // Check the consistency when performing another roundtrip. 
+ std::unique_ptr io3 = jxl::make_unique(); + const size_t compressed_size2 = + Roundtrip(io.get(), cparams, dparams, pool, io3.get()); + EXPECT_LE(compressed_size2, 23000); + EXPECT_LE(ButteraugliDistance(*io, *io3, cparams.ba_params, + /*distmap=*/nullptr, pool), + 6); +} + +TEST(JxlTest, RoundtripResample2) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize(), io.ysize()); + CompressParams cparams; + cparams.resampling = 2; + DecompressParams dparams; + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 15777); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 12.5); +} +TEST(JxlTest, RoundtripResample2MT) { + ThreadPoolInternal pool(4); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + // image has to be large enough to have multiple groups after downsampling + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + CompressParams cparams; + cparams.resampling = 2; + DecompressParams dparams; + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 57000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool), +#if JXL_HIGH_PRECISION + 4.5); +#else + 12.5); +#endif +} + +TEST(JxlTest, RoundtripResample4) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize(), io.ysize()); + CompressParams cparams; + cparams.resampling = 4; + DecompressParams dparams; + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 6000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 28); +} + +TEST(JxlTest, RoundtripResample8) { + 
ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize(), io.ysize()); + CompressParams cparams; + cparams.resampling = 8; + DecompressParams dparams; + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 2100); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 80); +} + +TEST(JxlTest, RoundtripUnalignedD2) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 12, io.ysize() / 7); + + CompressParams cparams; + cparams.butteraugli_distance = 2.0; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 700); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 3.2); +} + +#if JXL_TEST_NL + +TEST(JxlTest, RoundtripMultiGroupNL) { + ThreadPoolInternal pool(4); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + io.ShrinkTo(600, 1024); // partial X, full Y group + + CompressParams cparams; + DecompressParams dparams; + + cparams.fast_mode = true; + cparams.butteraugli_distance = 1.0f; + CodecInOut io2; + Roundtrip(&io, cparams, dparams, &pool, &io2); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool), + 0.9f); + + cparams.butteraugli_distance = 2.0f; + CodecInOut io3; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io3), 80000); + EXPECT_LE(ButteraugliDistance(io, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool), + 1.5f); +} + +#endif + +TEST(JxlTest, RoundtripMultiGroup) { + ThreadPoolInternal pool(4); + const PaddedBytes orig = + 
ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + io.ShrinkTo(600, 1024); + + CompressParams cparams; + DecompressParams dparams; + + cparams.butteraugli_distance = 1.0f; + cparams.speed_tier = SpeedTier::kKitten; + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 40000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool), + 1.99f); + + cparams.butteraugli_distance = 2.0f; + CodecInOut io3; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io3), 22100); + EXPECT_LE(ButteraugliDistance(io, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool), + 3.0f); +} + +TEST(JxlTest, RoundtripLargeFast) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 265000); +} + +TEST(JxlTest, RoundtripDotsForceEpf) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/cvo9xd_keong_macan_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams; + cparams.epf = 2; + cparams.dots = Override::kOn; + cparams.speed_tier = SpeedTier::kSquirrel; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 265000); +} + +// Checks for differing size/distance in two consecutive runs of distance 2, +// which involves additional processing including adaptive reconstruction. +// Failing this may be a sign of race conditions or invalid memory accesses. 
+TEST(JxlTest, RoundtripD2Consistent) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.butteraugli_distance = 2.0; + DecompressParams dparams; + + // Try each xsize mod kBlockDim to verify right border handling. + for (size_t xsize = 48; xsize > 40; --xsize) { + io.ShrinkTo(xsize, 15); + + CodecInOut io2; + const size_t size2 = Roundtrip(&io, cparams, dparams, &pool, &io2); + + CodecInOut io3; + const size_t size3 = Roundtrip(&io, cparams, dparams, &pool, &io3); + + // Exact same compressed size. + EXPECT_EQ(size2, size3); + + // Exact same distance. + const float dist2 = ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool); + const float dist3 = ButteraugliDistance(io, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool); + EXPECT_EQ(dist2, dist3); + } +} + +// Same as above, but for full image, testing multiple groups. +TEST(JxlTest, RoundtripLargeConsistent) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.butteraugli_distance = 2.0; + DecompressParams dparams; + + // Try each xsize mod kBlockDim to verify right border handling. + CodecInOut io2; + const size_t size2 = Roundtrip(&io, cparams, dparams, &pool, &io2); + + CodecInOut io3; + const size_t size3 = Roundtrip(&io, cparams, dparams, &pool, &io3); + + // Exact same compressed size. + EXPECT_EQ(size2, size3); + + // Exact same distance. 
+ const float dist2 = ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool); + const float dist3 = ButteraugliDistance(io, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool); + EXPECT_EQ(dist2, dist3); +} + +#if JXL_TEST_NL + +TEST(JxlTest, RoundtripSmallNL) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 1500); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.7); +} + +#endif + +TEST(JxlTest, RoundtripNoGaborishNoAR) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + CompressParams cparams; + cparams.gaborish = Override::kOff; + cparams.epf = 0; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 40000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 2.5); +} + +TEST(JxlTest, RoundtripSmallNoGaborish) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + + CompressParams cparams; + cparams.gaborish = Override::kOff; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 900); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.7); +} + +TEST(JxlTest, 
RoundtripSmallPatchesAlpha) { + ThreadPool* pool = nullptr; + CodecInOut io; + io.metadata.m.color_encoding = ColorEncoding::LinearSRGB(); + Image3F black_with_small_lines(256, 256); + ImageF alpha(black_with_small_lines.xsize(), black_with_small_lines.ysize()); + ZeroFillImage(&black_with_small_lines); + // This pattern should be picked up by the patch detection heuristics. + for (size_t y = 0; y < black_with_small_lines.ysize(); y++) { + float* JXL_RESTRICT row = black_with_small_lines.PlaneRow(1, y); + for (size_t x = 0; x < black_with_small_lines.xsize(); x++) { + if (x % 4 == 0 && (y / 32) % 4 == 0) row[x] = 127.0f; + } + } + io.metadata.m.SetAlphaBits(8); + io.SetFromImage(std::move(black_with_small_lines), + ColorEncoding::LinearSRGB()); + FillImage(1.0f, &alpha); + io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.butteraugli_distance = 0.1f; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 2000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 0.5f); +} + +TEST(JxlTest, RoundtripSmallPatches) { + ThreadPool* pool = nullptr; + CodecInOut io; + io.metadata.m.color_encoding = ColorEncoding::LinearSRGB(); + Image3F black_with_small_lines(256, 256); + ZeroFillImage(&black_with_small_lines); + // This pattern should be picked up by the patch detection heuristics. 
+ for (size_t y = 0; y < black_with_small_lines.ysize(); y++) { + float* JXL_RESTRICT row = black_with_small_lines.PlaneRow(1, y); + for (size_t x = 0; x < black_with_small_lines.xsize(); x++) { + if (x % 4 == 0 && (y / 32) % 4 == 0) row[x] = 127.0f; + } + } + io.SetFromImage(std::move(black_with_small_lines), + ColorEncoding::LinearSRGB()); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.butteraugli_distance = 0.1f; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 2000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 0.5f); +} + +// Test header encoding of original bits per sample +TEST(JxlTest, RoundtripImageBundleOriginalBits) { + ThreadPool* pool = nullptr; + + // Image does not matter, only io.metadata.m and io2.metadata.m are tested. + Image3F image(1, 1); + ZeroFillImage(&image); + CodecInOut io; + io.metadata.m.color_encoding = ColorEncoding::LinearSRGB(); + io.SetFromImage(std::move(image), ColorEncoding::LinearSRGB()); + + CompressParams cparams; + DecompressParams dparams; + + // Test unsigned integers from 1 to 32 bits + for (uint32_t bit_depth = 1; bit_depth <= 32; bit_depth++) { + if (bit_depth == 32) { + // TODO(lode): allow testing 32, however the code below ends up in + // enc_modular which does not support 32. We only want to test the header + // encoding though, so try without modular. 
+ break; + } + + io.metadata.m.SetUintSamples(bit_depth); + CodecInOut io2; + Roundtrip(&io, cparams, dparams, pool, &io2); + + EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io2.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_EQ(0, io2.metadata.m.GetAlphaBits()); + } + + // Test various existing and non-existing floating point formats + for (uint32_t bit_depth = 8; bit_depth <= 32; bit_depth++) { + if (bit_depth != 32) { + // TODO: test other float types once they work + break; + } + + uint32_t exponent_bit_depth; + if (bit_depth < 10) { + exponent_bit_depth = 2; + } else if (bit_depth < 12) { + exponent_bit_depth = 3; + } else if (bit_depth < 16) { + exponent_bit_depth = 4; + } else if (bit_depth < 20) { + exponent_bit_depth = 5; + } else if (bit_depth < 24) { + exponent_bit_depth = 6; + } else if (bit_depth < 28) { + exponent_bit_depth = 7; + } else { + exponent_bit_depth = 8; + } + + io.metadata.m.bit_depth.bits_per_sample = bit_depth; + io.metadata.m.bit_depth.floating_point_sample = true; + io.metadata.m.bit_depth.exponent_bits_per_sample = exponent_bit_depth; + + CodecInOut io2; + Roundtrip(&io, cparams, dparams, pool, &io2); + + EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample); + EXPECT_TRUE(io2.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(exponent_bit_depth, + io2.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_EQ(0, io2.metadata.m.GetAlphaBits()); + } +} + +TEST(JxlTest, RoundtripGrayscale) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/cvo9xd_keong_macan_grayscale.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + ASSERT_NE(io.xsize(), 0); + io.ShrinkTo(128, 128); + EXPECT_TRUE(io.Main().IsGray()); + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, 
io.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB()); + + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + + { + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + PaddedBytes compressed; + EXPECT_TRUE( + EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + EXPECT_TRUE(io2.Main().IsGray()); + + EXPECT_LE(compressed.size(), 7000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.7777777); + } + + // Test with larger butteraugli distance and other settings enabled so + // different jxl codepaths trigger. + { + CompressParams cparams; + cparams.butteraugli_distance = 8.0; + DecompressParams dparams; + + PaddedBytes compressed; + EXPECT_TRUE( + EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + EXPECT_TRUE(io2.Main().IsGray()); + + EXPECT_LE(compressed.size(), 1300); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 9.0); + } +} + +TEST(JxlTest, RoundtripAlpha) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_alpha.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + ASSERT_NE(io.xsize(), 0); + ASSERT_TRUE(io.metadata.m.HasAlpha()); + ASSERT_TRUE(io.Main().HasAlpha()); + io.ShrinkTo(300, 300); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB()); + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes 
compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + + EXPECT_LE(compressed.size(), 10077); + + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.4); +} + +TEST(JxlTest, RoundtripAlphaPremultiplied) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_alpha.png"); + CodecInOut io, io_nopremul; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + ASSERT_TRUE(SetFromBytes(Span(orig), &io_nopremul, pool)); + + ASSERT_NE(io.xsize(), 0); + ASSERT_TRUE(io.metadata.m.HasAlpha()); + ASSERT_TRUE(io.Main().HasAlpha()); + io.ShrinkTo(300, 300); + io_nopremul.ShrinkTo(300, 300); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + io.PremultiplyAlpha(); + EXPECT_TRUE(io.Main().AlphaIsPremultiplied()); + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + + EXPECT_LE(compressed.size(), 10000); + + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.4); + io2.Main().UnpremultiplyAlpha(); + EXPECT_LE(ButteraugliDistance(io_nopremul, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.8); +} + +TEST(JxlTest, RoundtripAlphaResampling) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_alpha.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + ASSERT_NE(io.xsize(), 0); + ASSERT_TRUE(io.metadata.m.HasAlpha()); + ASSERT_TRUE(io.Main().HasAlpha()); + + CompressParams cparams; + cparams.resampling = 2; + cparams.ec_resampling = 2; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + 
PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + + EXPECT_LE(compressed.size(), 15000); + + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 6.0); +} + +TEST(JxlTest, RoundtripAlphaResamplingOnlyAlpha) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_alpha.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + ASSERT_NE(io.xsize(), 0); + ASSERT_TRUE(io.metadata.m.HasAlpha()); + ASSERT_TRUE(io.Main().HasAlpha()); + + CompressParams cparams; + cparams.ec_resampling = 2; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + + EXPECT_LE(compressed.size(), 31000); + + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.5); +} + +TEST(JxlTest, RoundtripAlphaNonMultipleOf8) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_alpha.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + ASSERT_NE(io.xsize(), 0); + ASSERT_TRUE(io.metadata.m.HasAlpha()); + ASSERT_TRUE(io.Main().HasAlpha()); + io.ShrinkTo(12, 12); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + DecompressParams dparams; + + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB()); + PassesEncoderState 
enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + + EXPECT_LE(compressed.size(), 200); + + // TODO(robryk): Fix the following line in presence of different alpha_bits in + // the two contexts. + // EXPECT_TRUE(SamePixels(io.Main().alpha(), io2.Main().alpha())); + // TODO(robryk): Fix the distance estimate used in the encoder. + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 6.3); +} + +TEST(JxlTest, RoundtripAlpha16) { + ThreadPoolInternal pool(4); + + size_t xsize = 1200, ysize = 160; + Image3F color(xsize, ysize); + ImageF alpha(xsize, ysize); + // Generate 16-bit pattern that uses various colors and alpha values. + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + color.PlaneRow(0, y)[x] = (y * 65535 / ysize) * (1.0f / 65535); + color.PlaneRow(1, y)[x] = (x * 65535 / xsize) * (1.0f / 65535); + color.PlaneRow(2, y)[x] = + ((y + x) * 65535 / (xsize + ysize)) * (1.0f / 65535); + alpha.Row(y)[x] = (x * 65535 / xsize) * (1.0f / 65535); + } + } + const bool is_gray = false; + CodecInOut io; + io.metadata.m.SetUintSamples(16); + io.metadata.m.SetAlphaBits(16); + io.metadata.m.color_encoding = ColorEncoding::SRGB(is_gray); + io.SetFromImage(std::move(color), io.metadata.m.color_encoding); + io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false); + + // The image is wider than 512 pixels to ensure multiple groups are tested. 
+ + ASSERT_NE(io.xsize(), 0); + ASSERT_TRUE(io.metadata.m.HasAlpha()); + ASSERT_TRUE(io.Main().HasAlpha()); + + CompressParams cparams; + cparams.butteraugli_distance = 0.5; + // Prevent the test to be too slow, does not affect alpha + cparams.speed_tier = SpeedTier::kSquirrel; + DecompressParams dparams; + + io.metadata.m.SetUintSamples(16); + EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB()); + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE( + EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, &pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, &pool)); + + EXPECT_TRUE(SamePixels(*io.Main().alpha(), *io2.Main().alpha())); +} + +namespace { +CompressParams CParamsForLossless() { + CompressParams cparams; + cparams.modular_mode = true; + cparams.color_transform = jxl::ColorTransform::kNone; + cparams.quality_pair = {100, 100}; + cparams.options.predictor = {Predictor::Weighted}; + return cparams; +} +} // namespace + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams = CParamsForLossless(); + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 3500000); + // If this test fails with a very close to 0.0 but not exactly 0.0 butteraugli + // distance, then there is likely a floating point issue, that could be + // happening either in io or io2. The values of io are generated by + // external_image.cc, and those in io2 by the jxl decoder. If they use + // slightly different floating point operations (say, one casts int to float + // while other divides the int through 255.0f and later multiplies it by + // 255 again) they will get slightly different values. 
To fix, ensure both + // sides do the following formula for converting integer range 0-255 to + // floating point range 0.0f-255.0f: static_cast(i) + // without any further intermediate operations. + // Note that this precision issue is not a problem in practice if the values + // are equal when rounded to 8-bit int, but currently full exact precision is + // tested. + EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLosslessNoEncoderFastPathWP)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams = CParamsForLossless(); + cparams.speed_tier = SpeedTier::kFalcon; + cparams.options.skip_encoder_fast_path = true; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 3500000); + EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLosslessNoEncoderFastPathGradient)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams = CParamsForLossless(); + cparams.speed_tier = SpeedTier::kThunder; + cparams.options.skip_encoder_fast_path = true; + cparams.options.predictor = {Predictor::Gradient}; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 3500000); + EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLosslessNoEncoderVeryFastPathGradient)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + 
CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams = CParamsForLossless(); + cparams.speed_tier = SpeedTier::kLightning; + cparams.options.skip_encoder_fast_path = true; + cparams.options.predictor = {Predictor::Gradient}; + DecompressParams dparams; + + CodecInOut io2, io3; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 3500000); + EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool)); + cparams.options.skip_encoder_fast_path = false; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io3), 3500000); + EXPECT_EQ(0.0, ButteraugliDistance(io, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8Falcon)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams = CParamsForLossless(); + cparams.speed_tier = SpeedTier::kFalcon; + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 3500000); + EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, RoundtripLossless8Alpha) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/tmshre_riaphotographs_alpha.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + EXPECT_EQ(8, io.metadata.m.GetAlphaBits()); + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + + CompressParams cparams = CParamsForLossless(); + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 350000); + // If fails, see note about floating point in RoundtripLossless8. 
+ EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool)); + EXPECT_TRUE(SamePixels(*io.Main().alpha(), *io2.Main().alpha())); + EXPECT_EQ(8, io2.metadata.m.GetAlphaBits()); + EXPECT_EQ(8, io2.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io2.metadata.m.bit_depth.exponent_bits_per_sample); +} + +TEST(JxlTest, RoundtripLossless16Alpha) { + ThreadPool* pool = nullptr; + + size_t xsize = 1200, ysize = 160; + Image3F color(xsize, ysize); + ImageF alpha(xsize, ysize); + // Generate 16-bit pattern that uses various colors and alpha values. + for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + color.PlaneRow(0, y)[x] = (y * 65535 / ysize) * (1.0f / 65535); + color.PlaneRow(1, y)[x] = (x * 65535 / xsize) * (1.0f / 65535); + color.PlaneRow(2, y)[x] = + ((y + x) * 65535 / (xsize + ysize)) * (1.0f / 65535); + alpha.Row(y)[x] = (x * 65535 / xsize) * (1.0f / 65535); + } + } + const bool is_gray = false; + CodecInOut io; + io.metadata.m.SetUintSamples(16); + io.metadata.m.SetAlphaBits(16); + io.metadata.m.color_encoding = ColorEncoding::SRGB(is_gray); + io.SetFromImage(std::move(color), io.metadata.m.color_encoding); + io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false); + + EXPECT_EQ(16, io.metadata.m.GetAlphaBits()); + EXPECT_EQ(16, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + + CompressParams cparams = CParamsForLossless(); + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 7100); + // If this test fails with a very close to 0.0 but not exactly 0.0 butteraugli + // distance, then there is likely a floating point issue, that could be + // happening either in io or io2. 
The values of io are generated by + // external_image.cc, and those in io2 by the jxl decoder. If they use + // slightly different floating point operations (say, one does "i / 257.0f" + // while the other does "i * (1.0f / 257)" they will get slightly different + // values. To fix, ensure both sides do the following formula for converting + // integer range 0-65535 to Image3F floating point range 0.0f-255.0f: + // "i * (1.0f / 257)". + // Note that this precision issue is not a problem in practice if the values + // are equal when rounded to 16-bit int, but currently full exact precision is + // tested. + EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool)); + EXPECT_TRUE(SamePixels(*io.Main().alpha(), *io2.Main().alpha())); + EXPECT_EQ(16, io2.metadata.m.GetAlphaBits()); + EXPECT_EQ(16, io2.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io2.metadata.m.bit_depth.exponent_bits_per_sample); +} + +TEST(JxlTest, RoundtripLossless16AlphaNotMisdetectedAs8Bit) { + ThreadPool* pool = nullptr; + + size_t xsize = 128, ysize = 128; + Image3F color(xsize, ysize); + ImageF alpha(xsize, ysize); + // All 16-bit values, both color and alpha, of this image are below 64. + // This allows testing if a code path wrongly concludes it's an 8-bit instead + // of 16-bit image (or even 6-bit). 
+ for (size_t y = 0; y < ysize; y++) { + for (size_t x = 0; x < xsize; x++) { + color.PlaneRow(0, y)[x] = (y * 64 / ysize) * (1.0f / 65535); + color.PlaneRow(1, y)[x] = (x * 64 / xsize) * (1.0f / 65535); + color.PlaneRow(2, y)[x] = + ((y + x) * 64 / (xsize + ysize)) * (1.0f / 65535); + alpha.Row(y)[x] = (64 * x / xsize) * (1.0f / 65535); + } + } + const bool is_gray = false; + CodecInOut io; + io.metadata.m.SetUintSamples(16); + io.metadata.m.SetAlphaBits(16); + io.metadata.m.color_encoding = ColorEncoding::SRGB(is_gray); + io.SetFromImage(std::move(color), io.metadata.m.color_encoding); + io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false); + + EXPECT_EQ(16, io.metadata.m.GetAlphaBits()); + EXPECT_EQ(16, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + + CompressParams cparams = CParamsForLossless(); + DecompressParams dparams; + + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 3100); + EXPECT_EQ(16, io2.metadata.m.GetAlphaBits()); + EXPECT_EQ(16, io2.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io2.metadata.m.bit_depth.exponent_bits_per_sample); + // If fails, see note about floating point in RoundtripLossless8. 
+ EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool)); + EXPECT_TRUE(SamePixels(*io.Main().alpha(), *io2.Main().alpha())); +} + +TEST(JxlTest, RoundtripYCbCr420) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + const PaddedBytes yuv420 = + ReadTestData("imagecompression.info/flower_foveon.png.ffmpeg.y4m"); + CodecInOut io2; + ASSERT_TRUE(SetFromBytes(Span(yuv420), &io2, pool)); + + CompressParams cparams = CParamsForLossless(); + cparams.speed_tier = SpeedTier::kThunder; + DecompressParams dparams; + + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE( + EncodeFile(cparams, &io2, &enc_state, &compressed, aux_out, pool)); + CodecInOut io3; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io3, pool)); + + EXPECT_LE(compressed.size(), 1320000); + + // we're comparing an original PNG with a YCbCr 4:2:0 version + EXPECT_LE(ButteraugliDistance(io, io3, cparams.ba_params, + /*distmap=*/nullptr, pool), + 2.5); +} + +TEST(JxlTest, RoundtripDots) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/cvo9xd_keong_macan_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + ASSERT_NE(io.xsize(), 0); + + CompressParams cparams; + cparams.dots = Override::kOn; + cparams.butteraugli_distance = 0.04; + cparams.speed_tier = SpeedTier::kSquirrel; + DecompressParams dparams; + + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB()); + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut 
io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + + EXPECT_LE(compressed.size(), 400000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 2.2); +} + +TEST(JxlTest, RoundtripNoise) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/cvo9xd_keong_macan_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + ASSERT_NE(io.xsize(), 0); + + CompressParams cparams; + cparams.noise = Override::kOn; + cparams.speed_tier = SpeedTier::kSquirrel; + DecompressParams dparams; + + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB()); + PassesEncoderState enc_state; + AuxOut* aux_out = nullptr; + PaddedBytes compressed; + EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, aux_out, pool)); + CodecInOut io2; + EXPECT_TRUE(DecodeFile(dparams, compressed, &io2, pool)); + + EXPECT_LE(compressed.size(), 40000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 2.2); +} + +TEST(JxlTest, RoundtripLossless8Gray) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/cvo9xd_keong_macan_grayscale.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + CompressParams cparams = CParamsForLossless(); + DecompressParams dparams; + + EXPECT_TRUE(io.Main().IsGray()); + EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample); + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 130000); + // If fails, see note about floating point in RoundtripLossless8. 
+ EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool)); + EXPECT_TRUE(io2.Main().IsGray()); + EXPECT_EQ(8, io2.metadata.m.bit_depth.bits_per_sample); + EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample); + EXPECT_EQ(0, io2.metadata.m.bit_depth.exponent_bits_per_sample); +} + +#if JPEGXL_ENABLE_GIF + +TEST(JxlTest, RoundtripAnimation) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = ReadTestData("jxl/traffic_light.gif"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + ASSERT_EQ(4, io.frames.size()); + + CompressParams cparams; + DecompressParams dparams; + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 3000); + + EXPECT_EQ(io2.frames.size(), io.frames.size()); + test::CoalesceGIFAnimationWithAlpha(&io); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), +#if JXL_HIGH_PRECISION + 1.55); +#else + 1.75); +#endif +} + +TEST(JxlTest, RoundtripLosslessAnimation) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = ReadTestData("jxl/traffic_light.gif"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + ASSERT_EQ(4, io.frames.size()); + + CompressParams cparams = CParamsForLossless(); + DecompressParams dparams; + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 1200); + + EXPECT_EQ(io2.frames.size(), io.frames.size()); + test::CoalesceGIFAnimationWithAlpha(&io); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 5e-4); +} + +#endif // JPEGXL_ENABLE_GIF + +#if JPEGXL_ENABLE_JPEG + +namespace { + +jxl::Status DecompressJxlToJPEGForTest( + const jpegxl::tools::JpegXlContainer& container, jxl::ThreadPool* pool, + jxl::PaddedBytes* output) { + output->clear(); + jxl::Span compressed(container.codestream, + container.codestream_size); + + JXL_RETURN_IF_ERROR(compressed.size() >= 2); + + // JXL case + // Decode to DCT when possible and 
generate a JPG file. + jxl::CodecInOut io; + jxl::DecompressParams params; + params.keep_dct = true; + if (!jpegxl::tools::DecodeJpegXlToJpeg(params, container, &io, pool)) { + return JXL_FAILURE("Failed to decode JXL to JPEG"); + } + io.jpeg_quality = 95; + if (!EncodeImageJPG(&io, jxl::JpegEncoder::kLibJpeg, io.jpeg_quality, + jxl::YCbCrChromaSubsampling(), pool, output, + jxl::DecodeTarget::kQuantizedCoeffs)) { + return JXL_FAILURE("Failed to generate JPEG"); + } + return true; +} + +} // namespace + +size_t RoundtripJpeg(const PaddedBytes& jpeg_in, ThreadPool* pool) { + CodecInOut io; + io.dec_target = jxl::DecodeTarget::kQuantizedCoeffs; + EXPECT_TRUE(SetFromBytes(Span(jpeg_in), &io, pool)); + CompressParams cparams; + cparams.color_transform = jxl::ColorTransform::kYCbCr; + + PassesEncoderState passes_enc_state; + PaddedBytes compressed, codestream; + + EXPECT_TRUE(EncodeFile(cparams, &io, &passes_enc_state, &codestream, + /*aux_out=*/nullptr, pool)); + jpegxl::tools::JpegXlContainer enc_container; + enc_container.codestream = codestream.data(); + enc_container.codestream_size = codestream.size(); + jpeg::JPEGData data_in = *io.Main().jpeg_data; + jxl::PaddedBytes jpeg_data; + EXPECT_TRUE(EncodeJPEGData(data_in, &jpeg_data)); + enc_container.jpeg_reconstruction = jpeg_data.data(); + enc_container.jpeg_reconstruction_size = jpeg_data.size(); + EXPECT_TRUE(EncodeJpegXlContainerOneShot(enc_container, &compressed)); + + jpegxl::tools::JpegXlContainer container; + EXPECT_TRUE(DecodeJpegXlContainerOneShot(compressed.data(), compressed.size(), + &container)); + PaddedBytes out; + EXPECT_TRUE(DecompressJxlToJPEGForTest(container, pool, &out)); + EXPECT_EQ(out.size(), jpeg_in.size()); + size_t failures = 0; + for (size_t i = 0; i < std::min(out.size(), jpeg_in.size()); i++) { + if (out[i] != jpeg_in[i]) { + EXPECT_EQ(out[i], jpeg_in[i]) + << "byte mismatch " << i << " " << out[i] << " != " << jpeg_in[i]; + if (++failures > 4) { + return compressed.size(); + } + } + } 
+ return compressed.size(); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png.im_q85_444.jpg"); + // JPEG size is 326'916 bytes. + EXPECT_LE(RoundtripJpeg(orig, &pool), 256000); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png.im_q85_444.jpg"); + CodecInOut io; + io.dec_target = jxl::DecodeTarget::kQuantizedCoeffs; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CodecInOut io2; + ASSERT_TRUE(SetFromBytes(Span(orig), &io2, &pool)); + + CompressParams cparams; + cparams.color_transform = jxl::ColorTransform::kYCbCr; + + DecompressParams dparams; + + CodecInOut io3; + Roundtrip(&io, cparams, dparams, &pool, &io3); + + // TODO(eustas): investigate, why SJPEG and JpegRecompression pixels are + // different. 
+ EXPECT_GE(1.8, ButteraugliDistance(io2, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png.im_q85_420.jpg"); + CodecInOut io; + io.dec_target = jxl::DecodeTarget::kQuantizedCoeffs; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CodecInOut io2; + ASSERT_TRUE(SetFromBytes(Span(orig), &io2, &pool)); + + CompressParams cparams; + cparams.color_transform = jxl::ColorTransform::kYCbCr; + + DecompressParams dparams; + + CodecInOut io3; + Roundtrip(&io, cparams, dparams, &pool, &io3); + + EXPECT_GE(1.5, ButteraugliDistance(io2, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, + JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420Mul16)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon_cropped.jpg"); + CodecInOut io; + io.dec_target = jxl::DecodeTarget::kQuantizedCoeffs; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CodecInOut io2; + ASSERT_TRUE(SetFromBytes(Span(orig), &io2, &pool)); + + CompressParams cparams; + cparams.color_transform = jxl::ColorTransform::kYCbCr; + + DecompressParams dparams; + + CodecInOut io3; + Roundtrip(&io, cparams, dparams, &pool, &io3); + + EXPECT_GE(1.5, ButteraugliDistance(io2, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, + JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels_asymmetric)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = ReadTestData( + "imagecompression.info/flower_foveon.png.im_q85_asymmetric.jpg"); + CodecInOut io; + io.dec_target = jxl::DecodeTarget::kQuantizedCoeffs; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CodecInOut io2; + ASSERT_TRUE(SetFromBytes(Span(orig), &io2, &pool)); + + CompressParams cparams; + cparams.color_transform = 
jxl::ColorTransform::kYCbCr; + + DecompressParams dparams; + + CodecInOut io3; + Roundtrip(&io, cparams, dparams, &pool, &io3); + + EXPECT_GE(1.5, ButteraugliDistance(io2, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool)); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionGray)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png.im_q85_gray.jpg"); + // JPEG size is 167'025 bytes. + EXPECT_LE(RoundtripJpeg(orig, &pool), 140000); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png.im_q85_420.jpg"); + // JPEG size is 226'018 bytes. + EXPECT_LE(RoundtripJpeg(orig, &pool), 181050); +} + +TEST(JxlTest, + JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_luma_subsample)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = ReadTestData( + "imagecompression.info/flower_foveon.png.im_q85_luma_subsample.jpg"); + // JPEG size is 216'069 bytes. + EXPECT_LE(RoundtripJpeg(orig, &pool), 181000); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444_12)) { + // 444 JPEG that has an interesting sampling-factor (1x2, 1x2, 1x2). + ThreadPoolInternal pool(8); + const PaddedBytes orig = ReadTestData( + "imagecompression.info/flower_foveon.png.im_q85_444_1x2.jpg"); + // JPEG size is 329'942 bytes. + EXPECT_LE(RoundtripJpeg(orig, &pool), 256000); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression422)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png.im_q85_422.jpg"); + // JPEG size is 265'590 bytes. 
+ EXPECT_LE(RoundtripJpeg(orig, &pool), 209000); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression440)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png.im_q85_440.jpg"); + // JPEG size is 262'249 bytes. + EXPECT_LE(RoundtripJpeg(orig, &pool), 209000); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_asymmetric)) { + // 2x vertical downsample of one chroma channel, 2x horizontal downsample of + // the other. + ThreadPoolInternal pool(8); + const PaddedBytes orig = ReadTestData( + "imagecompression.info/flower_foveon.png.im_q85_asymmetric.jpg"); + // JPEG size is 262'249 bytes. + EXPECT_LE(RoundtripJpeg(orig, &pool), 209000); +} + +TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420Progr)) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = ReadTestData( + "imagecompression.info/flower_foveon.png.im_q85_420_progr.jpg"); + EXPECT_LE(RoundtripJpeg(orig, &pool), 181000); +} + +#endif // JPEGXL_ENABLE_JPEG + +TEST(JxlTest, RoundtripProgressive) { + ThreadPoolInternal pool(4); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + io.ShrinkTo(600, 1024); + + CompressParams cparams; + DecompressParams dparams; + + cparams.butteraugli_distance = 1.0f; + cparams.progressive_dc = true; + cparams.responsive = true; + cparams.progressive_mode = true; + CodecInOut io2; + EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 40000); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool), + 4.0f); +} + +TEST(JxlTest, RoundtripAnimationPatches) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = ReadTestData("jxl/animation_patches.gif"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + ASSERT_EQ(2, io.frames.size()); + + CompressParams cparams; + cparams.patches = Override::kOn; + 
DecompressParams dparams; + CodecInOut io2; + // 40k with no patches, 27k with patch frames encoded multiple times. + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 24000); + + EXPECT_EQ(io2.frames.size(), io.frames.size()); + // >10 with broken patches + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 2.0); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/lehmer_code.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/lehmer_code.h new file mode 100644 index 0000000000..dd1d21c6f7 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/lehmer_code.h @@ -0,0 +1,102 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_LEHMER_CODE_H_ +#define LIB_JXL_LEHMER_CODE_H_ + +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Permutation <=> factorial base representation (Lehmer code). + +using LehmerT = uint32_t; + +template +constexpr T ValueOfLowest1Bit(T t) { + return t & -t; +} + +// Computes the Lehmer (factorial basis) code of permutation, an array of n +// unique indices in [0..n), and stores it in code[0..n). N*logN time. +// temp must have n + 1 elements but need not be initialized.
+template +void ComputeLehmerCode(const PermutationT* JXL_RESTRICT permutation, + uint32_t* JXL_RESTRICT temp, const size_t n, + LehmerT* JXL_RESTRICT code) { + for (size_t idx = 0; idx < n + 1; ++idx) temp[idx] = 0; + + for (size_t idx = 0; idx < n; ++idx) { + const PermutationT s = permutation[idx]; + + // Compute sum in Fenwick tree + uint32_t penalty = 0; + uint32_t i = s + 1; + while (i != 0) { + penalty += temp[i]; + i &= i - 1; // clear lowest bit + } + JXL_DASSERT(s >= penalty); + code[idx] = s - penalty; + i = s + 1; + // Add operation in Fenwick tree + while (i < n + 1) { + temp[i] += 1; + i += ValueOfLowest1Bit(i); + } + } +} + +// Decodes the Lehmer code in code[0..n) into permutation[0..n). +// temp must have 1 << CeilLog2(n) elements but need not be initialized. +template +void DecodeLehmerCode(const LehmerT* JXL_RESTRICT code, + uint32_t* JXL_RESTRICT temp, size_t n, + PermutationT* JXL_RESTRICT permutation) { + JXL_DASSERT(n != 0); + const size_t log2n = CeilLog2Nonzero(n); + const size_t padded_n = 1ull << log2n; + + for (size_t i = 0; i < padded_n; i++) { + const int32_t i1 = static_cast(i + 1); + temp[i] = static_cast(ValueOfLowest1Bit(i1)); + } + + for (size_t i = 0; i < n; i++) { + JXL_DASSERT(code[i] + i < n); + uint32_t rank = code[i] + 1; + + // Extract i-th unused element via implicit order-statistics tree. 
+ size_t bit = padded_n; + size_t next = 0; + for (size_t i = 0; i <= log2n; i++) { + const size_t cand = next + bit; + JXL_DASSERT(cand >= 1); + bit >>= 1; + if (temp[cand - 1] < rank) { + next = cand; + rank -= temp[cand - 1]; + } + } + + permutation[i] = next; + + // Mark as used + next += 1; + while (next <= padded_n) { + temp[next - 1] -= 1; + next += ValueOfLowest1Bit(next); + } + } +} + +} // namespace jxl + +#endif // LIB_JXL_LEHMER_CODE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/lehmer_code_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/lehmer_code_test.cc new file mode 100644 index 0000000000..1ce5618ea1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/lehmer_code_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/lehmer_code.h" + +#include +#include + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/thread_pool_internal.h" + +namespace jxl { +namespace { + +template +struct WorkingSet { + explicit WorkingSet(size_t max_n) + : padded_n(1ull << CeilLog2Nonzero(max_n + 1)), + permutation(max_n), + temp(padded_n), + lehmer(max_n), + decoded(max_n) {} + + size_t padded_n; + std::vector permutation; + std::vector temp; + std::vector lehmer; + std::vector decoded; +}; + +template +void Roundtrip(size_t n, WorkingSet* ws) { + JXL_ASSERT(n != 0); + const size_t padded_n = 1ull << CeilLog2Nonzero(n); + + std::mt19937 rng(n * 65537 + 13); + + // Ensure indices fit into PermutationT + EXPECT_LE(n, 1ULL << (sizeof(PermutationT) * 8)); + + std::iota(ws->permutation.begin(), ws->permutation.begin() + n, 0); + + // For various random permutations: + for (size_t rep = 0; rep < 100; ++rep) { + std::shuffle(ws->permutation.begin(), 
ws->permutation.begin() + n, rng); + + // Must decode to the same permutation + ComputeLehmerCode(ws->permutation.data(), ws->temp.data(), n, + ws->lehmer.data()); + memset(ws->temp.data(), 0, padded_n * 4); + DecodeLehmerCode(ws->lehmer.data(), ws->temp.data(), n, ws->decoded.data()); + + for (size_t i = 0; i < n; ++i) { + EXPECT_EQ(ws->permutation[i], ws->decoded[i]); + } + } +} + +// Preallocates arrays and tests n = [begin, end). +template +void RoundtripSizeRange(ThreadPool* pool, uint32_t begin, uint32_t end) { + ASSERT_NE(0, begin); // n = 0 not allowed. + std::vector> working_sets; + + RunOnPool( + pool, begin, end, + [&working_sets, end](size_t num_threads) { + for (size_t i = 0; i < num_threads; i++) { + working_sets.emplace_back(end - 1); + } + return true; + }, + [&working_sets](int n, int thread) { + Roundtrip(n, &working_sets[thread]); + }, + "lehmer test"); +} + +TEST(LehmerCodeTest, TestRoundtrips) { + ThreadPoolInternal pool(8); + + RoundtripSizeRange(&pool, 1, 1026); + + // Ensures PermutationT can fit > 16 bit values. 
+ RoundtripSizeRange(&pool, 65536, 65540); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/libjxl.pc.in b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/libjxl.pc.in new file mode 100644 index 0000000000..5dca2ac168 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/libjxl.pc.in @@ -0,0 +1,12 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ +includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ + +Name: libjxl +Description: Loads and saves JPEG XL files +Version: @JPEGXL_LIBRARY_VERSION@ +Requires.private: @JPEGXL_LIBRARY_REQUIRES@ +Libs: -L${libdir} -ljxl +Libs.private: -lm +Cflags: -I${includedir} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc new file mode 100644 index 0000000000..61d66dd8db --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.cc @@ -0,0 +1,235 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/linalg.h" + +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +void AssertSymmetric(const ImageD& A) { +#if JXL_ENABLE_ASSERT + JXL_ASSERT(A.xsize() == A.ysize()); + for (size_t i = 0; i < A.xsize(); ++i) { + for (size_t j = i + 1; j < A.xsize(); ++j) { + JXL_ASSERT(std::abs(A.Row(i)[j] - A.Row(j)[i]) < 1e-15); + } + } +#endif +} + +void Diagonalize2x2(const double a0, const double a1, const double b, double* c, + double* s) { + if (std::abs(b) < 1e-15) { + *c = 1.0; + *s = 0.0; + return; + } + double phi = std::atan2(2 * b, a1 - a0); + double theta = b > 0.0 ? 
0.5 * phi : 0.5 * phi + Pi(1.0); + *c = std::cos(theta); + *s = std::sin(theta); +} + +void GivensRotation(const double x, const double y, double* c, double* s) { + if (y == 0.0) { + *c = x < 0.0 ? -1.0 : 1.0; + *s = 0.0; + } else { + const double h = hypot(x, y); + const double d = 1.0 / h; + *c = x * d; + *s = -y * d; + } +} + +void RotateMatrixCols(ImageD* const JXL_RESTRICT U, int i, int j, double c, + double s) { + JXL_ASSERT(U->xsize() == U->ysize()); + const size_t N = U->xsize(); + double* const JXL_RESTRICT u_i = U->Row(i); + double* const JXL_RESTRICT u_j = U->Row(j); + std::vector rot_i, rot_j; + rot_i.reserve(N); + rot_j.reserve(N); + for (size_t k = 0; k < N; ++k) { + rot_i.push_back(u_i[k] * c - u_j[k] * s); + rot_j.push_back(u_i[k] * s + u_j[k] * c); + } + for (size_t k = 0; k < N; ++k) { + u_i[k] = rot_i[k]; + u_j[k] = rot_j[k]; + } +} +void HouseholderReflector(const size_t N, const double* x, double* u) { + const double sigma = x[0] <= 0.0 ? 1.0 : -1.0; + u[0] = x[0] - sigma * std::sqrt(DotProduct(N, x, x)); + for (size_t k = 1; k < N; ++k) { + u[k] = x[k]; + } + double u_norm = 1.0 / std::sqrt(DotProduct(N, u, u)); + for (size_t k = 0; k < N; ++k) { + u[k] *= u_norm; + } +} + +void ConvertToTridiagonal(const ImageD& A, ImageD* const JXL_RESTRICT T, + ImageD* const JXL_RESTRICT U) { + AssertSymmetric(A); + const size_t N = A.xsize(); + *U = Identity(A.xsize()); + *T = CopyImage(A); + std::vector u_stack; + for (size_t k = 0; k + 2 < N; ++k) { + if (DotProduct(N - k - 2, &T->Row(k)[k + 2], &T->Row(k)[k + 2]) > 1e-15) { + ImageD u(N, 1); + ZeroFillImage(&u); + HouseholderReflector(N - k - 1, &T->Row(k)[k + 1], &u.Row(0)[k + 1]); + ImageD v = MatMul(*T, u); + double scale = DotProduct(u, v); + v = LinComb(2.0, v, -2.0 * scale, u); + SubtractFrom(MatMul(u, Transpose(v)), T); + SubtractFrom(MatMul(v, Transpose(u)), T); + u_stack.emplace_back(std::move(u)); + } + } + while (!u_stack.empty()) { + const ImageD& u = u_stack.back(); + ImageD v = 
MatMul(Transpose(*U), u); + SubtractFrom(ScaleImage(2.0, MatMul(u, Transpose(v))), U); + u_stack.pop_back(); + } +} + +double WilkinsonShift(const double a0, const double a1, const double b) { + const double d = 0.5 * (a0 - a1); + if (d == 0.0) { + return a1 - std::abs(b); + } + const double sign_d = d > 0.0 ? 1.0 : -1.0; + return a1 - b * b / (d + sign_d * std::hypot(d, b)); +} + +void ImplicitQRStep(ImageD* const JXL_RESTRICT U, double* const JXL_RESTRICT a, + double* const JXL_RESTRICT b, int m0, int m1) { + JXL_ASSERT(m1 - m0 > 2); + double x = a[m0] - WilkinsonShift(a[m1 - 2], a[m1 - 1], b[m1 - 1]); + double y = b[m0 + 1]; + for (int k = m0; k < m1 - 1; ++k) { + double c, s; + GivensRotation(x, y, &c, &s); + const double w = c * x - s * y; + const double d = a[k] - a[k + 1]; + const double z = (2 * c * b[k + 1] + d * s) * s; + a[k] -= z; + a[k + 1] += z; + b[k + 1] = d * c * s + (c * c - s * s) * b[k + 1]; + x = b[k + 1]; + if (k > m0) { + b[k] = w; + } + if (k < m1 - 2) { + y = -s * b[k + 2]; + b[k + 2] *= c; + } + RotateMatrixCols(U, k, k + 1, c, s); + } +} + +void ScanInterval(const double* const JXL_RESTRICT a, + const double* const JXL_RESTRICT b, int istart, + const int iend, const double eps, + std::deque >* intervals) { + for (int k = istart; k < iend; ++k) { + if ((k + 1 == iend) || + std::abs(b[k + 1]) < eps * (std::abs(a[k]) + std::abs(a[k + 1]))) { + if (k > istart) { + intervals->push_back(std::make_pair(istart, k + 1)); + } + istart = k + 1; + } + } +} + +void ConvertToDiagonal(const ImageD& A, ImageD* const JXL_RESTRICT diag, + ImageD* const JXL_RESTRICT U) { + AssertSymmetric(A); + const size_t N = A.xsize(); + ImageD T; + ConvertToTridiagonal(A, &T, U); + // From now on, the algorithm keeps the transformed matrix tri-diagonal, + // so we only need to keep track of the diagonal and the off-diagonal entries.
+ std::vector a(N); + std::vector b(N); + for (size_t k = 0; k < N; ++k) { + a[k] = T.Row(k)[k]; + if (k > 0) b[k] = T.Row(k)[k - 1]; + } + // Run the symmetric tri-diagonal QR algorithm with implicit Wilkinson shift. + const double kEpsilon = 1e-14; + std::deque > intervals; + ScanInterval(&a[0], &b[0], 0, N, kEpsilon, &intervals); + while (!intervals.empty()) { + const int istart = intervals[0].first; + const int iend = intervals[0].second; + intervals.pop_front(); + if (iend == istart + 2) { + double& a0 = a[istart]; + double& a1 = a[istart + 1]; + double& b1 = b[istart + 1]; + double c, s; + Diagonalize2x2(a0, a1, b1, &c, &s); + const double d = a0 - a1; + const double z = (2 * c * b1 + d * s) * s; + a0 -= z; + a1 += z; + b1 = 0.0; + RotateMatrixCols(U, istart, istart + 1, c, s); + } else { + ImplicitQRStep(U, &a[0], &b[0], istart, iend); + ScanInterval(&a[0], &b[0], istart, iend, kEpsilon, &intervals); + } + } + *diag = ImageD(N, 1); + double* const JXL_RESTRICT diag_row = diag->Row(0); + for (size_t k = 0; k < N; ++k) { + diag_row[k] = a[k]; + } +} + +void ComputeQRFactorization(const ImageD& A, ImageD* const JXL_RESTRICT Q, + ImageD* const JXL_RESTRICT R) { + JXL_ASSERT(A.xsize() == A.ysize()); + const size_t N = A.xsize(); + *Q = Identity(N); + *R = CopyImage(A); + std::vector u_stack; + for (size_t k = 0; k + 1 < N; ++k) { + if (DotProduct(N - k - 1, &R->Row(k)[k + 1], &R->Row(k)[k + 1]) > 1e-15) { + ImageD u(N, 1); + FillImage(0.0, &u); + HouseholderReflector(N - k, &R->Row(k)[k], &u.Row(0)[k]); + ImageD v = MatMul(Transpose(u), *R); + SubtractFrom(ScaleImage(2.0, MatMul(u, v)), R); + u_stack.emplace_back(std::move(u)); + } + } + while (!u_stack.empty()) { + const ImageD& u = u_stack.back(); + ImageD v = MatMul(Transpose(u), *Q); + SubtractFrom(ScaleImage(2.0, MatMul(u, v)), Q); + u_stack.pop_back(); + } +} +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.h new file 
mode 100644 index 0000000000..7fbd943d90 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg.h @@ -0,0 +1,294 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_LINALG_H_ +#define LIB_JXL_LINALG_H_ + +// Linear algebra. + +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +using ImageD = Plane; + +template +inline T DotProduct(const size_t N, const T* const JXL_RESTRICT a, + const T* const JXL_RESTRICT b) { + T sum = 0.0; + for (size_t k = 0; k < N; ++k) { + sum += a[k] * b[k]; + } + return sum; +} + +template +inline T L2NormSquared(const size_t N, const T* const JXL_RESTRICT a) { + return DotProduct(N, a, a); +} + +template +inline T L1Norm(const size_t N, const T* const JXL_RESTRICT a) { + T sum = 0; + for (size_t k = 0; k < N; ++k) { + sum += a[k] >= 0 ? 
a[k] : -a[k]; + } + return sum; +} + +inline double DotProduct(const ImageD& a, const ImageD& b) { + JXL_ASSERT(a.ysize() == 1); + JXL_ASSERT(b.ysize() == 1); + JXL_ASSERT(a.xsize() == b.xsize()); + const double* const JXL_RESTRICT row_a = a.Row(0); + const double* const JXL_RESTRICT row_b = b.Row(0); + return DotProduct(a.xsize(), row_a, row_b); +} + +inline ImageD Transpose(const ImageD& A) { + ImageD out(A.ysize(), A.xsize()); + for (size_t x = 0; x < A.xsize(); ++x) { + double* const JXL_RESTRICT row_out = out.Row(x); + for (size_t y = 0; y < A.ysize(); ++y) { + row_out[y] = A.Row(y)[x]; + } + } + return out; +} + +template +Plane MatMul(const Plane& A, const Plane& B) { + JXL_ASSERT(A.ysize() == B.xsize()); + Plane out(A.xsize(), B.ysize()); + for (size_t y = 0; y < B.ysize(); ++y) { + const Tin2* const JXL_RESTRICT row_b = B.Row(y); + Tout* const JXL_RESTRICT row_out = out.Row(y); + for (size_t x = 0; x < A.xsize(); ++x) { + row_out[x] = 0.0; + for (size_t k = 0; k < B.xsize(); ++k) { + row_out[x] += A.Row(k)[x] * row_b[k]; + } + } + } + return out; +} + +template +ImageD MatMul(const Plane& A, const Plane& B) { + return MatMul(A, B); +} + +template +ImageI MatMulI(const Plane& A, const Plane& B) { + return MatMul(A, B); +} + +// Computes C = A * B, with sizes rows*cols: A=ha*wa, B=wa*wb, C=ha*wb +template +void MatMul(const T* a, const T* b, int ha, int wa, int wb, T* c) { + std::vector temp(wa); // Make better use of cache lines + for (int x = 0; x < wb; x++) { + for (int z = 0; z < wa; z++) { + temp[z] = b[z * wb + x]; + } + for (int y = 0; y < ha; y++) { + double e = 0; + for (int z = 0; z < wa; z++) { + e += a[y * wa + z] * temp[z]; + } + c[y * wb + x] = e; + } + } +} + +// Computes C = A + factor * B +template +void MatAdd(const T* a, const T* b, F factor, int h, int w, T* c) { + for (int i = 0; i < w * h; i++) { + c[i] = a[i] + b[i] * factor; + } +} + +template +inline Plane Identity(const size_t N) { + Plane out(N, N); + for (size_t i = 0; i < N; ++i)
{ + T* JXL_RESTRICT row = out.Row(i); + std::fill(row, row + N, 0); + row[i] = static_cast(1.0); + } + return out; +} + +inline ImageD Diagonal(const ImageD& d) { + JXL_ASSERT(d.ysize() == 1); + ImageD out(d.xsize(), d.xsize()); + const double* JXL_RESTRICT row_diag = d.Row(0); + for (size_t k = 0; k < d.xsize(); ++k) { + double* JXL_RESTRICT row_out = out.Row(k); + std::fill(row_out, row_out + d.xsize(), 0.0); + row_out[k] = row_diag[k]; + } + return out; +} + +// Computes c, s such that c^2 + s^2 = 1 and +// [c -s] [x] = [ * ] +// [s c] [y] [ 0 ] +void GivensRotation(double x, double y, double* c, double* s); + +// U = U * Givens(i, j, c, s) +void RotateMatrixCols(ImageD* JXL_RESTRICT U, int i, int j, double c, double s); + +// A is symmetric, U is orthogonal, T is tri-diagonal and +// A = U * T * Transpose(U). +void ConvertToTridiagonal(const ImageD& A, ImageD* JXL_RESTRICT T, + ImageD* JXL_RESTRICT U); + +// A is symmetric, U is orthogonal, and A = U * Diagonal(diag) * Transpose(U). +void ConvertToDiagonal(const ImageD& A, ImageD* JXL_RESTRICT diag, + ImageD* JXL_RESTRICT U); + +// A is square matrix, Q is orthogonal, R is upper triangular and A = Q * R; +void ComputeQRFactorization(const ImageD& A, ImageD* JXL_RESTRICT Q, + ImageD* JXL_RESTRICT R); + +// Inverts a 3x3 matrix in place +template +Status Inv3x3Matrix(T* matrix) { + // Intermediate computation is done in double precision. 
+ double temp[9]; + temp[0] = static_cast(matrix[4]) * matrix[8] - + static_cast(matrix[5]) * matrix[7]; + temp[1] = static_cast(matrix[2]) * matrix[7] - + static_cast(matrix[1]) * matrix[8]; + temp[2] = static_cast(matrix[1]) * matrix[5] - + static_cast(matrix[2]) * matrix[4]; + temp[3] = static_cast(matrix[5]) * matrix[6] - + static_cast(matrix[3]) * matrix[8]; + temp[4] = static_cast(matrix[0]) * matrix[8] - + static_cast(matrix[2]) * matrix[6]; + temp[5] = static_cast(matrix[2]) * matrix[3] - + static_cast(matrix[0]) * matrix[5]; + temp[6] = static_cast(matrix[3]) * matrix[7] - + static_cast(matrix[4]) * matrix[6]; + temp[7] = static_cast(matrix[1]) * matrix[6] - + static_cast(matrix[0]) * matrix[7]; + temp[8] = static_cast(matrix[0]) * matrix[4] - + static_cast(matrix[1]) * matrix[3]; + double det = matrix[0] * temp[0] + matrix[1] * temp[3] + matrix[2] * temp[6]; + if (std::abs(det) < 1e-10) { + return JXL_FAILURE("Matrix determinant is too close to 0"); + } + double idet = 1.0 / det; + for (int i = 0; i < 9; i++) { + matrix[i] = temp[i] * idet; + } + return true; +} + +// Solves system of linear equations A * X = B using the conjugate gradient +// method. Matrix a must be a n*n, symmetric and positive definite. +// Vectors b and x must have n elements +template +void ConjugateGradient(const T* a, int n, const T* b, T* x) { + std::vector r(n); + MatMul(a, x, n, n, 1, r.data()); + MatAdd(b, r.data(), -1, n, 1, r.data()); + std::vector p = r; + T rr; + MatMul(r.data(), r.data(), 1, n, 1, &rr); // inner product + + if (rr == 0) return; // The initial values were already optimal + + for (int i = 0; i < n; i++) { + std::vector ap(n); + MatMul(a, p.data(), n, n, 1, ap.data()); + T alpha; + MatMul(r.data(), ap.data(), 1, n, 1, &alpha); + // Normally alpha couldn't be zero here but if numerical issues caused it, + // return assuming the solution is close. 
+ if (alpha == 0) return; + alpha = rr / alpha; + MatAdd(x, p.data(), alpha, n, 1, x); + MatAdd(r.data(), ap.data(), -alpha, n, 1, r.data()); + + T rr2; + MatMul(r.data(), r.data(), 1, n, 1, &rr2); // inner product + if (rr2 < 1e-20) break; + + T beta = rr2 / rr; + MatAdd(r.data(), p.data(), beta, 1, n, p.data()); + rr = rr2; + } +} + +// Computes optimal coefficients r to approximate points p with linear +// combination of functions f. The matrix f has h rows and w columns, r has h +// values, p has w values. h is the amount of functions, w the amount of points. +// Uses the finite element method and minimizes mean square error. +template +void FEM(const T* f, int h, int w, const T* p, T* r) { + // Compute "Gramian" matrix G = F * F^T + // Speed up multiplication by using non-zero intervals in sparse F. + std::vector start(h); + std::vector end(h); + for (int y = 0; y < h; y++) { + start[y] = end[y] = 0; + for (int x = 0; x < w; x++) { + if (f[y * w + x] != 0) { + start[y] = x; + break; + } + } + for (int x = w - 1; x >= 0; x--) { + if (f[y * w + x] != 0) { + end[y] = x + 1; + break; + } + } + } + + std::vector g(h * h); + for (int y = 0; y < h; y++) { + for (int x = 0; x <= y; x++) { + T v = 0; + // Intersection of the two sparse intervals. 
+ int s = std::max(start[x], start[y]); + int e = std::min(end[x], end[y]); + for (int z = s; z < e; z++) { + v += f[x * w + z] * f[y * w + z]; + } + // Symmetric, so two values output at once + g[y * h + x] = v; + g[x * h + y] = v; + } + } + + // B vector: sum of each column of F multiplied by corresponding p + std::vector b(h, 0); + for (int y = 0; y < h; y++) { + T v = 0; + for (int x = 0; x < w; x++) { + v += f[y * w + x] * p[x]; + } + b[y] = v; + } + + ConjugateGradient(g.data(), h, b.data(), r); +} + +} // namespace jxl + +#endif // LIB_JXL_LINALG_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg_test.cc new file mode 100644 index 0000000000..0842f61dad --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/linalg_test.cc @@ -0,0 +1,149 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/linalg.h" + +#include + +#include "gtest/gtest.h" +#include "lib/jxl/image_test_utils.h" + +namespace jxl { +namespace { + +template +Plane RandomMatrix(const size_t xsize, const size_t ysize, Random& rng, + const T vmin, const T vmax) { + Plane A(xsize, ysize); + GeneratorRandom gen(&rng, vmin, vmax); + GenerateImage(gen, &A); + return A; +} + +template +Plane RandomSymmetricMatrix(const size_t N, Random& rng, const T vmin, + const T vmax) { + Plane A = RandomMatrix(N, N, rng, vmin, vmax); + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < i; ++j) { + A.Row(j)[i] = A.Row(i)[j]; + } + } + return A; +} +void VerifyMatrixEqual(const ImageD& A, const ImageD& B, const double eps) { + ASSERT_EQ(A.xsize(), B.xsize()); + ASSERT_EQ(A.ysize(), B.ysize()); + for (size_t y = 0; y < A.ysize(); ++y) { + for (size_t x = 0; x < A.xsize(); ++x) { + ASSERT_NEAR(A.Row(y)[x], B.Row(y)[x], eps); + } + } +} + +void VerifyOrthogonal(const ImageD& A, const double eps) { + VerifyMatrixEqual(Identity(A.xsize()), MatMul(Transpose(A), A), eps); +} + +void VerifyTridiagonal(const ImageD& T, const double eps) { + ASSERT_EQ(T.xsize(), T.ysize()); + for (size_t i = 0; i < T.xsize(); ++i) { + for (size_t j = i + 2; j < T.xsize(); ++j) { + ASSERT_NEAR(T.Row(i)[j], 0.0, eps); + ASSERT_NEAR(T.Row(j)[i], 0.0, eps); + } + } +} + +void VerifyUpperTriangular(const ImageD& R, const double eps) { + ASSERT_EQ(R.xsize(), R.ysize()); + for (size_t i = 0; i < R.xsize(); ++i) { + for (size_t j = i + 1; j < R.xsize(); ++j) { + ASSERT_NEAR(R.Row(i)[j], 0.0, eps); + } + } +} + +TEST(LinAlgTest, ConvertToTridiagonal) { + { + ImageD I = Identity(5); + ImageD T, U; + ConvertToTridiagonal(I, &T, &U); + VerifyMatrixEqual(I, T, 1e-15); + VerifyMatrixEqual(I, U, 1e-15); + } + { + ImageD A = Identity(5); + A.Row(0)[1] = A.Row(1)[0] = 2.0; + A.Row(0)[4] = A.Row(4)[0] = 3.0; + A.Row(2)[3] = A.Row(3)[2] = 2.0; + A.Row(3)[4] = A.Row(4)[3] = 2.0; + ImageD U, d; + ConvertToDiagonal(A, &d, &U); + 
VerifyOrthogonal(U, 1e-12); + VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12); + } + std::mt19937_64 rng; + for (int N = 2; N < 100; ++N) { + ImageD A = RandomSymmetricMatrix(N, rng, -1.0, 1.0); + ImageD T, U; + ConvertToTridiagonal(A, &T, &U); + VerifyOrthogonal(U, 1e-12); + VerifyTridiagonal(T, 1e-12); + VerifyMatrixEqual(A, MatMul(U, MatMul(T, Transpose(U))), 1e-12); + } +} + +TEST(LinAlgTest, ConvertToDiagonal) { + { + ImageD I = Identity(5); + ImageD U, d; + ConvertToDiagonal(I, &d, &U); + VerifyMatrixEqual(I, U, 1e-15); + for (int k = 0; k < 5; ++k) { + ASSERT_NEAR(d.Row(0)[k], 1.0, 1e-15); + } + } + { + ImageD A = Identity(5); + A.Row(0)[1] = A.Row(1)[0] = 2.0; + A.Row(2)[3] = A.Row(3)[2] = 2.0; + A.Row(3)[4] = A.Row(4)[3] = 2.0; + ImageD U, d; + ConvertToDiagonal(A, &d, &U); + VerifyOrthogonal(U, 1e-12); + VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12); + } + std::mt19937_64 rng; + for (int N = 2; N < 100; ++N) { + ImageD A = RandomSymmetricMatrix(N, rng, -1.0, 1.0); + ImageD U, d; + ConvertToDiagonal(A, &d, &U); + VerifyOrthogonal(U, 1e-12); + VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12); + } +} + +TEST(LinAlgTest, ComputeQRFactorization) { + { + ImageD I = Identity(5); + ImageD Q, R; + ComputeQRFactorization(I, &Q, &R); + VerifyMatrixEqual(I, Q, 1e-15); + VerifyMatrixEqual(I, R, 1e-15); + } + std::mt19937_64 rng; + for (int N = 2; N < 100; ++N) { + ImageD A = RandomMatrix(N, N, rng, -1.0, 1.0); + ImageD Q, R; + ComputeQRFactorization(A, &Q, &R); + VerifyOrthogonal(Q, 1e-12); + VerifyUpperTriangular(R, 1e-12); + VerifyMatrixEqual(A, MatMul(Q, R), 1e-12); + } +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc new file mode 100644 index 0000000000..afa36a44e6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.cc @@ -0,0 +1,87 @@ +// 
Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/loop_filter.h" + +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/fields.h" + +namespace jxl { + +LoopFilter::LoopFilter() { Bundle::Init(this); } +Status LoopFilter::VisitFields(Visitor* JXL_RESTRICT visitor) { + // Must come before AllDefault. + + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. + visitor->SetDefault(this); + return true; + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &gab)); + if (visitor->Conditional(gab)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &gab_custom)); + if (visitor->Conditional(gab_custom)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.104699568f, &gab_x_weight1)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.055680538f, &gab_x_weight2)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.104699568f, &gab_y_weight1)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.055680538f, &gab_y_weight2)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.104699568f, &gab_b_weight1)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(1.1 * 0.055680538f, &gab_b_weight2)); + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 2, &epf_iters)); + if (visitor->Conditional(epf_iters > 0)) { + if (visitor->Conditional(!nonserialized_is_modular)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_sharp_custom)); + if (visitor->Conditional(epf_sharp_custom)) { + for (size_t i = 0; i < kEpfSharpEntries; ++i) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16( + float(i) / float(kEpfSharpEntries - 1), &epf_sharp_lut[i])); + } + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_weight_custom)); + if (visitor->Conditional(epf_weight_custom)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(40.0f, &epf_channel_scale[0])); + 
JXL_QUIET_RETURN_IF_ERROR(visitor->F16(5.0f, &epf_channel_scale[1])); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(3.5f, &epf_channel_scale[2])); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.45f, &epf_pass1_zeroflush)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.6f, &epf_pass2_zeroflush)); + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_sigma_custom)); + if (visitor->Conditional(epf_sigma_custom)) { + if (visitor->Conditional(!nonserialized_is_modular)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.46f, &epf_quant_mul)); + } + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.9f, &epf_pass0_sigma_scale)); + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(6.5f, &epf_pass2_sigma_scale)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->F16(0.6666666666666666f, &epf_border_sad_mul)); + } + if (visitor->Conditional(nonserialized_is_modular)) { + JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.0f, &epf_sigma_for_modular)); + if (epf_sigma_for_modular < 1e-8) { + return JXL_FAILURE("EPF: sigma for modular is too small"); + } + } + } + + JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions)); + // Extensions: in chronological order of being added to the format. + return visitor->EndExtensions(); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.h new file mode 100644 index 0000000000..ffa68b5120 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/loop_filter.h @@ -0,0 +1,78 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_LOOP_FILTER_H_ +#define LIB_JXL_LOOP_FILTER_H_ + +// Parameters for loop filter(s), stored in each frame. 
+ +#include +#include + +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/field_encodings.h" + +namespace jxl { + +struct LoopFilter : public Fields { + LoopFilter(); + const char* Name() const override { return "LoopFilter"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + size_t Padding() const { + static const size_t padding_per_epf_iter[4] = {0, 2, 3, 6}; + return padding_per_epf_iter[epf_iters] + (gab ? 1 : 0); + } + + mutable bool all_default; + + // --- Gaborish convolution + bool gab; + + bool gab_custom; + float gab_x_weight1; + float gab_x_weight2; + float gab_y_weight1; + float gab_y_weight2; + float gab_b_weight1; + float gab_b_weight2; + + // --- Edge-preserving filter + + // Number of EPF stages to apply. 0 means EPF disabled. 1 applies only the + // first stage, 2 applies both stages and 3 applies the first stage twice and + // the second stage once. 
+ uint32_t epf_iters; + + bool epf_sharp_custom; + enum { kEpfSharpEntries = 8 }; + float epf_sharp_lut[kEpfSharpEntries]; + + bool epf_weight_custom; // Custom weight params + float epf_channel_scale[3]; // Relative weight of each channel + float epf_pass1_zeroflush; // Minimum weight for first pass + float epf_pass2_zeroflush; // Minimum weight for second pass + + bool epf_sigma_custom; // Custom sigma parameters + float epf_quant_mul; // Sigma is ~ this * quant + float epf_pass0_sigma_scale; // Multiplier for sigma in pass 0 + float epf_pass2_sigma_scale; // Multiplier for sigma in the second pass + float epf_border_sad_mul; // (inverse) multiplier for sigma on borders + + float epf_sigma_for_modular; + + uint64_t extensions; + + bool nonserialized_is_modular = false; +}; + +} // namespace jxl + +#endif // LIB_JXL_LOOP_FILTER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc new file mode 100644 index 0000000000..9eba4d4011 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.cc @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/luminance.h" + +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" + +namespace jxl { + +void SetIntensityTarget(CodecInOut* io) { + if (io->target_nits != 0) { + io->metadata.m.SetIntensityTarget(io->target_nits); + return; + } + if (io->metadata.m.color_encoding.tf.IsPQ()) { + // Peak luminance of PQ as defined by SMPTE ST 2084:2014. + io->metadata.m.SetIntensityTarget(10000); + } else if (io->metadata.m.color_encoding.tf.IsHLG()) { + // Nominal display peak luminance used as a reference by + // Rec. ITU-R BT.2100-2. 
+ io->metadata.m.SetIntensityTarget(1000); + } else { + // SDR + io->metadata.m.SetIntensityTarget(kDefaultIntensityTarget); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.h new file mode 100644 index 0000000000..c6a9d9e184 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/luminance.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_LUMINANCE_H_ +#define LIB_JXL_LUMINANCE_H_ + +namespace jxl { + +// Chooses a default intensity target based on the transfer function of the +// image, if known. For SDR images or images not known to be HDR, returns +// kDefaultIntensityTarget, for images known to have PQ or HLG transfer function +// returns a higher value. If the image metadata already has a non-zero +// intensity target, does nothing. +class CodecInOut; +void SetIntensityTarget(CodecInOut* io); + +} // namespace jxl + +#endif // LIB_JXL_LUMINANCE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc new file mode 100644 index 0000000000..87727e75cd --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.cc @@ -0,0 +1,18 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/memory_manager_internal.h" + +#include + +namespace jxl { + +void* MemoryManagerDefaultAlloc(void* opaque, size_t size) { + return malloc(size); +} + +void MemoryManagerDefaultFree(void* opaque, void* address) { free(address); } + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.h new file mode 100644 index 0000000000..b4a78903fe --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/memory_manager_internal.h @@ -0,0 +1,101 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MEMORY_MANAGER_INTERNAL_H_ +#define LIB_JXL_MEMORY_MANAGER_INTERNAL_H_ + +// Memory allocator with support for alignment + misalignment. + +#include +#include +#include +#include // memcpy + +#include +#include + +#include "jxl/memory_manager.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Default alloc and free functions. +void* MemoryManagerDefaultAlloc(void* opaque, size_t size); +void MemoryManagerDefaultFree(void* opaque, void* address); + +// Initializes the memory manager instance with the passed one. The +// MemoryManager passed in |memory_manager| may be NULL or contain NULL +// functions which will be initialized with the default ones. If either alloc +// or free are NULL, then both must be NULL, otherwise this function returns an +// error. 
+static JXL_INLINE Status MemoryManagerInit( + JxlMemoryManager* self, const JxlMemoryManager* memory_manager) { + if (memory_manager) { + *self = *memory_manager; + } else { + memset(self, 0, sizeof(*self)); + } + if (!self->alloc != !self->free) { + return false; + } + if (!self->alloc) self->alloc = jxl::MemoryManagerDefaultAlloc; + if (!self->free) self->free = jxl::MemoryManagerDefaultFree; + + return true; +} + +static JXL_INLINE void* MemoryManagerAlloc( + const JxlMemoryManager* memory_manager, size_t size) { + return memory_manager->alloc(memory_manager->opaque, size); +} + +static JXL_INLINE void MemoryManagerFree(const JxlMemoryManager* memory_manager, + void* address) { + return memory_manager->free(memory_manager->opaque, address); +} + +// Helper class to be used as a deleter in a unique_ptr call. +class MemoryManagerDeleteHelper { + public: + explicit MemoryManagerDeleteHelper(const JxlMemoryManager* memory_manager) + : memory_manager_(memory_manager) {} + + // Delete and free the passed pointer using the memory_manager. + template + void operator()(T* address) const { + if (!address) { + return; + } + address->~T(); + return memory_manager_->free(memory_manager_->opaque, address); + } + + private: + const JxlMemoryManager* memory_manager_; +}; + +template +using MemoryManagerUniquePtr = std::unique_ptr; + +// Creates a new object T allocating it with the memory allocator into a +// unique_ptr. +template +JXL_INLINE MemoryManagerUniquePtr MemoryManagerMakeUnique( + const JxlMemoryManager* memory_manager, Args&&... args) { + T* mem = + static_cast(memory_manager->alloc(memory_manager->opaque, sizeof(T))); + if (!mem) { + // Allocation error case. 
+ return MemoryManagerUniquePtr(nullptr, + MemoryManagerDeleteHelper(memory_manager)); + } + return MemoryManagerUniquePtr(new (mem) T(std::forward(args)...), + MemoryManagerDeleteHelper(memory_manager)); +} + +} // namespace jxl + +#endif // LIB_JXL_MEMORY_MANAGER_INTERNAL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/context_predict.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/context_predict.h new file mode 100644 index 0000000000..63c7f7bb65 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/context_predict.h @@ -0,0 +1,653 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_ +#define LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_ + +#include +#include + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +namespace weighted { +constexpr static size_t kNumPredictors = 4; +constexpr static int64_t kPredExtraBits = 3; +constexpr static int64_t kPredictionRound = ((1 << kPredExtraBits) >> 1) - 1; +constexpr static size_t kNumProperties = 1; + +struct Header : public Fields { + const char *Name() const override { return "WeightedPredictorHeader"; } + // TODO(janwas): move to cc file, avoid including fields.h. + Header() { Bundle::Init(this); } + + Status VisitFields(Visitor *JXL_RESTRICT visitor) override { + if (visitor->AllDefault(*this, &all_default)) { + // Overwrite all serialized fields, but not any nonserialized_*. 
+ visitor->SetDefault(this); + return true; + } + auto visit_p = [visitor](pixel_type val, pixel_type *p) { + uint32_t up = *p; + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, val, &up)); + *p = up; + return Status(true); + }; + JXL_QUIET_RETURN_IF_ERROR(visit_p(16, &p1C)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(10, &p2C)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Ca)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Cb)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Cc)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(0, &p3Cd)); + JXL_QUIET_RETURN_IF_ERROR(visit_p(0, &p3Ce)); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xd, &w[0])); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[1])); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[2])); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[3])); + return true; + } + + bool all_default; + pixel_type p1C = 0, p2C = 0, p3Ca = 0, p3Cb = 0, p3Cc = 0, p3Cd = 0, p3Ce = 0; + uint32_t w[kNumPredictors] = {}; +}; + +struct State { + pixel_type_w prediction[kNumPredictors] = {}; + pixel_type_w pred = 0; // *before* removing the added bits. + std::vector pred_errors[kNumPredictors]; + std::vector error; + Header header; + + // Allows to approximate division by a number from 1 to 64. + uint32_t divlookup[64]; + + constexpr static pixel_type_w AddBits(pixel_type_w x) { + return uint64_t(x) << kPredExtraBits; + } + + State(Header header, size_t xsize, size_t ysize) : header(header) { + // Extra margin to avoid out-of-bounds writes. + // All have space for two rows of data. + for (size_t i = 0; i < 4; i++) { + pred_errors[i].resize((xsize + 2) * 2); + } + error.resize((xsize + 2) * 2); + // Initialize division lookup table. 
+ for (int i = 0; i < 64; i++) { + divlookup[i] = (1 << 24) / (i + 1); + } + } + + // Approximates 4+(maxweight<<24)/(x+1), avoiding division + JXL_INLINE uint32_t ErrorWeight(uint64_t x, uint32_t maxweight) const { + int shift = FloorLog2Nonzero(x + 1) - 5; + if (shift < 0) shift = 0; + return 4 + ((maxweight * divlookup[x >> shift]) >> shift); + } + + // Approximates the weighted average of the input values with the given + // weights, avoiding division. Weights must sum to at least 16. + JXL_INLINE pixel_type_w + WeightedAverage(const pixel_type_w *JXL_RESTRICT p, + std::array w) const { + uint32_t weight_sum = 0; + for (size_t i = 0; i < kNumPredictors; i++) { + weight_sum += w[i]; + } + JXL_DASSERT(weight_sum > 15); + uint32_t log_weight = FloorLog2Nonzero(weight_sum); // at least 4. + weight_sum = 0; + for (size_t i = 0; i < kNumPredictors; i++) { + w[i] >>= log_weight - 4; + weight_sum += w[i]; + } + // for rounding. + pixel_type_w sum = (weight_sum >> 1) - 1; + for (size_t i = 0; i < kNumPredictors; i++) { + sum += p[i] * w[i]; + } + return (sum * divlookup[weight_sum - 1]) >> 24; + } + + template + JXL_INLINE pixel_type_w Predict(size_t x, size_t y, size_t xsize, + pixel_type_w N, pixel_type_w W, + pixel_type_w NE, pixel_type_w NW, + pixel_type_w NN, Properties *properties, + size_t offset) { + size_t cur_row = y & 1 ? 0 : (xsize + 2); + size_t prev_row = y & 1 ? (xsize + 2) : 0; + size_t pos_N = prev_row + x; + size_t pos_NE = x < xsize - 1 ? pos_N + 1 : pos_N; + size_t pos_NW = x > 0 ? pos_N - 1 : pos_N; + std::array weights; + for (size_t i = 0; i < kNumPredictors; i++) { + // pred_errors[pos_N] also contains the error of pixel W. + // pred_errors[pos_NW] also contains the error of pixel WW. 
+ weights[i] = pred_errors[i][pos_N] + pred_errors[i][pos_NE] + + pred_errors[i][pos_NW]; + weights[i] = ErrorWeight(weights[i], header.w[i]); + } + + N = AddBits(N); + W = AddBits(W); + NE = AddBits(NE); + NW = AddBits(NW); + NN = AddBits(NN); + + pixel_type_w teW = x == 0 ? 0 : error[cur_row + x - 1]; + pixel_type_w teN = error[pos_N]; + pixel_type_w teNW = error[pos_NW]; + pixel_type_w sumWN = teN + teW; + pixel_type_w teNE = error[pos_NE]; + + if (compute_properties) { + pixel_type_w p = teW; + if (std::abs(teN) > std::abs(p)) p = teN; + if (std::abs(teNW) > std::abs(p)) p = teNW; + if (std::abs(teNE) > std::abs(p)) p = teNE; + (*properties)[offset++] = p; + } + + prediction[0] = W + NE - N; + prediction[1] = N - (((sumWN + teNE) * header.p1C) >> 5); + prediction[2] = W - (((sumWN + teNW) * header.p2C) >> 5); + prediction[3] = + N - ((teNW * header.p3Ca + teN * header.p3Cb + teNE * header.p3Cc + + (NN - N) * header.p3Cd + (NW - W) * header.p3Ce) >> + 5); + + pred = WeightedAverage(prediction, weights); + + // If all three have the same sign, skip clamping. + if (((teN ^ teW) | (teN ^ teNW)) > 0) { + return (pred + kPredictionRound) >> kPredExtraBits; + } + + // Otherwise, clamp to min/max of neighbouring pixels (just W, NE, N). + pixel_type_w mx = std::max(W, std::max(NE, N)); + pixel_type_w mn = std::min(W, std::min(NE, N)); + pred = std::max(mn, std::min(mx, pred)); + return (pred + kPredictionRound) >> kPredExtraBits; + } + + JXL_INLINE void UpdateErrors(pixel_type_w val, size_t x, size_t y, + size_t xsize) { + size_t cur_row = y & 1 ? 0 : (xsize + 2); + size_t prev_row = y & 1 ? (xsize + 2) : 0; + val = AddBits(val); + error[cur_row + x] = pred - val; + for (size_t i = 0; i < kNumPredictors; i++) { + pixel_type_w err = + (std::abs(prediction[i] - val) + kPredictionRound) >> kPredExtraBits; + // For predicting in the next row. + pred_errors[i][cur_row + x] = err; + // Add the error on this pixel to the error on the NE pixel. 
This has the + // effect of adding the error on this pixel to the E and EE pixels. + pred_errors[i][prev_row + x + 1] += err; + } + } +}; + +// Encoder helper function to set the parameters to some presets. +inline void PredictorMode(int i, Header *header) { + switch (i) { + case 0: + // ~ lossless16 predictor + header->w[0] = 0xd; + header->w[1] = 0xc; + header->w[2] = 0xc; + header->w[3] = 0xc; + header->p1C = 16; + header->p2C = 10; + header->p3Ca = 7; + header->p3Cb = 7; + header->p3Cc = 7; + header->p3Cd = 0; + header->p3Ce = 0; + break; + case 1: + // ~ default lossless8 predictor + header->w[0] = 0xd; + header->w[1] = 0xc; + header->w[2] = 0xc; + header->w[3] = 0xb; + header->p1C = 8; + header->p2C = 8; + header->p3Ca = 4; + header->p3Cb = 0; + header->p3Cc = 3; + header->p3Cd = 23; + header->p3Ce = 2; + break; + case 2: + // ~ west lossless8 predictor + header->w[0] = 0xd; + header->w[1] = 0xc; + header->w[2] = 0xd; + header->w[3] = 0xc; + header->p1C = 10; + header->p2C = 9; + header->p3Ca = 7; + header->p3Cb = 0; + header->p3Cc = 0; + header->p3Cd = 16; + header->p3Ce = 9; + break; + case 3: + // ~ north lossless8 predictor + header->w[0] = 0xd; + header->w[1] = 0xd; + header->w[2] = 0xc; + header->w[3] = 0xc; + header->p1C = 16; + header->p2C = 8; + header->p3Ca = 0; + header->p3Cb = 16; + header->p3Cc = 0; + header->p3Cd = 23; + header->p3Ce = 0; + break; + case 4: + default: + // something else, because why not + header->w[0] = 0xd; + header->w[1] = 0xc; + header->w[2] = 0xc; + header->w[3] = 0xc; + header->p1C = 10; + header->p2C = 10; + header->p3Ca = 5; + header->p3Cb = 5; + header->p3Cc = 5; + header->p3Cd = 12; + header->p3Ce = 4; + break; + } +} +} // namespace weighted + +// Stores a node and its two children at the same time. This significantly +// reduces the number of branches needed during decoding. +struct FlatDecisionNode { + // Property + splitval of the top node. + int32_t property0; // -1 if leaf. 
+ union { + PropertyVal splitval0; + Predictor predictor; + }; + uint32_t childID; // childID is ctx id if leaf. + // Property+splitval of the two child nodes. + union { + PropertyVal splitvals[2]; + int32_t multiplier; + }; + union { + int32_t properties[2]; + int64_t predictor_offset; + }; +}; +using FlatTree = std::vector; + +class MATreeLookup { + public: + explicit MATreeLookup(const FlatTree &tree) : nodes_(tree) {} + struct LookupResult { + uint32_t context; + Predictor predictor; + int64_t offset; + int32_t multiplier; + }; + LookupResult Lookup(const Properties &properties) const { + uint32_t pos = 0; + while (true) { + const FlatDecisionNode &node = nodes_[pos]; + if (node.property0 < 0) { + return {node.childID, node.predictor, node.predictor_offset, + node.multiplier}; + } + bool p0 = properties[node.property0] <= node.splitval0; + uint32_t off0 = properties[node.properties[0]] <= node.splitvals[0]; + uint32_t off1 = 2 | (properties[node.properties[1]] <= node.splitvals[1]); + pos = node.childID + (p0 ? off1 : off0); + } + } + + private: + const FlatTree &nodes_; +}; + +static constexpr size_t kExtraPropsPerChannel = 4; +static constexpr size_t kNumNonrefProperties = + kNumStaticProperties + 13 + weighted::kNumProperties; + +constexpr size_t kWPProp = kNumNonrefProperties - weighted::kNumProperties; +constexpr size_t kGradientProp = 9; + +// Clamps gradient to the min/max of n, w (and l, implicitly). +static JXL_INLINE int32_t ClampedGradient(const int32_t n, const int32_t w, + const int32_t l) { + const int32_t m = std::min(n, w); + const int32_t M = std::max(n, w); + // The end result of this operation doesn't overflow or underflow if the + // result is between m and M, but the intermediate value may overflow, so we + // do the intermediate operations in uint32_t and check later if we had an + // overflow or underflow condition comparing m, M and l directly. 
+ // grad = M + m - l = n + w - l + const int32_t grad = + static_cast(static_cast(n) + static_cast(w) - + static_cast(l)); + // We use two sets of ternary operators to force the evaluation of them in + // any case, allowing the compiler to avoid branches and use cmovl/cmovg in + // x86. + const int32_t grad_clamp_M = (l < m) ? M : grad; + return (l > M) ? m : grad_clamp_M; +} + +inline pixel_type_w Select(pixel_type_w a, pixel_type_w b, pixel_type_w c) { + pixel_type_w p = a + b - c; + pixel_type_w pa = std::abs(p - a); + pixel_type_w pb = std::abs(p - b); + return pa < pb ? a : b; +} + +inline void PrecomputeReferences(const Channel &ch, size_t y, + const Image &image, uint32_t i, + Channel *references) { + ZeroFillImage(&references->plane); + uint32_t offset = 0; + size_t num_extra_props = references->w; + intptr_t onerow = references->plane.PixelsPerRow(); + for (int32_t j = static_cast(i) - 1; + j >= 0 && offset < num_extra_props; j--) { + if (image.channel[j].w != image.channel[i].w || + image.channel[j].h != image.channel[i].h) { + continue; + } + if (image.channel[j].hshift != image.channel[i].hshift) continue; + if (image.channel[j].vshift != image.channel[i].vshift) continue; + pixel_type *JXL_RESTRICT rp = references->Row(0) + offset; + const pixel_type *JXL_RESTRICT rpp = image.channel[j].Row(y); + const pixel_type *JXL_RESTRICT rpprev = image.channel[j].Row(y ? y - 1 : 0); + for (size_t x = 0; x < ch.w; x++, rp += onerow) { + pixel_type_w v = rpp[x]; + rp[0] = std::abs(v); + rp[1] = v; + pixel_type_w vleft = (x ? rpp[x - 1] : 0); + pixel_type_w vtop = (y ? rpprev[x] : vleft); + pixel_type_w vtopleft = (x && y ? 
rpprev[x - 1] : vleft); + pixel_type_w vpredicted = ClampedGradient(vleft, vtop, vtopleft); + rp[2] = std::abs(v - vpredicted); + rp[3] = v - vpredicted; + } + + offset += kExtraPropsPerChannel; + } +} + +struct PredictionResult { + int context = 0; + pixel_type_w guess = 0; + Predictor predictor; + int32_t multiplier; +}; + +inline std::string PropertyName(size_t i) { + static_assert(kNumNonrefProperties == 16, "Update this function"); + switch (i) { + case 0: + return "c"; + case 1: + return "g"; + case 2: + return "y"; + case 3: + return "x"; + case 4: + return "|N|"; + case 5: + return "|W|"; + case 6: + return "N"; + case 7: + return "W"; + case 8: + return "W-WW-NW+NWW"; + case 9: + return "W+N-NW"; + case 10: + return "W-NW"; + case 11: + return "NW-N"; + case 12: + return "N-NE"; + case 13: + return "N-NN"; + case 14: + return "W-WW"; + case 15: + return "WGH"; + default: + return "ch[" + ToString(15 - (int)i) + "]"; + } +} + +inline void InitPropsRow( + Properties *p, + const std::array &static_props, + const int y) { + for (size_t i = 0; i < kNumStaticProperties; i++) { + (*p)[i] = static_props[i]; + } + (*p)[2] = y; + (*p)[9] = 0; // local gradient. 
+} + +namespace detail { +enum PredictorMode { + kUseTree = 1, + kUseWP = 2, + kForceComputeProperties = 4, + kAllPredictions = 8, +}; + +JXL_INLINE pixel_type_w PredictOne(Predictor p, pixel_type_w left, + pixel_type_w top, pixel_type_w toptop, + pixel_type_w topleft, pixel_type_w topright, + pixel_type_w leftleft, + pixel_type_w toprightright, + pixel_type_w wp_pred) { + switch (p) { + case Predictor::Zero: + return pixel_type_w{0}; + case Predictor::Left: + return left; + case Predictor::Top: + return top; + case Predictor::Select: + return Select(left, top, topleft); + case Predictor::Weighted: + return wp_pred; + case Predictor::Gradient: + return pixel_type_w{ClampedGradient(left, top, topleft)}; + case Predictor::TopLeft: + return topleft; + case Predictor::TopRight: + return topright; + case Predictor::LeftLeft: + return leftleft; + case Predictor::Average0: + return (left + top) / 2; + case Predictor::Average1: + return (left + topleft) / 2; + case Predictor::Average2: + return (topleft + top) / 2; + case Predictor::Average3: + return (top + topright) / 2; + case Predictor::Average4: + return (6 * top - 2 * toptop + 7 * left + 1 * leftleft + + 1 * toprightright + 3 * topright + 8) / + 16; + default: + return pixel_type_w{0}; + } +} + +template +inline PredictionResult Predict( + Properties *p, size_t w, const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const size_t x, const size_t y, Predictor predictor, + const MATreeLookup *lookup, const Channel *references, + weighted::State *wp_state, pixel_type_w *predictions) { + // We start in position 3 because of 2 static properties + y. + size_t offset = 3; + constexpr bool compute_properties = + mode & kUseTree || mode & kForceComputeProperties; + pixel_type_w left = (x ? pp[-1] : (y ? pp[-onerow] : 0)); + pixel_type_w top = (y ? pp[-onerow] : left); + pixel_type_w topleft = (x && y ? pp[-1 - onerow] : left); + pixel_type_w topright = (x + 1 < w && y ? 
pp[1 - onerow] : top); + pixel_type_w leftleft = (x > 1 ? pp[-2] : left); + pixel_type_w toptop = (y > 1 ? pp[-onerow - onerow] : top); + pixel_type_w toprightright = (x + 2 < w && y ? pp[2 - onerow] : topright); + + if (compute_properties) { + // location + (*p)[offset++] = x; + // neighbors + (*p)[offset++] = std::abs(top); + (*p)[offset++] = std::abs(left); + (*p)[offset++] = top; + (*p)[offset++] = left; + + // local gradient + (*p)[offset] = left - (*p)[offset + 1]; + offset++; + // local gradient + (*p)[offset++] = left + top - topleft; + + // FFV1 context properties + (*p)[offset++] = left - topleft; + (*p)[offset++] = topleft - top; + (*p)[offset++] = top - topright; + (*p)[offset++] = top - toptop; + (*p)[offset++] = left - leftleft; + } + + pixel_type_w wp_pred = 0; + if (mode & kUseWP) { + wp_pred = wp_state->Predict( + x, y, w, top, left, topright, topleft, toptop, p, offset); + } + if (compute_properties) { + offset += weighted::kNumProperties; + // Extra properties. + const pixel_type *JXL_RESTRICT rp = references->Row(x); + for (size_t i = 0; i < references->w; i++) { + (*p)[offset++] = rp[i]; + } + } + PredictionResult result; + if (mode & kUseTree) { + MATreeLookup::LookupResult lr = lookup->Lookup(*p); + result.context = lr.context; + result.guess = lr.offset; + result.multiplier = lr.multiplier; + predictor = lr.predictor; + } + if (mode & kAllPredictions) { + for (size_t i = 0; i < kNumModularPredictors; i++) { + predictions[i] = PredictOne((Predictor)i, left, top, toptop, topleft, + topright, leftleft, toprightright, wp_pred); + } + } + result.guess += PredictOne(predictor, left, top, toptop, topleft, topright, + leftleft, toprightright, wp_pred); + result.predictor = predictor; + + return result; +} +} // namespace detail + +inline PredictionResult PredictNoTreeNoWP(size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, Predictor predictor) { + return detail::Predict( + /*p=*/nullptr, w, pp, onerow, 
x, y, predictor, /*lookup=*/nullptr, + /*references=*/nullptr, /*wp_state=*/nullptr, /*predictions=*/nullptr); +} + +inline PredictionResult PredictNoTreeWP(size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, Predictor predictor, + weighted::State *wp_state) { + return detail::Predict( + /*p=*/nullptr, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr, + /*references=*/nullptr, wp_state, /*predictions=*/nullptr); +} + +inline PredictionResult PredictTreeNoWP(Properties *p, size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, + const MATreeLookup &tree_lookup, + const Channel &references) { + return detail::Predict( + p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references, + /*wp_state=*/nullptr, /*predictions=*/nullptr); +} + +inline PredictionResult PredictTreeWP(Properties *p, size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, + const MATreeLookup &tree_lookup, + const Channel &references, + weighted::State *wp_state) { + return detail::Predict( + p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references, + wp_state, /*predictions=*/nullptr); +} + +inline PredictionResult PredictLearn(Properties *p, size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, + const int y, Predictor predictor, + const Channel &references, + weighted::State *wp_state) { + return detail::Predict( + p, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr, &references, + wp_state, /*predictions=*/nullptr); +} + +inline void PredictLearnAll(Properties *p, size_t w, + const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, const int y, + const Channel &references, + weighted::State *wp_state, + pixel_type_w *predictions) { + detail::Predict( + p, w, pp, onerow, x, y, Predictor::Zero, + /*lookup=*/nullptr, &references, wp_state, predictions); +} + +inline void PredictAllNoWP(size_t w, 
const pixel_type *JXL_RESTRICT pp, + const intptr_t onerow, const int x, const int y, + pixel_type_w *predictions) { + detail::Predict( + /*p=*/nullptr, w, pp, onerow, x, y, Predictor::Zero, + /*lookup=*/nullptr, + /*references=*/nullptr, /*wp_state=*/nullptr, predictions); +} +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc new file mode 100644 index 0000000000..5be9d756ed --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.cc @@ -0,0 +1,106 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/encoding/dec_ma.h" + +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/modular/encoding/ma_common.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +namespace { + +Status ValidateTree( + const Tree &tree, + const std::vector> &prop_bounds, + size_t root) { + if (tree[root].property == -1) return true; + size_t p = tree[root].property; + int val = tree[root].splitval; + if (prop_bounds[p].first > val) return JXL_FAILURE("Invalid tree"); + // Splitting at max value makes no sense: left range will be exactly same + // as parent, right range will be invalid (min > max). 
+ if (prop_bounds[p].second <= val) return JXL_FAILURE("Invalid tree"); + auto new_bounds = prop_bounds; + new_bounds[p].first = val + 1; + JXL_RETURN_IF_ERROR(ValidateTree(tree, new_bounds, tree[root].lchild)); + new_bounds[p] = prop_bounds[p]; + new_bounds[p].second = val; + return ValidateTree(tree, new_bounds, tree[root].rchild); +} + +Status DecodeTree(BitReader *br, ANSSymbolReader *reader, + const std::vector &context_map, Tree *tree, + size_t tree_size_limit) { + size_t leaf_id = 0; + size_t to_decode = 1; + tree->clear(); + while (to_decode > 0) { + JXL_RETURN_IF_ERROR(br->AllReadsWithinBounds()); + if (tree->size() > tree_size_limit) { + return JXL_FAILURE("Tree is too large"); + } + to_decode--; + int property = + reader->ReadHybridUint(kPropertyContext, br, context_map) - 1; + if (property < -1 || property >= 256) { + return JXL_FAILURE("Invalid tree property value"); + } + if (property == -1) { + size_t predictor = + reader->ReadHybridUint(kPredictorContext, br, context_map); + if (predictor >= kNumModularPredictors) { + return JXL_FAILURE("Invalid predictor"); + } + int64_t predictor_offset = + UnpackSigned(reader->ReadHybridUint(kOffsetContext, br, context_map)); + uint32_t mul_log = + reader->ReadHybridUint(kMultiplierLogContext, br, context_map); + if (mul_log >= 31) { + return JXL_FAILURE("Invalid multiplier logarithm"); + } + uint32_t mul_bits = + reader->ReadHybridUint(kMultiplierBitsContext, br, context_map); + if (mul_bits + 1 >= 1u << (31u - mul_log)) { + return JXL_FAILURE("Invalid multiplier"); + } + uint32_t multiplier = (mul_bits + 1U) << mul_log; + tree->emplace_back(-1, 0, leaf_id++, 0, static_cast(predictor), + predictor_offset, multiplier); + continue; + } + int splitval = + UnpackSigned(reader->ReadHybridUint(kSplitValContext, br, context_map)); + tree->emplace_back(property, splitval, tree->size() + to_decode + 1, + tree->size() + to_decode + 2, Predictor::Zero, 0, 1); + to_decode += 2; + } + std::vector> prop_bounds; + 
prop_bounds.resize(256, {std::numeric_limits::min(), + std::numeric_limits::max()}); + return ValidateTree(*tree, prop_bounds, 0); +} +} // namespace + +Status DecodeTree(BitReader *br, Tree *tree, size_t tree_size_limit) { + std::vector tree_context_map; + ANSCode tree_code; + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kNumTreeContexts, &tree_code, &tree_context_map)); + // TODO(eustas): investigate more infinite tree cases. + if (tree_code.degenerate_symbols[tree_context_map[kPropertyContext]] > 0) { + return JXL_FAILURE("Infinite tree"); + } + ANSSymbolReader reader(&tree_code, br); + JXL_RETURN_IF_ERROR(DecodeTree(br, &reader, tree_context_map, tree, + std::min(tree_size_limit, kMaxTreeSize))); + if (!reader.CheckANSFinalState()) { + return JXL_FAILURE("ANS decode final state failed"); + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.h new file mode 100644 index 0000000000..a910c4deb1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/dec_ma.h @@ -0,0 +1,66 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+#ifndef LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
+#define LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+// One node of an MA decision tree; nodes are stored by index in a `Tree`.
+// A node is an inner (split) node when `property` >= 0 and a leaf when
+// `property` == -1.
+struct PropertyDecisionNode {
+  // Split threshold. The tree splits on *strictly greater*: `lchild` covers
+  // property values > splitval, `rchild` covers values <= splitval.
+  PropertyVal splitval;
+  int16_t property;  // -1: leaf node, lchild points to leaf node
+  // Indices of the two children inside the owning `Tree` (split nodes only).
+  uint32_t lchild;
+  uint32_t rchild;
+  // Leaf payload: predictor to use, offset and multiplier attached to it.
+  // Split() fills these with Predictor::Zero, 0 and 1.
+  Predictor predictor;
+  int64_t predictor_offset;
+  uint32_t multiplier;
+
+  // Fully explicit constructor. `p`, `lchild` and `rchild` arrive as int and
+  // are converted to the narrower/unsigned member types on assignment.
+  PropertyDecisionNode(int p, int split_val, int lchild, int rchild,
+                       Predictor predictor, int64_t predictor_offset,
+                       uint32_t multiplier)
+      : splitval(split_val),
+        property(p),
+        lchild(lchild),
+        rchild(rchild),
+        predictor(predictor),
+        predictor_offset(predictor_offset),
+        multiplier(multiplier) {}
+  // Default node: a leaf using Predictor::Zero, offset 0, multiplier 1.
+  PropertyDecisionNode()
+      : splitval(0),
+        property(-1),
+        lchild(0),
+        rchild(0),
+        predictor(Predictor::Zero),
+        predictor_offset(0),
+        multiplier(1) {}
+  // Returns a leaf node (property == -1, both child indices 0).
+  static PropertyDecisionNode Leaf(Predictor predictor, int64_t offset = 0,
+                                   uint32_t multiplier = 1) {
+    return PropertyDecisionNode(-1, 0, 0, 0, predictor, offset, multiplier);
+  }
+  // Returns a split node on property `p` at `split_val`. When `rchild` is
+  // left at its -1 sentinel the right child is placed directly after the left
+  // one (lchild + 1).
+  static PropertyDecisionNode Split(int p, int split_val, int lchild,
+                                    int rchild = -1) {
+    if (rchild == -1) rchild = lchild + 1;
+    return PropertyDecisionNode(p, split_val, lchild, rchild, Predictor::Zero,
+                                0, 1);
+  }
+};
+
+// A tree is a flat vector of nodes; children are referenced by index.
+using Tree = std::vector<PropertyDecisionNode>;
+
+// Decodes a tree from `br` into `*tree`, replacing its previous contents.
+// Returns an error if the histograms or ANS stream are invalid, if the tree
+// grows beyond `tree_size_limit` nodes, or if the decoded tree fails
+// validation of its split values.
+Status DecodeTree(BitReader *br, Tree *tree, size_t tree_size_limit);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc
new file mode 100644
index 0000000000..f7bb372c74
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.cc
@@ -0,0 +1,549 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/base/os_macros.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/encoding/enc_ma.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/modular/encoding/ma_common.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/modular/transform/transform.h" +#include "lib/jxl/toc.h" + +#if JXL_OS_IOS +#define JXL_ENABLE_DOT 0 +#else +#define JXL_ENABLE_DOT 1 // iOS lacks C89 system() +#endif + +namespace jxl { + +namespace { +// Plot tree (if enabled) and predictor usage map. +constexpr bool kWantDebug = false; +} // namespace + +void GatherTreeData(const Image &image, pixel_type chan, size_t group_id, + const weighted::Header &wp_header, + const ModularOptions &options, TreeSamples &tree_samples, + size_t *total_pixels) { + const Channel &channel = image.channel[chan]; + + JXL_DEBUG_V(7, "Learning %zux%zu channel %d", channel.w, channel.h, chan); + + std::array static_props = {chan, + (int)group_id}; + Properties properties(kNumNonrefProperties + + kExtraPropsPerChannel * options.max_properties); + double pixel_fraction = std::min(1.0f, options.nb_repeats); + // a fraction of 0 is used to disable learning entirely. 
+ if (pixel_fraction > 0) { + pixel_fraction = std::max(pixel_fraction, + std::min(1.0, 1024.0 / (channel.w * channel.h))); + } + uint64_t threshold = + (std::numeric_limits::max() >> 32) * pixel_fraction; + uint64_t s[2] = {0x94D049BB133111EBull, 0xBF58476D1CE4E5B9ull}; + // Xorshift128+ adapted from xorshift128+-inl.h + auto use_sample = [&]() { + auto s1 = s[0]; + const auto s0 = s[1]; + const auto bits = s1 + s0; // b, c + s[0] = s0; + s1 ^= s1 << 23; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s[1] = s1; + return (bits >> 32) <= threshold; + }; + + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + weighted::State wp_state(wp_header, channel.w, channel.h); + tree_samples.PrepareForSamples(pixel_fraction * channel.h * channel.w + 64); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT p = channel.Row(y); + PrecomputeReferences(channel, y, image, chan, &references); + InitPropsRow(&properties, static_props, y); + // TODO(veluca): avoid computing WP if we don't use its property or + // predictions. 
+ for (size_t x = 0; x < channel.w; x++) { + pixel_type_w pred[kNumModularPredictors]; + if (tree_samples.NumPredictors() != 1) { + PredictLearnAll(&properties, channel.w, p + x, onerow, x, y, references, + &wp_state, pred); + } else { + pred[static_cast(tree_samples.PredictorFromIndex(0))] = + PredictLearn(&properties, channel.w, p + x, onerow, x, y, + tree_samples.PredictorFromIndex(0), references, + &wp_state) + .guess; + } + (*total_pixels)++; + if (use_sample()) { + tree_samples.AddSample(p[x], properties, pred); + } + wp_state.UpdateErrors(p[x], x, y, channel.w); + } + } +} + +Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels, + const ModularOptions &options, + const std::vector &multiplier_info = {}, + StaticPropRange static_prop_range = {}) { + for (size_t i = 0; i < kNumStaticProperties; i++) { + if (static_prop_range[i][1] == 0) { + static_prop_range[i][1] = std::numeric_limits::max(); + } + } + if (!tree_samples.HasSamples()) { + Tree tree; + tree.emplace_back(); + tree.back().predictor = tree_samples.PredictorFromIndex(0); + tree.back().property = -1; + tree.back().predictor_offset = 0; + tree.back().multiplier = 1; + return tree; + } + float pixel_fraction = tree_samples.NumSamples() * 1.0f / total_pixels; + float required_cost = pixel_fraction * 0.9 + 0.1; + tree_samples.AllSamplesDone(); + Tree tree; + ComputeBestTree(tree_samples, + options.splitting_heuristics_node_threshold * required_cost, + multiplier_info, static_prop_range, + options.fast_decode_multiplier, &tree); + return tree; +} + +constexpr bool kPrintTree = false; + +void PrintTree(const Tree &tree, const std::string &path) { + if (!kPrintTree) return; + FILE *f = fopen((path + ".dot").c_str(), "w"); + fprintf(f, "graph{\n"); + for (size_t cur = 0; cur < tree.size(); cur++) { + if (tree[cur].property < 0) { + fprintf(f, "n%05zu [label=\"%s%+" PRId64 " (x%u)\"];\n", cur, + PredictorName(tree[cur].predictor), tree[cur].predictor_offset, + tree[cur].multiplier); + } else { + 
fprintf(f, "n%05zu [label=\"%s>%d\"];\n", cur, + PropertyName(tree[cur].property).c_str(), tree[cur].splitval); + fprintf(f, "n%05zu -- n%05d;\n", cur, tree[cur].lchild); + fprintf(f, "n%05zu -- n%05d;\n", cur, tree[cur].rchild); + } + } + fprintf(f, "}\n"); + fclose(f); +#if JXL_ENABLE_DOT + JXL_ASSERT( + system(("dot " + path + ".dot -T svg -o " + path + ".svg").c_str()) == 0); +#endif +} + +Status EncodeModularChannelMAANS(const Image &image, pixel_type chan, + const weighted::Header &wp_header, + const Tree &global_tree, + std::vector *tokens, AuxOut *aux_out, + size_t group_id, bool skip_encoder_fast_path) { + const Channel &channel = image.channel[chan]; + + JXL_ASSERT(channel.w != 0 && channel.h != 0); + + Image3F predictor_img; + if (kWantDebug) predictor_img = Image3F(channel.w, channel.h); + + JXL_DEBUG_V(6, + "Encoding %zux%zu channel %d, " + "(shift=%i,%i)", + channel.w, channel.h, chan, channel.hshift, channel.vshift); + + std::array static_props = {chan, + (int)group_id}; + bool use_wp, is_wp_only; + bool is_gradient_only; + size_t num_props; + FlatTree tree = FilterTree(global_tree, static_props, &num_props, &use_wp, + &is_wp_only, &is_gradient_only); + Properties properties(num_props); + MATreeLookup tree_lookup(tree); + JXL_DEBUG_V(3, "Encoding using a MA tree with %zu nodes", tree.size()); + + // Check if this tree is a WP-only tree with a small enough property value + // range. + // Initialized to avoid clang-tidy complaining. 
+ uint16_t context_lookup[2 * kPropRangeFast] = {}; + int8_t offsets[2 * kPropRangeFast] = {}; + if (is_wp_only) { + is_wp_only = TreeToLookupTable(tree, context_lookup, offsets); + } + if (is_gradient_only) { + is_gradient_only = TreeToLookupTable(tree, context_lookup, offsets); + } + + tokens->reserve(tokens->size() + channel.w * channel.h); + if (is_wp_only && !skip_encoder_fast_path) { + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(Predictor::Weighted)[c]), + &predictor_img.Plane(c)); + } + const intptr_t onerow = channel.plane.PixelsPerRow(); + weighted::State wp_state(wp_header, channel.w, channel.h); + Properties properties(1); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + size_t offset = 0; + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + pixel_type_w topright = + (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top); + pixel_type_w toptop = (y > 1 ? 
*(r + x - onerow - onerow) : top); + int32_t guess = wp_state.Predict( + x, y, channel.w, top, left, topright, topleft, toptop, &properties, + offset); + uint32_t pos = + kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]), + kPropRangeFast - 1); + uint32_t ctx_id = context_lookup[pos]; + int32_t residual = r[x] - guess - offsets[pos]; + tokens->emplace_back(ctx_id, PackSigned(residual)); + wp_state.UpdateErrors(r[x], x, y, channel.w); + } + } + } else if (tree.size() == 1 && tree[0].predictor == Predictor::Gradient && + tree[0].multiplier == 1 && tree[0].predictor_offset == 0 && + !skip_encoder_fast_path) { + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(Predictor::Gradient)[c]), + &predictor_img.Plane(c)); + } + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + int32_t guess = ClampedGradient(top, left, topleft); + int32_t residual = r[x] - guess; + tokens->emplace_back(tree[0].childID, PackSigned(residual)); + } + } + } else if (is_gradient_only && !skip_encoder_fast_path) { + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(Predictor::Gradient)[c]), + &predictor_img.Plane(c)); + } + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? 
*(r + x - 1 - onerow) : left); + int32_t guess = ClampedGradient(top, left, topleft); + uint32_t pos = + kPropRangeFast + + std::min( + std::max(-kPropRangeFast, top + left - topleft), + kPropRangeFast - 1); + uint32_t ctx_id = context_lookup[pos]; + int32_t residual = r[x] - guess - offsets[pos]; + tokens->emplace_back(ctx_id, PackSigned(residual)); + } + } + } else if (tree.size() == 1 && tree[0].predictor == Predictor::Zero && + tree[0].multiplier == 1 && tree[0].predictor_offset == 0 && + !skip_encoder_fast_path) { + for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(Predictor::Zero)[c]), + &predictor_img.Plane(c)); + } + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT p = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + tokens->emplace_back(tree[0].childID, PackSigned(p[x])); + } + } + } else if (tree.size() == 1 && tree[0].predictor != Predictor::Weighted && + (tree[0].multiplier & (tree[0].multiplier - 1)) == 0 && + tree[0].predictor_offset == 0 && !skip_encoder_fast_path) { + // multiplier is a power of 2. 
+ for (size_t c = 0; c < 3; c++) { + FillImage(static_cast(PredictorColor(tree[0].predictor)[c]), + &predictor_img.Plane(c)); + } + uint32_t mul_shift = FloorLog2Nonzero((uint32_t)tree[0].multiplier); + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult pred = PredictNoTreeNoWP(channel.w, r + x, onerow, x, + y, tree[0].predictor); + pixel_type_w residual = r[x] - pred.guess; + JXL_DASSERT((residual >> mul_shift) * tree[0].multiplier == residual); + tokens->emplace_back(tree[0].childID, + PackSigned(residual >> mul_shift)); + } + } + + } else if (!use_wp && !skip_encoder_fast_path) { + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT p = channel.Row(y); + PrecomputeReferences(channel, y, image, chan, &references); + float *pred_img_row[3]; + if (kWantDebug) { + for (size_t c = 0; c < 3; c++) { + pred_img_row[c] = predictor_img.PlaneRow(c, y); + } + } + InitPropsRow(&properties, static_props, y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult res = + PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references); + if (kWantDebug) { + for (size_t i = 0; i < 3; i++) { + pred_img_row[i][x] = PredictorColor(res.predictor)[i]; + } + } + pixel_type_w residual = p[x] - res.guess; + JXL_ASSERT(residual % res.multiplier == 0); + tokens->emplace_back(res.context, + PackSigned(residual / res.multiplier)); + } + } + } else { + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + weighted::State wp_state(wp_header, channel.w, channel.h); + for (size_t y = 0; y < channel.h; y++) { + const pixel_type *JXL_RESTRICT p = channel.Row(y); + 
PrecomputeReferences(channel, y, image, chan, &references); + float *pred_img_row[3]; + if (kWantDebug) { + for (size_t c = 0; c < 3; c++) { + pred_img_row[c] = predictor_img.PlaneRow(c, y); + } + } + InitPropsRow(&properties, static_props, y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult res = + PredictTreeWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references, &wp_state); + if (kWantDebug) { + for (size_t i = 0; i < 3; i++) { + pred_img_row[i][x] = PredictorColor(res.predictor)[i]; + } + } + pixel_type_w residual = p[x] - res.guess; + JXL_ASSERT(residual % res.multiplier == 0); + tokens->emplace_back(res.context, + PackSigned(residual / res.multiplier)); + wp_state.UpdateErrors(p[x], x, y, channel.w); + } + } + } + if (kWantDebug && WantDebugOutput(aux_out)) { + aux_out->DumpImage( + ("pred_" + ToString(group_id) + "_" + ToString(chan)).c_str(), + predictor_img); + } + return true; +} + +Status ModularEncode(const Image &image, const ModularOptions &options, + BitWriter *writer, AuxOut *aux_out, size_t layer, + size_t group_id, TreeSamples *tree_samples, + size_t *total_pixels, const Tree *tree, + GroupHeader *header, std::vector *tokens, + size_t *width) { + if (image.error) return JXL_FAILURE("Invalid image"); + size_t nb_channels = image.channel.size(); + JXL_DEBUG_V(2, "Encoding %zu-channel, %i-bit, %zux%zu image.", nb_channels, + image.bitdepth, image.w, image.h); + + if (nb_channels < 1) { + return true; // is there any use for a zero-channel image? 
+ } + + // encode transforms + GroupHeader header_storage; + if (header == nullptr) header = &header_storage; + Bundle::Init(header); + if (options.predictor == Predictor::Weighted) { + weighted::PredictorMode(options.wp_mode, &header->wp_header); + } + header->transforms = image.transform; + // This doesn't actually work + if (tree != nullptr) { + header->use_global_tree = true; + } + if (tree_samples == nullptr && tree == nullptr) { + JXL_RETURN_IF_ERROR(Bundle::Write(*header, writer, layer, aux_out)); + } + + TreeSamples tree_samples_storage; + size_t total_pixels_storage = 0; + if (!total_pixels) total_pixels = &total_pixels_storage; + // If there's no tree, compute one (or gather data to). + if (tree == nullptr) { + bool gather_data = tree_samples != nullptr; + if (tree_samples == nullptr) { + JXL_RETURN_IF_ERROR(tree_samples_storage.SetPredictor( + options.predictor, options.wp_tree_mode)); + JXL_RETURN_IF_ERROR(tree_samples_storage.SetProperties( + options.splitting_heuristics_properties, options.wp_tree_mode)); + std::vector pixel_samples; + std::vector diff_samples; + std::vector group_pixel_count; + std::vector channel_pixel_count; + CollectPixelSamples(image, options, 0, group_pixel_count, + channel_pixel_count, pixel_samples, diff_samples); + std::vector dummy_multiplier_info; + StaticPropRange range; + tree_samples_storage.PreQuantizeProperties( + range, dummy_multiplier_info, group_pixel_count, channel_pixel_count, + pixel_samples, diff_samples, options.max_property_values); + } + for (size_t i = 0; i < nb_channels; i++) { + if (!image.channel[i].w || !image.channel[i].h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + break; + } + GatherTreeData(image, i, group_id, header->wp_header, options, + gather_data ? 
*tree_samples : tree_samples_storage, + total_pixels); + } + if (gather_data) return true; + } + + JXL_ASSERT((tree == nullptr) == (tokens == nullptr)); + + Tree tree_storage; + std::vector> tokens_storage(1); + // Compute tree. + if (tree == nullptr) { + EntropyEncodingData code; + std::vector context_map; + + std::vector> tree_tokens(1); + tree_storage = + LearnTree(std::move(tree_samples_storage), *total_pixels, options); + tree = &tree_storage; + tokens = &tokens_storage[0]; + + Tree decoded_tree; + TokenizeTree(*tree, &tree_tokens[0], &decoded_tree); + JXL_ASSERT(tree->size() == decoded_tree.size()); + tree_storage = std::move(decoded_tree); + + if (kWantDebug && WantDebugOutput(aux_out)) { + PrintTree(*tree, aux_out->debug_prefix + "/tree_" + ToString(group_id)); + } + // Write tree + BuildAndEncodeHistograms(HistogramParams(), kNumTreeContexts, tree_tokens, + &code, &context_map, writer, kLayerModularTree, + aux_out); + WriteTokens(tree_tokens[0], code, context_map, writer, kLayerModularTree, + aux_out); + } + + size_t image_width = 0; + for (size_t i = 0; i < nb_channels; i++) { + if (!image.channel[i].w || !image.channel[i].h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + break; + } + if (image.channel[i].w > image_width) image_width = image.channel[i].w; + if (options.zero_tokens) { + tokens->resize(tokens->size() + image.channel[i].w * image.channel[i].h, + {0, 0}); + } else { + JXL_RETURN_IF_ERROR(EncodeModularChannelMAANS( + image, i, header->wp_header, *tree, tokens, aux_out, group_id, + options.skip_encoder_fast_path)); + } + } + + // Write data if not using a global tree/ANS stream. 
+ if (!header->use_global_tree) { + EntropyEncodingData code; + std::vector context_map; + HistogramParams histo_params; + histo_params.image_widths.push_back(image_width); + BuildAndEncodeHistograms(histo_params, (tree->size() + 1) / 2, + tokens_storage, &code, &context_map, writer, layer, + aux_out); + WriteTokens(tokens_storage[0], code, context_map, writer, layer, aux_out); + } else { + *width = image_width; + } + return true; +} + +Status ModularGenericCompress(Image &image, const ModularOptions &opts, + BitWriter *writer, AuxOut *aux_out, size_t layer, + size_t group_id, TreeSamples *tree_samples, + size_t *total_pixels, const Tree *tree, + GroupHeader *header, std::vector *tokens, + size_t *width) { + if (image.w == 0 || image.h == 0) return true; + ModularOptions options = opts; // Make a copy to modify it. + + if (options.predictor == static_cast(-1)) { + options.predictor = Predictor::Gradient; + } + + size_t bits = writer ? writer->BitsWritten() : 0; + JXL_RETURN_IF_ERROR(ModularEncode(image, options, writer, aux_out, layer, + group_id, tree_samples, total_pixels, tree, + header, tokens, width)); + bits = writer ? writer->BitsWritten() - bits : 0; + if (writer) { + JXL_DEBUG_V( + 4, + "Modular-encoded a %zux%zu bitdepth=%i nbchans=%zu image in %zu bytes", + image.w, image.h, image.bitdepth, image.channel.size(), bits / 8); + } + (void)bits; + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.h new file mode 100644 index 0000000000..9c083e9575 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_encoding.h @@ -0,0 +1,49 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
+#define LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/enc_ma.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+// Debug helper: writes `tree` as a Graphviz graph to `path` + ".dot" and,
+// where JXL_ENABLE_DOT is set, runs `dot` to render `path` + ".svg". Returns
+// immediately unless the compile-time flag kPrintTree is enabled.
+void PrintTree(const Tree &tree, const std::string &path);
+
+// Builds an MA tree from the gathered `tree_samples` (consumed by the call).
+// `total_pixels` is the number of pixels visited while gathering; the share of
+// them that was kept as samples scales
+// options.splitting_heuristics_node_threshold. An upper bound of 0 in
+// `static_prop_range` is treated as "no upper bound". When there are no
+// samples the result is a single leaf using the first configured predictor.
+Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels,
+               const ModularOptions &options,
+               const std::vector<ModularMultiplierInfo> &multiplier_info = {},
+               StaticPropRange static_prop_range = {});
+
+// TODO(veluca): make cleaner interfaces.
+
+// Modular-encodes `image`; an image with zero width or height is a no-op that
+// returns true, and an unset predictor (-1) in `opts` is replaced by
+// Predictor::Gradient. The optional pointers select one of three modes:
+//  - `tree_samples` set, `tree` null: only gathers tree-learning samples into
+//    `*tree_samples` (and counts pixels in `*total_pixels`); nothing is
+//    written.
+//  - `tree` set: encodes with that global tree, appends the tokens to
+//    `*tokens` and stores the largest encoded channel width in `*widths`;
+//    `tokens` must be non-null exactly when `tree` is.
+//  - both null: learns a tree for this image and writes header, tree and
+//    tokens to `writer`.
+Status ModularGenericCompress(
+    Image &image, const ModularOptions &opts, BitWriter *writer,
+    AuxOut *aux_out = nullptr, size_t layer = 0, size_t group_id = 0,
+    // For gathering data for producing a global tree.
+    TreeSamples *tree_samples = nullptr, size_t *total_pixels = nullptr,
+    // For encoding with global tree.
+    const Tree *tree = nullptr, GroupHeader *header = nullptr,
+    std::vector<Token> *tokens = nullptr, size_t *widths = nullptr);
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc
new file mode 100644
index 0000000000..0e2eaac71f
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.cc
@@ -0,0 +1,1043 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/encoding/enc_ma.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "lib/jxl/modular/encoding/ma_common.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/modular/encoding/enc_ma.cc" +#include +#include + +#ifndef LIB_JXL_ENC_MODULAR_ENCODING_MA_ +#define LIB_JXL_ENC_MODULAR_ENCODING_MA_ +namespace { +struct Rng { + uint64_t s[2]; + explicit Rng(size_t seed) + : s{0x94D049BB133111EBull, 0xBF58476D1CE4E5B9ull + seed} {} + // Xorshift128+ adapted from xorshift128+-inl.h + uint64_t operator()() { + uint64_t s1 = s[0]; + const uint64_t s0 = s[1]; + const uint64_t bits = s1 + s0; // b, c + s[0] = s0; + s1 ^= s1 << 23; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s[1] = s1; + return bits; + } + static constexpr uint64_t max() { return ~0ULL; } + static constexpr uint64_t min() { return 0; } +}; +} // namespace +#endif + +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/fast_math-inl.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/options.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +const HWY_FULL(float) df; +const HWY_FULL(int32_t) di; +size_t Padded(size_t x) { return RoundUpTo(x, Lanes(df)); } + +float EstimateBits(const int32_t *counts, int32_t *rounded_counts, + size_t num_symbols) { + // Try to approximate the effect of rounding up nonzero probabilities. + int32_t total = std::accumulate(counts, counts + num_symbols, 0); + const auto min = Set(di, (total + ANS_TAB_SIZE - 1) >> ANS_LOG_TAB_SIZE); + const auto zero_i = Zero(di); + for (size_t i = 0; i < num_symbols; i += Lanes(df)) { + auto counts_v = LoadU(di, &counts[i]); + counts_v = IfThenElse(counts_v == zero_i, zero_i, + IfThenElse(counts_v < min, min, counts_v)); + StoreU(counts_v, di, &rounded_counts[i]); + } + // Compute entropy of the "rounded" probabilities. 
+ const auto zero = Zero(df); + const size_t total_scalar = + std::accumulate(rounded_counts, rounded_counts + num_symbols, 0); + const auto inv_total = Set(df, 1.0f / total_scalar); + auto bits_lanes = Zero(df); + auto total_v = Set(di, total_scalar); + for (size_t i = 0; i < num_symbols; i += Lanes(df)) { + const auto counts_v = ConvertTo(df, LoadU(di, &counts[i])); + const auto round_counts_v = LoadU(di, &rounded_counts[i]); + const auto probs = ConvertTo(df, round_counts_v) * inv_total; + const auto nbps = IfThenElse(round_counts_v == total_v, BitCast(di, zero), + BitCast(di, FastLog2f(df, probs))); + bits_lanes -= + IfThenElse(counts_v == zero, zero, counts_v * BitCast(df, nbps)); + } + return GetLane(SumOfLanes(bits_lanes)); +} + +void MakeSplitNode(size_t pos, int property, int splitval, Predictor lpred, + int64_t loff, Predictor rpred, int64_t roff, Tree *tree) { + // Note that the tree splits on *strictly greater*. + (*tree)[pos].lchild = tree->size(); + (*tree)[pos].rchild = tree->size() + 1; + (*tree)[pos].splitval = splitval; + (*tree)[pos].property = property; + tree->emplace_back(); + tree->back().property = -1; + tree->back().predictor = rpred; + tree->back().predictor_offset = roff; + tree->back().multiplier = 1; + tree->emplace_back(); + tree->back().property = -1; + tree->back().predictor = lpred; + tree->back().predictor_offset = loff; + tree->back().multiplier = 1; +} + +enum class IntersectionType { kNone, kPartial, kInside }; +IntersectionType BoxIntersects(StaticPropRange needle, StaticPropRange haystack, + uint32_t &partial_axis, uint32_t &partial_val) { + bool partial = false; + for (size_t i = 0; i < kNumStaticProperties; i++) { + if (haystack[i][0] >= needle[i][1]) { + return IntersectionType::kNone; + } + if (haystack[i][1] <= needle[i][0]) { + return IntersectionType::kNone; + } + if (haystack[i][0] <= needle[i][0] && haystack[i][1] >= needle[i][1]) { + continue; + } + partial = true; + partial_axis = i; + if (haystack[i][0] > 
needle[i][0] && haystack[i][0] < needle[i][1]) { + partial_val = haystack[i][0] - 1; + } else { + JXL_DASSERT(haystack[i][1] > needle[i][0] && + haystack[i][1] < needle[i][1]); + partial_val = haystack[i][1] - 1; + } + } + return partial ? IntersectionType::kPartial : IntersectionType::kInside; +} + +void SplitTreeSamples(TreeSamples &tree_samples, size_t begin, size_t pos, + size_t end, size_t prop) { + auto cmp = [&](size_t a, size_t b) { + return int32_t(tree_samples.Property(prop, a)) - + int32_t(tree_samples.Property(prop, b)); + }; + Rng rng(0); + while (end > begin + 1) { + { + JXL_ASSERT(end > begin); // silence clang-tidy. + size_t pivot = rng() % (end - begin) + begin; + tree_samples.Swap(begin, pivot); + } + size_t pivot_begin = begin; + size_t pivot_end = pivot_begin + 1; + for (size_t i = begin + 1; i < end; i++) { + JXL_DASSERT(i >= pivot_end); + JXL_DASSERT(pivot_end > pivot_begin); + int32_t cmp_result = cmp(i, pivot_begin); + if (cmp_result < 0) { // i < pivot, move pivot forward and put i before + // the pivot. + tree_samples.ThreeShuffle(pivot_begin, pivot_end, i); + pivot_begin++; + pivot_end++; + } else if (cmp_result == 0) { + tree_samples.Swap(pivot_end, i); + pivot_end++; + } + } + JXL_DASSERT(pivot_begin >= begin); + JXL_DASSERT(pivot_end > pivot_begin); + JXL_DASSERT(pivot_end <= end); + for (size_t i = begin; i < pivot_begin; i++) { + JXL_DASSERT(cmp(i, pivot_begin) < 0); + } + for (size_t i = pivot_end; i < end; i++) { + JXL_DASSERT(cmp(i, pivot_begin) > 0); + } + for (size_t i = pivot_begin; i < pivot_end; i++) { + JXL_DASSERT(cmp(i, pivot_begin) == 0); + } + // We now have that [begin, pivot_begin) is < pivot, [pivot_begin, + // pivot_end) is = pivot, and [pivot_end, end) is > pivot. + // If pos falls in the first or the last interval, we continue in that + // interval; otherwise, we are done. 
+ if (pivot_begin > pos) { + end = pivot_begin; + } else if (pivot_end < pos) { + begin = pivot_end; + } else { + break; + } + } +} + +void FindBestSplit(TreeSamples &tree_samples, float threshold, + const std::vector &mul_info, + StaticPropRange initial_static_prop_range, + float fast_decode_multiplier, Tree *tree) { + struct NodeInfo { + size_t pos; + size_t begin; + size_t end; + uint64_t used_properties; + StaticPropRange static_prop_range; + }; + std::vector nodes; + nodes.push_back(NodeInfo{0, 0, tree_samples.NumDistinctSamples(), 0, + initial_static_prop_range}); + + size_t num_predictors = tree_samples.NumPredictors(); + size_t num_properties = tree_samples.NumProperties(); + + // TODO(veluca): consider parallelizing the search (processing multiple nodes + // at a time). + while (!nodes.empty()) { + size_t pos = nodes.back().pos; + size_t begin = nodes.back().begin; + size_t end = nodes.back().end; + uint64_t used_properties = nodes.back().used_properties; + StaticPropRange static_prop_range = nodes.back().static_prop_range; + nodes.pop_back(); + if (begin == end) continue; + + struct SplitInfo { + size_t prop = 0; + uint32_t val = 0; + size_t pos = 0; + float lcost = std::numeric_limits::max(); + float rcost = std::numeric_limits::max(); + Predictor lpred = Predictor::Zero; + Predictor rpred = Predictor::Zero; + float Cost() { return lcost + rcost; } + }; + + SplitInfo best_split_static_constant; + SplitInfo best_split_static; + SplitInfo best_split_nonstatic; + SplitInfo best_split_nowp; + + JXL_DASSERT(begin <= end); + JXL_DASSERT(end <= tree_samples.NumDistinctSamples()); + + // Compute the maximum token in the range. + size_t max_symbols = 0; + for (size_t pred = 0; pred < num_predictors; pred++) { + for (size_t i = begin; i < end; i++) { + uint32_t tok = tree_samples.Token(pred, i); + max_symbols = max_symbols > tok + 1 ? 
max_symbols : tok + 1; + } + } + max_symbols = Padded(max_symbols); + std::vector rounded_counts(max_symbols); + std::vector counts(max_symbols * num_predictors); + std::vector tot_extra_bits(num_predictors); + for (size_t pred = 0; pred < num_predictors; pred++) { + for (size_t i = begin; i < end; i++) { + counts[pred * max_symbols + tree_samples.Token(pred, i)] += + tree_samples.Count(i); + tot_extra_bits[pred] += + tree_samples.NBits(pred, i) * tree_samples.Count(i); + } + } + + float base_bits; + { + size_t pred = tree_samples.PredictorIndex((*tree)[pos].predictor); + base_bits = EstimateBits(counts.data() + pred * max_symbols, + rounded_counts.data(), max_symbols) + + tot_extra_bits[pred]; + } + + SplitInfo *best = &best_split_nonstatic; + + SplitInfo forced_split; + // The multiplier ranges cut halfway through the current ranges of static + // properties. We do this even if the current node is not a leaf, to + // minimize the number of nodes in the resulting tree. + for (size_t i = 0; i < mul_info.size(); i++) { + uint32_t axis, val; + IntersectionType t = + BoxIntersects(static_prop_range, mul_info[i].range, axis, val); + if (t == IntersectionType::kNone) continue; + if (t == IntersectionType::kInside) { + (*tree)[pos].multiplier = mul_info[i].multiplier; + break; + } + if (t == IntersectionType::kPartial) { + forced_split.val = tree_samples.QuantizeProperty(axis, val); + forced_split.prop = axis; + forced_split.lcost = forced_split.rcost = base_bits / 2 - threshold; + forced_split.lpred = forced_split.rpred = (*tree)[pos].predictor; + best = &forced_split; + best->pos = begin; + JXL_ASSERT(best->prop == tree_samples.PropertyFromIndex(best->prop)); + for (size_t x = begin; x < end; x++) { + if (tree_samples.Property(best->prop, x) <= best->val) { + best->pos++; + } + } + break; + } + } + + if (best != &forced_split) { + std::vector prop_value_used_count; + std::vector count_increase; + std::vector extra_bits_increase; + // For each property, compute which of 
its values are used, and what + // tokens correspond to those usages. Then, iterate through the values, + // and compute the entropy of each side of the split (of the form `prop > + // threshold`). Finally, find the split that minimizes the cost. + struct CostInfo { + float cost = std::numeric_limits::max(); + float extra_cost = 0; + float Cost() const { return cost + extra_cost; } + Predictor pred; // will be uninitialized in some cases, but never used. + }; + std::vector costs_l; + std::vector costs_r; + + std::vector counts_above(max_symbols); + std::vector counts_below(max_symbols); + + // The lower the threshold, the higher the expected noisiness of the + // estimate. Thus, discourage changing predictors. + float change_pred_penalty = 800.0f / (100.0f + threshold); + for (size_t prop = 0; prop < num_properties && base_bits > threshold; + prop++) { + costs_l.clear(); + costs_r.clear(); + size_t prop_size = tree_samples.NumPropertyValues(prop); + if (extra_bits_increase.size() < prop_size) { + count_increase.resize(prop_size * max_symbols); + extra_bits_increase.resize(prop_size); + } + // Clear prop_value_used_count (which cannot be cleared "on the go") + prop_value_used_count.clear(); + prop_value_used_count.resize(prop_size); + + size_t first_used = prop_size; + size_t last_used = 0; + + // TODO(veluca): consider finding multiple splits along a single + // property at the same time, possibly with a bottom-up approach. + for (size_t i = begin; i < end; i++) { + size_t p = tree_samples.Property(prop, i); + prop_value_used_count[p]++; + last_used = std::max(last_used, p); + first_used = std::min(first_used, p); + } + costs_l.resize(last_used - first_used); + costs_r.resize(last_used - first_used); + // For all predictors, compute the right and left costs of each split. + for (size_t pred = 0; pred < num_predictors; pred++) { + // Compute cost and histogram increments for each property value. 
+ for (size_t i = begin; i < end; i++) { + size_t p = tree_samples.Property(prop, i); + size_t cnt = tree_samples.Count(i); + size_t sym = tree_samples.Token(pred, i); + count_increase[p * max_symbols + sym] += cnt; + extra_bits_increase[p] += tree_samples.NBits(pred, i) * cnt; + } + memcpy(counts_above.data(), counts.data() + pred * max_symbols, + max_symbols * sizeof counts_above[0]); + memset(counts_below.data(), 0, max_symbols * sizeof counts_below[0]); + size_t extra_bits_below = 0; + // Exclude last used: this ensures neither counts_above nor + // counts_below is empty. + for (size_t i = first_used; i < last_used; i++) { + if (!prop_value_used_count[i]) continue; + extra_bits_below += extra_bits_increase[i]; + // The increase for this property value has been used, and will not + // be used again: clear it. Also below. + extra_bits_increase[i] = 0; + for (size_t sym = 0; sym < max_symbols; sym++) { + counts_above[sym] -= count_increase[i * max_symbols + sym]; + counts_below[sym] += count_increase[i * max_symbols + sym]; + count_increase[i * max_symbols + sym] = 0; + } + float rcost = EstimateBits(counts_above.data(), + rounded_counts.data(), max_symbols) + + tot_extra_bits[pred] - extra_bits_below; + float lcost = EstimateBits(counts_below.data(), + rounded_counts.data(), max_symbols) + + extra_bits_below; + JXL_DASSERT(extra_bits_below <= tot_extra_bits[pred]); + float penalty = 0; + // Never discourage moving away from the Weighted predictor. 
+ if (tree_samples.PredictorFromIndex(pred) != + (*tree)[pos].predictor && + (*tree)[pos].predictor != Predictor::Weighted) { + penalty = change_pred_penalty; + } + // If everything else is equal, disfavour Weighted (slower) and + // favour Zero (faster if it's the only predictor used in a + // group+channel combination) + if (tree_samples.PredictorFromIndex(pred) == Predictor::Weighted) { + penalty += 1e-8; + } + if (tree_samples.PredictorFromIndex(pred) == Predictor::Zero) { + penalty -= 1e-8; + } + if (rcost + penalty < costs_r[i - first_used].Cost()) { + costs_r[i - first_used].cost = rcost; + costs_r[i - first_used].extra_cost = penalty; + costs_r[i - first_used].pred = + tree_samples.PredictorFromIndex(pred); + } + if (lcost + penalty < costs_l[i - first_used].Cost()) { + costs_l[i - first_used].cost = lcost; + costs_l[i - first_used].extra_cost = penalty; + costs_l[i - first_used].pred = + tree_samples.PredictorFromIndex(pred); + } + } + } + // Iterate through the possible splits and find the one with minimum sum + // of costs of the two sides. + size_t split = begin; + for (size_t i = first_used; i < last_used; i++) { + if (!prop_value_used_count[i]) continue; + split += prop_value_used_count[i]; + float rcost = costs_r[i - first_used].cost; + float lcost = costs_l[i - first_used].cost; + // WP was not used + we would use the WP property or predictor + bool adds_wp = + (tree_samples.PropertyFromIndex(prop) == kWPProp && + (used_properties & (1LU << prop)) == 0) || + ((costs_l[i - first_used].pred == Predictor::Weighted || + costs_r[i - first_used].pred == Predictor::Weighted) && + (*tree)[pos].predictor != Predictor::Weighted); + bool zero_entropy_side = rcost == 0 || lcost == 0; + + SplitInfo &best = + prop < kNumStaticProperties + ? (zero_entropy_side ? best_split_static_constant + : best_split_static) + : (adds_wp ? 
best_split_nonstatic : best_split_nowp); + if (lcost + rcost < best.Cost()) { + best.prop = prop; + best.val = i; + best.pos = split; + best.lcost = lcost; + best.lpred = costs_l[i - first_used].pred; + best.rcost = rcost; + best.rpred = costs_r[i - first_used].pred; + } + } + // Clear extra_bits_increase and cost_increase for last_used. + extra_bits_increase[last_used] = 0; + for (size_t sym = 0; sym < max_symbols; sym++) { + count_increase[last_used * max_symbols + sym] = 0; + } + } + + // Try to avoid introducing WP. + if (best_split_nowp.Cost() + threshold < base_bits && + best_split_nowp.Cost() <= fast_decode_multiplier * best->Cost()) { + best = &best_split_nowp; + } + // Split along static props if possible and not significantly more + // expensive. + if (best_split_static.Cost() + threshold < base_bits && + best_split_static.Cost() <= fast_decode_multiplier * best->Cost()) { + best = &best_split_static; + } + // Split along static props to create constant nodes if possible. + if (best_split_static_constant.Cost() + threshold < base_bits) { + best = &best_split_static_constant; + } + } + + if (best->Cost() + threshold < base_bits) { + uint32_t p = tree_samples.PropertyFromIndex(best->prop); + pixel_type dequant = + tree_samples.UnquantizeProperty(best->prop, best->val); + // Split node and try to split children. 
+ MakeSplitNode(pos, p, dequant, best->lpred, 0, best->rpred, 0, tree); + // "Sort" according to winning property + SplitTreeSamples(tree_samples, begin, best->pos, end, best->prop); + if (p >= kNumStaticProperties) { + used_properties |= 1 << best->prop; + } + auto new_sp_range = static_prop_range; + if (p < kNumStaticProperties) { + JXL_ASSERT(static_cast(dequant + 1) <= new_sp_range[p][1]); + new_sp_range[p][1] = dequant + 1; + JXL_ASSERT(new_sp_range[p][0] < new_sp_range[p][1]); + } + nodes.push_back(NodeInfo{(*tree)[pos].rchild, begin, best->pos, + used_properties, new_sp_range}); + new_sp_range = static_prop_range; + if (p < kNumStaticProperties) { + JXL_ASSERT(new_sp_range[p][0] <= static_cast(dequant + 1)); + new_sp_range[p][0] = dequant + 1; + JXL_ASSERT(new_sp_range[p][0] < new_sp_range[p][1]); + } + nodes.push_back(NodeInfo{(*tree)[pos].lchild, best->pos, end, + used_properties, new_sp_range}); + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +HWY_EXPORT(FindBestSplit); // Local function. + +void ComputeBestTree(TreeSamples &tree_samples, float threshold, + const std::vector &mul_info, + StaticPropRange static_prop_range, + float fast_decode_multiplier, Tree *tree) { + // TODO(veluca): take into account that different contexts can have different + // uint configs. + // + // Initialize tree. 
+ tree->emplace_back(); + tree->back().property = -1; + tree->back().predictor = tree_samples.PredictorFromIndex(0); + tree->back().predictor_offset = 0; + tree->back().multiplier = 1; + JXL_ASSERT(tree_samples.NumProperties() < 64); + + JXL_ASSERT(tree_samples.NumDistinctSamples() <= + std::numeric_limits::max()); + HWY_DYNAMIC_DISPATCH(FindBestSplit) + (tree_samples, threshold, mul_info, static_prop_range, fast_decode_multiplier, + tree); +} + +constexpr int TreeSamples::kPropertyRange; +constexpr uint32_t TreeSamples::kDedupEntryUnused; + +Status TreeSamples::SetPredictor(Predictor predictor, + ModularOptions::TreeMode wp_tree_mode) { + if (wp_tree_mode == ModularOptions::TreeMode::kWPOnly) { + predictors = {Predictor::Weighted}; + residuals.resize(1); + return true; + } + if (wp_tree_mode == ModularOptions::TreeMode::kNoWP && + predictor == Predictor::Weighted) { + return JXL_FAILURE("Invalid predictor settings"); + } + if (predictor == Predictor::Variable) { + for (size_t i = 0; i < kNumModularPredictors; i++) { + predictors.push_back(static_cast(i)); + } + std::swap(predictors[0], predictors[static_cast(Predictor::Weighted)]); + std::swap(predictors[1], predictors[static_cast(Predictor::Gradient)]); + } else if (predictor == Predictor::Best) { + predictors = {Predictor::Weighted, Predictor::Gradient}; + } else { + predictors = {predictor}; + } + if (wp_tree_mode == ModularOptions::TreeMode::kNoWP) { + auto wp_it = + std::find(predictors.begin(), predictors.end(), Predictor::Weighted); + if (wp_it != predictors.end()) { + predictors.erase(wp_it); + } + } + residuals.resize(predictors.size()); + return true; +} + +Status TreeSamples::SetProperties(const std::vector &properties, + ModularOptions::TreeMode wp_tree_mode) { + props_to_use = properties; + if (wp_tree_mode == ModularOptions::TreeMode::kWPOnly) { + props_to_use = {static_cast(kWPProp)}; + } + if (wp_tree_mode == ModularOptions::TreeMode::kGradientOnly) { + props_to_use = {static_cast(kGradientProp)}; 
+ } + if (wp_tree_mode == ModularOptions::TreeMode::kNoWP) { + auto it = std::find(props_to_use.begin(), props_to_use.end(), kWPProp); + if (it != props_to_use.end()) { + props_to_use.erase(it); + } + } + if (props_to_use.empty()) { + return JXL_FAILURE("Invalid property set configuration"); + } + props.resize(props_to_use.size()); + return true; +} + +void TreeSamples::InitTable(size_t size) { + JXL_DASSERT((size & (size - 1)) == 0); + if (dedup_table_.size() == size) return; + dedup_table_.resize(size, kDedupEntryUnused); + for (size_t i = 0; i < NumDistinctSamples(); i++) { + if (sample_counts[i] != std::numeric_limits::max()) { + AddToTable(i); + } + } +} + +bool TreeSamples::AddToTableAndMerge(size_t a) { + size_t pos1 = Hash1(a); + size_t pos2 = Hash2(a); + if (dedup_table_[pos1] != kDedupEntryUnused && + IsSameSample(a, dedup_table_[pos1])) { + JXL_DASSERT(sample_counts[a] == 1); + sample_counts[dedup_table_[pos1]]++; + // Remove from hash table samples that are saturated. + if (sample_counts[dedup_table_[pos1]] == + std::numeric_limits::max()) { + dedup_table_[pos1] = kDedupEntryUnused; + } + return true; + } + if (dedup_table_[pos2] != kDedupEntryUnused && + IsSameSample(a, dedup_table_[pos2])) { + JXL_DASSERT(sample_counts[a] == 1); + sample_counts[dedup_table_[pos2]]++; + // Remove from hash table samples that are saturated. 
+ if (sample_counts[dedup_table_[pos2]] == + std::numeric_limits::max()) { + dedup_table_[pos2] = kDedupEntryUnused; + } + return true; + } + AddToTable(a); + return false; +} + +void TreeSamples::AddToTable(size_t a) { + size_t pos1 = Hash1(a); + size_t pos2 = Hash2(a); + if (dedup_table_[pos1] == kDedupEntryUnused) { + dedup_table_[pos1] = a; + } else if (dedup_table_[pos2] == kDedupEntryUnused) { + dedup_table_[pos2] = a; + } +} + +void TreeSamples::PrepareForSamples(size_t num_samples) { + for (auto &res : residuals) { + res.reserve(res.size() + num_samples); + } + for (auto &p : props) { + p.reserve(p.size() + num_samples); + } + size_t total_num_samples = num_samples + sample_counts.size(); + size_t next_pow2 = 1LLU << CeilLog2Nonzero(total_num_samples * 3 / 2); + InitTable(next_pow2); +} + +size_t TreeSamples::Hash1(size_t a) const { + constexpr uint64_t constant = 0x1e35a7bd; + uint64_t h = constant; + for (const auto &r : residuals) { + h = h * constant + r[a].tok; + h = h * constant + r[a].nbits; + } + for (const auto &p : props) { + h = h * constant + p[a]; + } + return (h >> 16) & (dedup_table_.size() - 1); +} +size_t TreeSamples::Hash2(size_t a) const { + constexpr uint64_t constant = 0x1e35a7bd1e35a7bd; + uint64_t h = constant; + for (const auto &p : props) { + h = h * constant ^ p[a]; + } + for (const auto &r : residuals) { + h = h * constant ^ r[a].tok; + h = h * constant ^ r[a].nbits; + } + return (h >> 16) & (dedup_table_.size() - 1); +} + +bool TreeSamples::IsSameSample(size_t a, size_t b) const { + bool ret = true; + for (const auto &r : residuals) { + if (r[a].tok != r[b].tok) { + ret = false; + } + if (r[a].nbits != r[b].nbits) { + ret = false; + } + } + for (const auto &p : props) { + if (p[a] != p[b]) { + ret = false; + } + } + return ret; +} + +void TreeSamples::AddSample(pixel_type_w pixel, const Properties &properties, + const pixel_type_w *predictions) { + for (size_t i = 0; i < predictors.size(); i++) { + pixel_type v = pixel - 
predictions[static_cast(predictors[i])]; + uint32_t tok, nbits, bits; + HybridUintConfig(4, 1, 2).Encode(PackSigned(v), &tok, &nbits, &bits); + JXL_DASSERT(tok < 256); + JXL_DASSERT(nbits < 256); + residuals[i].emplace_back( + ResidualToken{static_cast(tok), static_cast(nbits)}); + } + for (size_t i = 0; i < props_to_use.size(); i++) { + props[i].push_back(QuantizeProperty(i, properties[props_to_use[i]])); + } + sample_counts.push_back(1); + num_samples++; + if (AddToTableAndMerge(sample_counts.size() - 1)) { + for (auto &r : residuals) r.pop_back(); + for (auto &p : props) p.pop_back(); + sample_counts.pop_back(); + } +} + +void TreeSamples::Swap(size_t a, size_t b) { + if (a == b) return; + for (auto &r : residuals) { + std::swap(r[a], r[b]); + } + for (auto &p : props) { + std::swap(p[a], p[b]); + } + std::swap(sample_counts[a], sample_counts[b]); +} + +void TreeSamples::ThreeShuffle(size_t a, size_t b, size_t c) { + if (b == c) return Swap(a, b); + for (auto &r : residuals) { + auto tmp = r[a]; + r[a] = r[c]; + r[c] = r[b]; + r[b] = tmp; + } + for (auto &p : props) { + auto tmp = p[a]; + p[a] = p[c]; + p[c] = p[b]; + p[b] = tmp; + } + auto tmp = sample_counts[a]; + sample_counts[a] = sample_counts[c]; + sample_counts[c] = sample_counts[b]; + sample_counts[b] = tmp; +} + +namespace { +std::vector QuantizeHistogram(const std::vector &histogram, + size_t num_chunks) { + if (histogram.empty()) return {}; + // TODO(veluca): selecting distinct quantiles is likely not the best + // way to go about this. 
+ std::vector thresholds; + size_t sum = std::accumulate(histogram.begin(), histogram.end(), 0LU); + size_t cumsum = 0; + size_t threshold = 0; + for (size_t i = 0; i + 1 < histogram.size(); i++) { + cumsum += histogram[i]; + if (cumsum > (threshold + 1) * sum / num_chunks) { + thresholds.push_back(i); + while (cumsum >= (threshold + 1) * sum / num_chunks) threshold++; + } + } + return thresholds; +} + +std::vector QuantizeSamples(const std::vector &samples, + size_t num_chunks) { + if (samples.empty()) return {}; + int min = *std::min_element(samples.begin(), samples.end()); + constexpr int kRange = 512; + min = std::min(std::max(min, -kRange), kRange); + std::vector counts(2 * kRange + 1); + for (int s : samples) { + uint32_t sample_offset = std::min(std::max(s, -kRange), kRange) - min; + counts[sample_offset]++; + } + std::vector thresholds = QuantizeHistogram(counts, num_chunks); + for (auto &v : thresholds) v += min; + return thresholds; +} +} // namespace + +void TreeSamples::PreQuantizeProperties( + const StaticPropRange &range, + const std::vector &multiplier_info, + const std::vector &group_pixel_count, + const std::vector &channel_pixel_count, + std::vector &pixel_samples, + std::vector &diff_samples, size_t max_property_values) { + // If we have forced splits because of multipliers, choose channel and group + // thresholds accordingly. 
+ std::vector group_multiplier_thresholds; + std::vector channel_multiplier_thresholds; + for (const auto &v : multiplier_info) { + if (v.range[0][0] != range[0][0]) { + channel_multiplier_thresholds.push_back(v.range[0][0] - 1); + } + if (v.range[0][1] != range[0][1]) { + channel_multiplier_thresholds.push_back(v.range[0][1] - 1); + } + if (v.range[1][0] != range[1][0]) { + group_multiplier_thresholds.push_back(v.range[1][0] - 1); + } + if (v.range[1][1] != range[1][1]) { + group_multiplier_thresholds.push_back(v.range[1][1] - 1); + } + } + std::sort(channel_multiplier_thresholds.begin(), + channel_multiplier_thresholds.end()); + channel_multiplier_thresholds.resize( + std::unique(channel_multiplier_thresholds.begin(), + channel_multiplier_thresholds.end()) - + channel_multiplier_thresholds.begin()); + std::sort(group_multiplier_thresholds.begin(), + group_multiplier_thresholds.end()); + group_multiplier_thresholds.resize( + std::unique(group_multiplier_thresholds.begin(), + group_multiplier_thresholds.end()) - + group_multiplier_thresholds.begin()); + + compact_properties.resize(props_to_use.size()); + auto quantize_channel = [&]() { + if (!channel_multiplier_thresholds.empty()) { + return channel_multiplier_thresholds; + } + return QuantizeHistogram(channel_pixel_count, max_property_values); + }; + auto quantize_group_id = [&]() { + if (!group_multiplier_thresholds.empty()) { + return group_multiplier_thresholds; + } + return QuantizeHistogram(group_pixel_count, max_property_values); + }; + auto quantize_coordinate = [&]() { + std::vector quantized; + quantized.reserve(max_property_values - 1); + for (size_t i = 0; i + 1 < max_property_values; i++) { + quantized.push_back((i + 1) * 256 / max_property_values - 1); + } + return quantized; + }; + std::vector abs_pixel_thr; + std::vector pixel_thr; + auto quantize_pixel_property = [&]() { + if (pixel_thr.empty()) { + pixel_thr = QuantizeSamples(pixel_samples, max_property_values); + } + return pixel_thr; + }; + auto 
quantize_abs_pixel_property = [&]() { + if (abs_pixel_thr.empty()) { + quantize_pixel_property(); // Compute the non-abs thresholds. + for (auto &v : pixel_samples) v = std::abs(v); + abs_pixel_thr = QuantizeSamples(pixel_samples, max_property_values); + } + return abs_pixel_thr; + }; + std::vector abs_diff_thr; + std::vector diff_thr; + auto quantize_diff_property = [&]() { + if (diff_thr.empty()) { + diff_thr = QuantizeSamples(diff_samples, max_property_values); + } + return diff_thr; + }; + auto quantize_abs_diff_property = [&]() { + if (abs_diff_thr.empty()) { + quantize_diff_property(); // Compute the non-abs thresholds. + for (auto &v : diff_samples) v = std::abs(v); + abs_diff_thr = QuantizeSamples(diff_samples, max_property_values); + } + return abs_diff_thr; + }; + auto quantize_wp = [&]() { + if (max_property_values < 32) { + return std::vector{-127, -63, -31, -15, -7, -3, -1, 0, + 1, 3, 7, 15, 31, 63, 127}; + } + if (max_property_values < 64) { + return std::vector{-255, -191, -127, -95, -63, -47, -31, -23, + -15, -11, -7, -5, -3, -1, 0, 1, + 3, 5, 7, 11, 15, 23, 31, 47, + 63, 95, 127, 191, 255}; + } + return std::vector{ + -255, -223, -191, -159, -127, -111, -95, -79, -63, -55, -47, + -39, -31, -27, -23, -19, -15, -13, -11, -9, -7, -6, + -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, + 6, 7, 9, 11, 13, 15, 19, 23, 27, 31, 39, + 47, 55, 63, 79, 95, 111, 127, 159, 191, 223, 255}; + }; + + property_mapping.resize(props_to_use.size()); + for (size_t i = 0; i < props_to_use.size(); i++) { + if (props_to_use[i] == 0) { + compact_properties[i] = quantize_channel(); + } else if (props_to_use[i] == 1) { + compact_properties[i] = quantize_group_id(); + } else if (props_to_use[i] == 2 || props_to_use[i] == 3) { + compact_properties[i] = quantize_coordinate(); + } else if (props_to_use[i] == 6 || props_to_use[i] == 7 || + props_to_use[i] == 8 || + (props_to_use[i] >= kNumNonrefProperties && + (props_to_use[i] - kNumNonrefProperties) % 4 == 1)) { + compact_properties[i] = 
quantize_pixel_property(); + } else if (props_to_use[i] == 4 || props_to_use[i] == 5 || + (props_to_use[i] >= kNumNonrefProperties && + (props_to_use[i] - kNumNonrefProperties) % 4 == 0)) { + compact_properties[i] = quantize_abs_pixel_property(); + } else if (props_to_use[i] >= kNumNonrefProperties && + (props_to_use[i] - kNumNonrefProperties) % 4 == 2) { + compact_properties[i] = quantize_abs_diff_property(); + } else if (props_to_use[i] == kWPProp) { + compact_properties[i] = quantize_wp(); + } else { + compact_properties[i] = quantize_diff_property(); + } + property_mapping[i].resize(kPropertyRange * 2 + 1); + size_t mapped = 0; + for (size_t j = 0; j < property_mapping[i].size(); j++) { + while (mapped < compact_properties[i].size() && + static_cast(j) - kPropertyRange > + compact_properties[i][mapped]) { + mapped++; + } + // property_mapping[i] of a value V is `mapped` if + // compact_properties[i][mapped] <= j and + // compact_properties[i][mapped-1] > j + // This is because the decision node in the tree splits on (property) > j, + // hence everything that is not > of a threshold should be clustered + // together. + property_mapping[i][j] = mapped; + } + } +} + +void CollectPixelSamples(const Image &image, const ModularOptions &options, + size_t group_id, + std::vector &group_pixel_count, + std::vector &channel_pixel_count, + std::vector &pixel_samples, + std::vector &diff_samples) { + if (options.nb_repeats == 0) return; + if (group_pixel_count.size() <= group_id) { + group_pixel_count.resize(group_id + 1); + } + if (channel_pixel_count.size() < image.channel.size()) { + channel_pixel_count.resize(image.channel.size()); + } + Rng rng(group_id); + // Sample 10% of the final number of samples for property quantization. 
+ float fraction = options.nb_repeats * 0.1; + std::geometric_distribution dist(fraction); + size_t total_pixels = 0; + std::vector channel_ids; + for (size_t i = 0; i < image.channel.size(); i++) { + if (image.channel[i].w <= 1 || image.channel[i].h == 0) { + continue; // skip empty or width-1 channels. + } + if (i >= image.nb_meta_channels && + (image.channel[i].w > options.max_chan_size || + image.channel[i].h > options.max_chan_size)) { + break; + } + channel_ids.push_back(i); + group_pixel_count[group_id] += image.channel[i].w * image.channel[i].h; + channel_pixel_count[i] += image.channel[i].w * image.channel[i].h; + total_pixels += image.channel[i].w * image.channel[i].h; + } + if (channel_ids.empty()) return; + pixel_samples.reserve(pixel_samples.size() + fraction * total_pixels); + diff_samples.reserve(diff_samples.size() + fraction * total_pixels); + size_t i = 0; + size_t y = 0; + size_t x = 0; + auto advance = [&](size_t amount) { + x += amount; + // Detect row overflow (rare). + while (x >= image.channel[channel_ids[i]].w) { + x -= image.channel[channel_ids[i]].w; + y++; + // Detect end-of-channel (even rarer). + if (y == image.channel[channel_ids[i]].h) { + i++; + y = 0; + if (i >= channel_ids.size()) { + return; + } + } + } + }; + advance(dist(rng)); + for (; i < channel_ids.size(); advance(dist(rng) + 1)) { + const pixel_type *row = image.channel[channel_ids[i]].Row(y); + pixel_samples.push_back(row[x]); + size_t xp = x == 0 ? 1 : x - 1; + diff_samples.push_back(row[x] - row[xp]); + } +} + +// TODO(veluca): very simple encoding scheme. This should be improved. 
+void TokenizeTree(const Tree &tree, std::vector *tokens, + Tree *decoder_tree) { + JXL_ASSERT(tree.size() <= kMaxTreeSize); + std::queue q; + q.push(0); + size_t leaf_id = 0; + decoder_tree->clear(); + while (!q.empty()) { + int cur = q.front(); + q.pop(); + JXL_ASSERT(tree[cur].property >= -1); + tokens->emplace_back(kPropertyContext, tree[cur].property + 1); + if (tree[cur].property == -1) { + tokens->emplace_back(kPredictorContext, + static_cast(tree[cur].predictor)); + tokens->emplace_back(kOffsetContext, + PackSigned(tree[cur].predictor_offset)); + uint32_t mul_log = Num0BitsBelowLS1Bit_Nonzero(tree[cur].multiplier); + uint32_t mul_bits = (tree[cur].multiplier >> mul_log) - 1; + tokens->emplace_back(kMultiplierLogContext, mul_log); + tokens->emplace_back(kMultiplierBitsContext, mul_bits); + JXL_ASSERT(tree[cur].predictor < Predictor::Best); + decoder_tree->emplace_back(-1, 0, leaf_id++, 0, tree[cur].predictor, + tree[cur].predictor_offset, + tree[cur].multiplier); + continue; + } + decoder_tree->emplace_back(tree[cur].property, tree[cur].splitval, + decoder_tree->size() + q.size() + 1, + decoder_tree->size() + q.size() + 2, + Predictor::Zero, 0, 1); + q.push(tree[cur].lchild); + q.push(tree[cur].rchild); + tokens->emplace_back(kSplitValContext, PackSigned(tree[cur].splitval)); + } +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.h new file mode 100644 index 0000000000..d0a90cc952 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/enc_ma.h @@ -0,0 +1,157 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_MODULAR_ENCODING_ENC_MA_H_ +#define LIB_JXL_MODULAR_ENCODING_ENC_MA_H_ + +#include + +#include "lib/jxl/enc_ans.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/modular/encoding/dec_ma.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +// Struct to collect all the data needed to build a tree. +struct TreeSamples { + bool HasSamples() const { + return !residuals.empty() && !residuals[0].empty(); + } + size_t NumDistinctSamples() const { return sample_counts.size(); } + size_t NumSamples() const { return num_samples; } + // Set the predictor to use. Must be called before adding any samples. + Status SetPredictor(Predictor predictor, + ModularOptions::TreeMode wp_tree_mode); + // Set the properties to use. Must be called before adding any samples. + Status SetProperties(const std::vector &properties, + ModularOptions::TreeMode wp_tree_mode); + + size_t Token(size_t pred, size_t i) const { return residuals[pred][i].tok; } + size_t NBits(size_t pred, size_t i) const { return residuals[pred][i].nbits; } + size_t Count(size_t i) const { return sample_counts[i]; } + size_t PredictorIndex(Predictor predictor) const { + const auto predictor_elem = + std::find(predictors.begin(), predictors.end(), predictor); + JXL_DASSERT(predictor_elem != predictors.end()); + return predictor_elem - predictors.begin(); + } + size_t PropertyIndex(size_t property) const { + const auto property_elem = + std::find(props_to_use.begin(), props_to_use.end(), property); + JXL_DASSERT(property_elem != props_to_use.end()); + return property_elem - props_to_use.begin(); + } + size_t NumPropertyValues(size_t property_index) const { + return compact_properties[property_index].size() + 1; + } + // Returns the *quantized* property value. 
+ size_t Property(size_t property_index, size_t i) const { + return props[property_index][i]; + } + int UnquantizeProperty(size_t property_index, uint32_t quant) const { + JXL_ASSERT(quant < compact_properties[property_index].size()); + return compact_properties[property_index][quant]; + } + + Predictor PredictorFromIndex(size_t index) const { + JXL_DASSERT(index < predictors.size()); + return predictors[index]; + } + size_t PropertyFromIndex(size_t index) const { + JXL_DASSERT(index < props_to_use.size()); + return props_to_use[index]; + } + size_t NumPredictors() const { return predictors.size(); } + size_t NumProperties() const { return props_to_use.size(); } + + // Preallocate data for a given number of samples. MUST be called before + // adding any sample. + void PrepareForSamples(size_t num_samples); + // Add a sample. + void AddSample(pixel_type_w pixel, const Properties &properties, + const pixel_type_w *predictions); + // Pre-cluster property values. + void PreQuantizeProperties( + const StaticPropRange &range, + const std::vector &multiplier_info, + const std::vector &group_pixel_count, + const std::vector &channel_pixel_count, + std::vector &pixel_samples, + std::vector &diff_samples, size_t max_property_values); + + void AllSamplesDone() { dedup_table_ = std::vector(); } + + uint32_t QuantizeProperty(uint32_t prop, pixel_type v) const { + v = std::min(std::max(v, -kPropertyRange), kPropertyRange) + kPropertyRange; + return property_mapping[prop][v]; + } + + // Swaps samples in position a and b. Does nothing if a == b. + void Swap(size_t a, size_t b); + + // Cycles samples: a -> b -> c -> a. We assume a <= b <= c, so that we can + // just call Swap(a, b) if b==c. + void ThreeShuffle(size_t a, size_t b, size_t c); + + private: + // TODO(veluca): as the total number of properties and predictors are known + // before adding any samples, it might be better to interleave predictors, + // properties and counts in a single vector to improve locality. 
+ // A first attempt at doing this actually results in much slower encoding, + // possibly because of the more complex addressing. + struct ResidualToken { + uint8_t tok; + uint8_t nbits; + }; + // Residual information: token and number of extra bits, per predictor. + std::vector> residuals; + // Number of occurrences of each sample. + std::vector sample_counts; + // Property values, quantized to at most 256 distinct values. + std::vector> props; + // Decompactification info for `props`. + std::vector> compact_properties; + // List of properties to use. + std::vector props_to_use; + // List of predictors to use. + std::vector predictors; + // Mapping property value -> quantized property value. + static constexpr int kPropertyRange = 511; + std::vector> property_mapping; + // Number of samples seen. + size_t num_samples = 0; + // Table for deduplication. + static constexpr uint32_t kDedupEntryUnused{static_cast(-1)}; + std::vector dedup_table_; + + // Functions for sample deduplication. + bool IsSameSample(size_t a, size_t b) const; + size_t Hash1(size_t a) const; + size_t Hash2(size_t a) const; + void InitTable(size_t size); + // Returns true if `a` was already present in the table. 
+ bool AddToTableAndMerge(size_t a); + void AddToTable(size_t a); +}; + +void TokenizeTree(const Tree &tree, std::vector *tokens, + Tree *decoder_tree); + +void CollectPixelSamples(const Image &image, const ModularOptions &options, + size_t group_id, + std::vector &group_pixel_count, + std::vector &channel_pixel_count, + std::vector &pixel_samples, + std::vector &diff_samples); + +void ComputeBestTree(TreeSamples &tree_samples, float threshold, + const std::vector &mul_info, + StaticPropRange static_prop_range, + float fast_decode_multiplier, Tree *tree); + +} // namespace jxl +#endif // LIB_JXL_MODULAR_ENCODING_ENC_MA_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc new file mode 100644 index 0000000000..0b757113f8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.cc @@ -0,0 +1,530 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/encoding/encoding.h" + +#include +#include + +#include + +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +// Removes all nodes that use a static property (i.e. channel or group ID) from +// the tree and collapses each node on even levels with its two children to +// produce a flatter tree. Also computes whether the resulting tree requires +// using the weighted predictor. 
+FlatTree FilterTree(const Tree &global_tree, + std::array &static_props, + size_t *num_props, bool *use_wp, bool *wp_only, + bool *gradient_only) { + *num_props = 0; + bool has_wp = false; + bool has_non_wp = false; + *gradient_only = true; + const auto mark_property = [&](int32_t p) { + if (p == kWPProp) { + has_wp = true; + } else if (p >= kNumStaticProperties) { + has_non_wp = true; + } + if (p >= kNumStaticProperties && p != kGradientProp) { + *gradient_only = false; + } + }; + FlatTree output; + std::queue nodes; + nodes.push(0); + // Produces a trimmed and flattened tree by doing a BFS visit of the original + // tree, ignoring branches that are known to be false and proceeding two + // levels at a time to collapse nodes in a flatter tree; if an inner parent + // node has a leaf as a child, the leaf is duplicated and an implicit fake + // node is added. This allows to reduce the number of branches when traversing + // the resulting flat tree. + while (!nodes.empty()) { + size_t cur = nodes.front(); + nodes.pop(); + // Skip nodes that we can decide now, by jumping directly to their children. 
+ while (global_tree[cur].property < kNumStaticProperties && + global_tree[cur].property != -1) { + if (static_props[global_tree[cur].property] > global_tree[cur].splitval) { + cur = global_tree[cur].lchild; + } else { + cur = global_tree[cur].rchild; + } + } + FlatDecisionNode flat; + if (global_tree[cur].property == -1) { + flat.property0 = -1; + flat.childID = global_tree[cur].lchild; + flat.predictor = global_tree[cur].predictor; + flat.predictor_offset = global_tree[cur].predictor_offset; + flat.multiplier = global_tree[cur].multiplier; + *gradient_only &= flat.predictor == Predictor::Gradient; + has_wp |= flat.predictor == Predictor::Weighted; + has_non_wp |= flat.predictor != Predictor::Weighted; + output.push_back(flat); + continue; + } + flat.childID = output.size() + nodes.size() + 1; + + flat.property0 = global_tree[cur].property; + *num_props = std::max(flat.property0 + 1, *num_props); + flat.splitval0 = global_tree[cur].splitval; + + for (size_t i = 0; i < 2; i++) { + size_t cur_child = + i == 0 ? global_tree[cur].lchild : global_tree[cur].rchild; + // Skip nodes that we can decide now. + while (global_tree[cur_child].property < kNumStaticProperties && + global_tree[cur_child].property != -1) { + if (static_props[global_tree[cur_child].property] > + global_tree[cur_child].splitval) { + cur_child = global_tree[cur_child].lchild; + } else { + cur_child = global_tree[cur_child].rchild; + } + } + // We ended up in a leaf, add a dummy decision and two copies of the leaf. 
+ if (global_tree[cur_child].property == -1) { + flat.properties[i] = 0; + flat.splitvals[i] = 0; + nodes.push(cur_child); + nodes.push(cur_child); + } else { + flat.properties[i] = global_tree[cur_child].property; + flat.splitvals[i] = global_tree[cur_child].splitval; + nodes.push(global_tree[cur_child].lchild); + nodes.push(global_tree[cur_child].rchild); + *num_props = std::max(flat.properties[i] + 1, *num_props); + } + } + + for (size_t j = 0; j < 2; j++) mark_property(flat.properties[j]); + mark_property(flat.property0); + output.push_back(flat); + } + if (*num_props > kNumNonrefProperties) { + *num_props = + DivCeil(*num_props - kNumNonrefProperties, kExtraPropsPerChannel) * + kExtraPropsPerChannel + + kNumNonrefProperties; + } else { + *num_props = kNumNonrefProperties; + } + *use_wp = has_wp; + *wp_only = has_wp && !has_non_wp; + + return output; +} + +Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader, + const std::vector &context_map, + const Tree &global_tree, + const weighted::Header &wp_header, + pixel_type chan, size_t group_id, + Image *image) { + Channel &channel = image->channel[chan]; + + std::array static_props = {chan, + (int)group_id}; + // TODO(veluca): filter the tree according to static_props. + + // zero pixel channel? could happen + if (channel.w == 0 || channel.h == 0) return true; + + bool tree_has_wp_prop_or_pred = false; + bool is_wp_only = false; + bool is_gradient_only = false; + size_t num_props; + FlatTree tree = + FilterTree(global_tree, static_props, &num_props, + &tree_has_wp_prop_or_pred, &is_wp_only, &is_gradient_only); + + // From here on, tree lookup returns a *clustered* context ID. + // This avoids an extra memory lookup after tree traversal. 
+ for (size_t i = 0; i < tree.size(); i++) { + if (tree[i].property0 == -1) { + tree[i].childID = context_map[tree[i].childID]; + } + } + + JXL_DEBUG_V(3, "Decoded MA tree with %zu nodes", tree.size()); + + // MAANS decode + const auto make_pixel = [](uint64_t v, pixel_type multiplier, + pixel_type_w offset) -> pixel_type { + JXL_DASSERT((v & 0xFFFFFFFF) == v); + pixel_type_w val = UnpackSigned(v); + // if it overflows, it overflows, and we have a problem anyway + return val * multiplier + offset; + }; + + if (tree.size() == 1) { + // special optimized case: no meta-adaptation, so no need + // to compute properties. + Predictor predictor = tree[0].predictor; + int64_t offset = tree[0].predictor_offset; + int32_t multiplier = tree[0].multiplier; + size_t ctx_id = tree[0].childID; + if (predictor == Predictor::Zero) { + uint32_t value; + if (reader->IsSingleValueAndAdvance(ctx_id, &value, + channel.w * channel.h)) { + // Special-case: histogram has a single symbol, with no extra bits, and + // we use ANS mode. + JXL_DEBUG_V(8, "Fastest track."); + pixel_type v = make_pixel(value, multiplier, offset); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + std::fill(r, r + channel.w, v); + } + + } else { + JXL_DEBUG_V(8, "Fast track."); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + uint32_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, multiplier, offset); + } + } + } + } else if (predictor == Predictor::Gradient && offset == 0 && + multiplier == 1) { + JXL_DEBUG_V(8, "Gradient very fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type top = (y ? 
*(r + x - onerow) : left); + pixel_type topleft = (x && y ? *(r + x - 1 - onerow) : left); + pixel_type guess = ClampedGradient(top, left, topleft); + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, 1, guess); + } + } + } else if (predictor != Predictor::Weighted) { + // special optimized case: no wp + JXL_DEBUG_V(8, "Quite fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult pred = + PredictNoTreeNoWP(channel.w, r + x, onerow, x, y, predictor); + pixel_type_w g = pred.guess + offset; + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + // NOTE: pred.multiplier is unset. + r[x] = make_pixel(v, multiplier, g); + } + } + } else { + JXL_DEBUG_V(8, "Somewhat fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + weighted::State wp_state(wp_header, channel.w, channel.h); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w g = PredictNoTreeWP(channel.w, r + x, onerow, x, y, + predictor, &wp_state) + .guess + + offset; + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, multiplier, g); + wp_state.UpdateErrors(r[x], x, y, channel.w); + } + } + } + return true; + } + + // Check if this tree is a WP-only tree with a small enough property value + // range. + // Initialized to avoid clang-tidy complaining. 
+ uint8_t context_lookup[2 * kPropRangeFast] = {}; + int8_t multipliers[2 * kPropRangeFast] = {}; + int8_t offsets[2 * kPropRangeFast] = {}; + if (is_wp_only) { + is_wp_only = TreeToLookupTable(tree, context_lookup, offsets, multipliers); + } + if (is_gradient_only) { + is_gradient_only = + TreeToLookupTable(tree, context_lookup, offsets, multipliers); + } + + if (is_gradient_only) { + JXL_DEBUG_V(8, "Gradient fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + int32_t guess = ClampedGradient(top, left, topleft); + uint32_t pos = + kPropRangeFast + + std::min( + std::max(-kPropRangeFast, top + left - topleft), + kPropRangeFast - 1); + uint32_t ctx_id = context_lookup[pos]; + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, multipliers[pos], + static_cast(offsets[pos]) + guess); + } + } + } else if (is_wp_only) { + JXL_DEBUG_V(8, "WP fast track."); + const intptr_t onerow = channel.plane.PixelsPerRow(); + weighted::State wp_state(wp_header, channel.w, channel.h); + Properties properties(1); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT r = channel.Row(y); + for (size_t x = 0; x < channel.w; x++) { + size_t offset = 0; + pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); + pixel_type_w top = (y ? *(r + x - onerow) : left); + pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); + pixel_type_w topright = + (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top); + pixel_type_w toptop = (y > 1 ? 
*(r + x - onerow - onerow) : top); + int32_t guess = wp_state.Predict( + x, y, channel.w, top, left, topright, topleft, toptop, &properties, + offset); + uint32_t pos = + kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]), + kPropRangeFast - 1); + uint32_t ctx_id = context_lookup[pos]; + uint64_t v = reader->ReadHybridUintClustered(ctx_id, br); + r[x] = make_pixel(v, multipliers[pos], + static_cast(offsets[pos]) + guess); + wp_state.UpdateErrors(r[x], x, y, channel.w); + } + } + } else if (!tree_has_wp_prop_or_pred) { + // special optimized case: the weighted predictor and its properties are not + // used, so no need to compute weights and properties. + JXL_DEBUG_V(8, "Slow track."); + MATreeLookup tree_lookup(tree); + Properties properties = Properties(num_props); + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + PrecomputeReferences(channel, y, *image, chan, &references); + InitPropsRow(&properties, static_props, y); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult res = + PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references); + uint64_t v = reader->ReadHybridUintClustered(res.context, br); + p[x] = make_pixel(v, res.multiplier, res.guess); + } + } + } else { + JXL_DEBUG_V(8, "Slowest track."); + MATreeLookup tree_lookup(tree); + Properties properties = Properties(num_props); + const intptr_t onerow = channel.plane.PixelsPerRow(); + Channel references(properties.size() - kNumNonrefProperties, channel.w); + weighted::State wp_state(wp_header, channel.w, channel.h); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + InitPropsRow(&properties, static_props, y); + PrecomputeReferences(channel, y, *image, chan, &references); + for (size_t x = 0; x < channel.w; x++) { + PredictionResult res = + 
PredictTreeWP(&properties, channel.w, p + x, onerow, x, y, + tree_lookup, references, &wp_state); + uint64_t v = reader->ReadHybridUintClustered(res.context, br); + p[x] = make_pixel(v, res.multiplier, res.guess); + wp_state.UpdateErrors(p[x], x, y, channel.w); + } + } + } + return true; +} + +GroupHeader::GroupHeader() { Bundle::Init(this); } + +Status ValidateChannelDimensions(const Image &image, + const ModularOptions &options) { + size_t nb_channels = image.channel.size(); + for (bool is_dc : {true, false}) { + size_t group_dim = options.group_dim * (is_dc ? kBlockDim : 1); + size_t c = image.nb_meta_channels; + for (; c < nb_channels; c++) { + const Channel &ch = image.channel[c]; + if (ch.w > options.group_dim || ch.h > options.group_dim) break; + } + for (; c < nb_channels; c++) { + const Channel &ch = image.channel[c]; + if (ch.w == 0 || ch.h == 0) continue; // skip empty + bool is_dc_channel = std::min(ch.hshift, ch.vshift) >= 3; + if (is_dc_channel != is_dc) continue; + size_t tile_dim = group_dim >> std::max(ch.hshift, ch.vshift); + if (tile_dim == 0) { + return JXL_FAILURE("Inconsistent transforms"); + } + } + } + return true; +} + +Status ModularDecode(BitReader *br, Image &image, GroupHeader &header, + size_t group_id, ModularOptions *options, + const Tree *global_tree, const ANSCode *global_code, + const std::vector *global_ctx_map, + bool allow_truncated_group) { + if (image.channel.empty()) return true; + + // decode transforms + JXL_RETURN_IF_ERROR(Bundle::Read(br, &header)); + JXL_DEBUG_V(3, "Image data underwent %zu transformations: ", + header.transforms.size()); + image.transform = header.transforms; + for (Transform &transform : image.transform) { + JXL_RETURN_IF_ERROR(transform.MetaApply(image)); + } + if (image.error) { + return JXL_FAILURE("Corrupt file. Aborting."); + } + if (br->AllReadsWithinBounds()) { + // Only check if the transforms list is complete. 
+ JXL_RETURN_IF_ERROR(ValidateChannelDimensions(image, *options)); + } + + size_t nb_channels = image.channel.size(); + + size_t num_chans = 0; + size_t distance_multiplier = 0; + for (size_t i = 0; i < nb_channels; i++) { + Channel &channel = image.channel[i]; + if (!channel.w || !channel.h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size || + channel.h > options->max_chan_size)) { + break; + } + if (channel.w > distance_multiplier) { + distance_multiplier = channel.w; + } + num_chans++; + } + if (num_chans == 0) return true; + + // Read tree. + Tree tree_storage; + std::vector context_map_storage; + ANSCode code_storage; + const Tree *tree = &tree_storage; + const ANSCode *code = &code_storage; + const std::vector *context_map = &context_map_storage; + if (!header.use_global_tree) { + size_t max_tree_size = 1024; + for (size_t i = 0; i < nb_channels; i++) { + Channel &channel = image.channel[i]; + if (!channel.w || !channel.h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size || + channel.h > options->max_chan_size)) { + break; + } + size_t pixels = channel.w * channel.h; + if (pixels / channel.w != channel.h) { + return JXL_FAILURE("Tree size overflow"); + } + max_tree_size += pixels; + if (max_tree_size < pixels) return JXL_FAILURE("Tree size overflow"); + } + + JXL_RETURN_IF_ERROR(DecodeTree(br, &tree_storage, max_tree_size)); + JXL_RETURN_IF_ERROR(DecodeHistograms(br, (tree_storage.size() + 1) / 2, + &code_storage, &context_map_storage)); + } else { + if (!global_tree || !global_code || !global_ctx_map || + global_tree->empty()) { + return JXL_FAILURE("No global tree available but one was requested"); + } + tree = global_tree; + code = global_code; + context_map = global_ctx_map; + } + + // Read channels + ANSSymbolReader reader(code, br, distance_multiplier); + for (size_t i = 0; i < nb_channels; i++) { + Channel &channel = 
image.channel[i]; + if (!channel.w || !channel.h) { + continue; // skip empty channels + } + if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size || + channel.h > options->max_chan_size)) { + break; + } + JXL_RETURN_IF_ERROR(DecodeModularChannelMAANS(br, &reader, *context_map, + *tree, header.wp_header, i, + group_id, &image)); + // Truncated group. + if (!br->AllReadsWithinBounds()) { + if (!allow_truncated_group) return JXL_FAILURE("Truncated input"); + ZeroFillImage(&channel.plane); + while (++i < nb_channels) ZeroFillImage(&image.channel[i].plane); + return Status(StatusCode::kNotEnoughBytes); + } + } + if (!reader.CheckANSFinalState()) { + return JXL_FAILURE("ANS decode final state failed"); + } + return true; +} + +Status ModularGenericDecompress(BitReader *br, Image &image, + GroupHeader *header, size_t group_id, + ModularOptions *options, int undo_transforms, + const Tree *tree, const ANSCode *code, + const std::vector *ctx_map, + bool allow_truncated_group) { +#ifdef JXL_ENABLE_ASSERT + std::vector> req_sizes(image.channel.size()); + for (size_t c = 0; c < req_sizes.size(); c++) { + req_sizes[c] = {image.channel[c].w, image.channel[c].h}; + } +#endif + GroupHeader local_header; + if (header == nullptr) header = &local_header; + auto dec_status = ModularDecode(br, image, *header, group_id, options, tree, + code, ctx_map, allow_truncated_group); + if (!allow_truncated_group) JXL_RETURN_IF_ERROR(dec_status); + if (dec_status.IsFatalError()) return dec_status; + image.undo_transforms(header->wp_header, undo_transforms); + if (image.error) return JXL_FAILURE("Corrupt file. 
Aborting."); + size_t bit_pos = br->TotalBitsConsumed(); + JXL_DEBUG_V(4, "Modular-decoded a %zux%zu nbchans=%zu image from %zu bytes", + image.w, image.h, image.channel.size(), + (br->TotalBitsConsumed() - bit_pos) / 8); + (void)bit_pos; +#ifdef JXL_ENABLE_ASSERT + // Check that after applying all transforms we are back to the requested image + // sizes, otherwise there's a programming error with the transformations. + if (undo_transforms == -1 || undo_transforms == 0) { + JXL_ASSERT(image.channel.size() == req_sizes.size()); + for (size_t c = 0; c < req_sizes.size(); c++) { + JXL_ASSERT(req_sizes[c].first == image.channel[c].w); + JXL_ASSERT(req_sizes[c].second == image.channel[c].h); + } + } +#endif + return dec_status; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.h new file mode 100644 index 0000000000..8a208765f6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/encoding.h @@ -0,0 +1,140 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_ENCODING_H_ +#define LIB_JXL_MODULAR_ENCODING_ENCODING_H_ + +#include +#include + +#include + +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/image.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/encoding/dec_ma.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/options.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +// Valid range of properties for using lookup tables instead of trees. 
+constexpr int32_t kPropRangeFast = 512; + +struct GroupHeader : public Fields { + GroupHeader(); + + const char *Name() const override { return "GroupHeader"; } + + Status VisitFields(Visitor *JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &use_global_tree)); + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&wp_header)); + uint32_t num_transforms = static_cast(transforms.size()); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(4, 2), + BitsOffset(8, 18), 0, + &num_transforms)); + if (visitor->IsReading()) transforms.resize(num_transforms); + for (size_t i = 0; i < num_transforms; i++) { + JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&transforms[i])); + } + return true; + } + + bool use_global_tree; + weighted::Header wp_header; + + std::vector transforms; +}; + +FlatTree FilterTree(const Tree &global_tree, + std::array &static_props, + size_t *num_props, bool *use_wp, bool *wp_only, + bool *gradient_only); + +template +bool TreeToLookupTable(const FlatTree &tree, + T context_lookup[2 * kPropRangeFast], + int8_t offsets[2 * kPropRangeFast], + int8_t multipliers[2 * kPropRangeFast] = nullptr) { + struct TreeRange { + // Begin *excluded*, end *included*. This works best with > vs <= decision + // nodes. + int begin, end; + size_t pos; + }; + std::vector ranges; + ranges.push_back(TreeRange{-kPropRangeFast - 1, kPropRangeFast - 1, 0}); + while (!ranges.empty()) { + TreeRange cur = ranges.back(); + ranges.pop_back(); + if (cur.begin < -kPropRangeFast - 1 || cur.begin >= kPropRangeFast - 1 || + cur.end > kPropRangeFast - 1) { + // Tree is outside the allowed range, exit. + return false; + } + auto &node = tree[cur.pos]; + // Leaf. 
+ if (node.property0 == -1) { + if (node.predictor_offset < std::numeric_limits::min() || + node.predictor_offset > std::numeric_limits::max()) { + return false; + } + if (node.multiplier < std::numeric_limits::min() || + node.multiplier > std::numeric_limits::max()) { + return false; + } + if (multipliers == nullptr && node.multiplier != 1) { + return false; + } + for (int i = cur.begin + 1; i < cur.end + 1; i++) { + context_lookup[i + kPropRangeFast] = node.childID; + if (multipliers) multipliers[i + kPropRangeFast] = node.multiplier; + offsets[i + kPropRangeFast] = node.predictor_offset; + } + continue; + } + // > side of top node. + if (node.properties[0] >= kNumStaticProperties) { + ranges.push_back(TreeRange({node.splitvals[0], cur.end, node.childID})); + ranges.push_back( + TreeRange({node.splitval0, node.splitvals[0], node.childID + 1})); + } else { + ranges.push_back(TreeRange({node.splitval0, cur.end, node.childID})); + } + // <= side + if (node.properties[1] >= kNumStaticProperties) { + ranges.push_back( + TreeRange({node.splitvals[1], node.splitval0, node.childID + 2})); + ranges.push_back( + TreeRange({cur.begin, node.splitvals[1], node.childID + 3})); + } else { + ranges.push_back( + TreeRange({cur.begin, node.splitval0, node.childID + 2})); + } + } + return true; +} +// TODO(veluca): make cleaner interfaces. + +Status ValidateChannelDimensions(const Image &image, + const ModularOptions &options); + +// undo_transforms == N > 0: undo all transforms except the first N +// (e.g. 
to represent YCbCr420 losslessly) +// undo_transforms == 0: undo all transforms +// undo_transforms == -1: undo all transforms but don't clamp to range +// undo_transforms == -2: don't undo any transform +Status ModularGenericDecompress(BitReader *br, Image &image, + GroupHeader *header, size_t group_id, + ModularOptions *options, + int undo_transforms = -1, + const Tree *tree = nullptr, + const ANSCode *code = nullptr, + const std::vector *ctx_map = nullptr, + bool allow_truncated_group = false); +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_ENCODING_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/ma_common.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/ma_common.h new file mode 100644 index 0000000000..e5b6cf3335 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/encoding/ma_common.h @@ -0,0 +1,28 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_ +#define LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_ + +#include + +namespace jxl { + +enum MATreeContext : size_t { + kSplitValContext = 0, + kPropertyContext = 1, + kPredictorContext = 2, + kOffsetContext = 3, + kMultiplierLogContext = 4, + kMultiplierBitsContext = 5, + + kNumTreeContexts = 6, +}; + +static constexpr size_t kMaxTreeSize = 1 << 26; + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc new file mode 100644 index 0000000000..6c26b96c0d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.cc @@ -0,0 +1,62 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/modular_image.h" + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +void Image::undo_transforms(const weighted::Header &wp_header, int keep, + jxl::ThreadPool *pool) { + if (keep == -2) return; + while ((int)transform.size() > keep && transform.size() > 0) { + Transform t = transform.back(); + JXL_DEBUG_V(4, "Undoing transform %s", t.Name()); + Status result = t.Inverse(*this, wp_header, pool); + if (result == false) { + JXL_NOTIFY_ERROR("Error while undoing transform %s.", t.Name()); + error = true; + return; + } + JXL_DEBUG_V(8, "Undoing transform %s: done", t.Name()); + transform.pop_back(); + } + if (!keep && bitdepth < 32) { + // clamp the values to the valid range (lossy compression can produce values + // outside the range) + pixel_type maxval = (1u << bitdepth) - 1; + for (size_t i = 0; i < channel.size(); i++) { + for (size_t y = 0; y < channel[i].h; y++) { + pixel_type *JXL_RESTRICT p = channel[i].plane.Row(y); + for (size_t x = 0; x < channel[i].w; x++, p++) { + *p = Clamp1(*p, 0, maxval); + } + } + } + } +} + +Image::Image(size_t iw, size_t ih, int bd, int nb_chans) + : w(iw), h(ih), bitdepth(bd), nb_meta_channels(0), error(false) { + for (int i = 0; i < nb_chans; i++) channel.emplace_back(Channel(iw, ih)); +} + +Image::Image() : w(0), h(0), bitdepth(8), nb_meta_channels(0), error(true) {} + +Image &Image::operator=(Image &&other) noexcept { + w = other.w; + h = other.h; + bitdepth = other.bitdepth; + nb_meta_channels = other.nb_meta_channels; + error = other.error; + channel = std::move(other.channel); + transform = std::move(other.transform); + return *this; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.h 
new file mode 100644 index 0000000000..c418ba4fe2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/modular_image.h @@ -0,0 +1,107 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_MODULAR_IMAGE_H_ +#define LIB_JXL_MODULAR_MODULAR_IMAGE_H_ + +#include +#include +#include +#include + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" + +namespace jxl { + +typedef int32_t pixel_type; // can use int16_t if it's only for 8-bit images. + // Need some wiggle room for YCoCg / Squeeze etc + +typedef int64_t pixel_type_w; + +namespace weighted { +struct Header; +} + +class Channel { + public: + jxl::Plane plane; + size_t w, h; + int hshift, vshift; // w ~= image.w >> hshift; h ~= image.h >> vshift + Channel(size_t iw, size_t ih, int hsh = 0, int vsh = 0) + : plane(iw, ih), w(iw), h(ih), hshift(hsh), vshift(vsh) {} + + Channel(const Channel& other) = delete; + Channel& operator=(const Channel& other) = delete; + + // Move assignment + Channel& operator=(Channel&& other) noexcept { + w = other.w; + h = other.h; + hshift = other.hshift; + vshift = other.vshift; + plane = std::move(other.plane); + return *this; + } + + // Move constructor + Channel(Channel&& other) noexcept = default; + + void shrink() { + if (plane.xsize() == w && plane.ysize() == h) return; + jxl::Plane resizedplane(w, h); + plane = std::move(resizedplane); + } + void shrink(int nw, int nh) { + w = nw; + h = nh; + shrink(); + } + + JXL_INLINE pixel_type* Row(const size_t y) { return plane.Row(y); } + JXL_INLINE const pixel_type* Row(const size_t y) const { + return plane.Row(y); + } +}; + +class Transform; + +class Image { + public: + // image data, transforms can dramatically change 
the number of channels and + // their semantics + std::vector channel; + // transforms that have been applied (and that have to be undone) + std::vector transform; + + // image dimensions (channels may have different dimensions due to transforms) + size_t w, h; + int bitdepth; + size_t nb_meta_channels; // first few channels might contain palette(s) + bool error; // true if a fatal error occurred, false otherwise + + Image(size_t iw, size_t ih, int bitdepth, int nb_chans); + Image(); + + Image(const Image& other) = delete; + Image& operator=(const Image& other) = delete; + + Image& operator=(Image&& other) noexcept; + Image(Image&& other) noexcept = default; + + // undo all except the first 'keep' transforms + void undo_transforms(const weighted::Header& wp_header, int keep = 0, + jxl::ThreadPool* pool = nullptr); +}; + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_MODULAR_IMAGE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/options.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/options.h new file mode 100644 index 0000000000..b25b17c6c8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/options.h @@ -0,0 +1,172 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_OPTIONS_H_ +#define LIB_JXL_MODULAR_OPTIONS_H_ + +#include + +#include +#include + +namespace jxl { + +using PropertyVal = int32_t; +using Properties = std::vector; + +enum class Predictor : uint32_t { + Zero = 0, + Left = 1, + Top = 2, + Average0 = 3, + Select = 4, + Gradient = 5, + Weighted = 6, + TopRight = 7, + TopLeft = 8, + LeftLeft = 9, + Average1 = 10, + Average2 = 11, + Average3 = 12, + Average4 = 13, + // The following predictors are encoder-only. 
+ Best = 14, // Best of Gradient and Weighted + Variable = + 15, // Find the best decision tree for predictors/predictor per row +}; + +inline const char* PredictorName(Predictor p) { + switch (p) { + case Predictor::Zero: + return "Zero"; + case Predictor::Left: + return "Left"; + case Predictor::Top: + return "Top"; + case Predictor::Average0: + return "Avg0"; + case Predictor::Average1: + return "Avg1"; + case Predictor::Average2: + return "Avg2"; + case Predictor::Average3: + return "Avg3"; + case Predictor::Average4: + return "Avg4"; + case Predictor::Select: + return "Sel"; + case Predictor::Gradient: + return "Grd"; + case Predictor::Weighted: + return "Wgh"; + case Predictor::TopLeft: + return "TopL"; + case Predictor::TopRight: + return "TopR"; + case Predictor::LeftLeft: + return "LL"; + default: + return "INVALID"; + }; +} + +inline std::array PredictorColor(Predictor p) { + switch (p) { + case Predictor::Zero: + return {0, 0, 0}; + case Predictor::Left: + return {255, 0, 0}; + case Predictor::Top: + return {0, 255, 0}; + case Predictor::Average0: + return {0, 0, 255}; + case Predictor::Average4: + return {192, 128, 128}; + case Predictor::Select: + return {255, 255, 0}; + case Predictor::Gradient: + return {255, 0, 255}; + case Predictor::Weighted: + return {0, 255, 255}; + // TODO + default: + return {255, 255, 255}; + }; +} + +constexpr size_t kNumModularPredictors = static_cast(Predictor::Best); + +static constexpr ssize_t kNumStaticProperties = 2; // channel, group_id. + +using StaticPropRange = + std::array, kNumStaticProperties>; + +struct ModularMultiplierInfo { + StaticPropRange range; + uint32_t multiplier; +}; + +struct ModularOptions { + /// Used in both encode and decode: + + // Stop encoding/decoding when reaching a (non-meta) channel that has a + // dimension bigger than max_chan_size. + size_t max_chan_size = 0xFFFFFF; + + // Used during decoding for validation of transforms (sqeeezing) scheme. 
+ size_t group_dim = 0x1FFFFFFF; + + /// Encode options: + // Fraction of pixels to look at to learn a MA tree + // Number of iterations to do to learn a MA tree + // (if zero there is no MA context model) + float nb_repeats = .5f; + + // Maximum number of (previous channel) properties to use in the MA trees + int max_properties = 0; // no previous channels + + // Alternative heuristic tweaks. + // Properties default to channel, group, weighted, gradient residual, W-NW, + // NW-N, N-NE, N-NN + std::vector splitting_heuristics_properties = {0, 1, 15, 9, + 10, 11, 12, 13}; + float splitting_heuristics_node_threshold = 96; + size_t max_property_values = 32; + + // Predictor to use for each channel. + Predictor predictor = static_cast(-1); + + int wp_mode = 0; + + float fast_decode_multiplier = 1.01f; + + // Forces the encoder to produce a tree that is compatible with the WP-only + // decode path (or with the no-wp path, or the gradient-only path). + enum class TreeMode { kGradientOnly, kWPOnly, kNoWP, kDefault }; + TreeMode wp_tree_mode = TreeMode::kDefault; + + // Skip fast paths in the encoder. + bool skip_encoder_fast_path = false; + + // Kind of tree to use. + // TODO(veluca): add tree kinds for JPEG recompression with CfL enabled, + // general AC metadata, different DC qualities, and others. 
+ enum class TreeKind { + kLearn, + kJpegTranscodeACMeta, + kFalconACMeta, + kACMeta, + kWPFixedDC, + kGradientFixedDC, + }; + TreeKind tree_kind = TreeKind::kLearn; + + // Ignore the image and just pretend all tokens are zeroes + bool zero_tokens = false; +}; + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_OPTIONS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc new file mode 100644 index 0000000000..cb012fff8a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.cc @@ -0,0 +1,447 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/enc_palette.h" + +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/enc_transform.h" +#include "lib/jxl/modular/transform/palette.h" + +namespace jxl { + +namespace palette_internal { + +static constexpr bool kEncodeToHighQualityImplicitPalette = true; + +// Inclusive. +static constexpr int kMinImplicitPaletteIndex = -(2 * 72 - 1); + +float ColorDistance(const std::vector &JXL_RESTRICT a, + const std::vector &JXL_RESTRICT b) { + JXL_ASSERT(a.size() == b.size()); + float distance = 0; + float ave3 = 0; + if (a.size() >= 3) { + ave3 = (a[0] + b[0] + a[1] + b[1] + a[2] + b[2]) * (1.21f / 3.0f); + } + float sum_a = 0, sum_b = 0; + for (size_t c = 0; c < a.size(); ++c) { + const float difference = + static_cast(a[c]) - static_cast(b[c]); + float weight = c == 0 ? 3 : c == 1 ? 
5 : 2; + if (c < 3 && (a[c] + b[c] >= ave3)) { + const float add_w[3] = { + 1.15, + 1.15, + 1.12, + }; + weight += add_w[c]; + if (c == 2 && ((a[2] + b[2]) < 1.22 * ave3)) { + weight -= 0.5; + } + } + distance += difference * difference * weight * weight; + const int sum_weight = c == 0 ? 3 : c == 1 ? 5 : 1; + sum_a += a[c] * sum_weight; + sum_b += b[c] * sum_weight; + } + distance *= 4; + float sum_difference = sum_a - sum_b; + distance += sum_difference * sum_difference; + return distance; +} + +static int QuantizeColorToImplicitPaletteIndex( + const std::vector &color, const int palette_size, + const int bit_depth, bool high_quality) { + int index = 0; + if (high_quality) { + int multiplier = 1; + for (size_t c = 0; c < color.size(); c++) { + int quantized = ((kLargeCube - 1) * color[c] + (1 << (bit_depth - 1))) / + ((1 << bit_depth) - 1); + JXL_ASSERT((quantized % kLargeCube) == quantized); + index += quantized * multiplier; + multiplier *= kLargeCube; + } + return index + palette_size + kLargeCubeOffset; + } else { + int multiplier = 1; + for (size_t c = 0; c < color.size(); c++) { + int value = color[c]; + value -= 1 << (std::max(0, bit_depth - 3)); + value = std::max(0, value); + int quantized = ((kLargeCube - 1) * value + (1 << (bit_depth - 1))) / + ((1 << bit_depth) - 1); + JXL_ASSERT((quantized % kLargeCube) == quantized); + if (quantized > kSmallCube - 1) { + quantized = kSmallCube - 1; + } + index += quantized * multiplier; + multiplier *= kSmallCube; + } + return index + palette_size; + } +} + +} // namespace palette_internal + +Status FwdPalette(Image &input, uint32_t begin_c, uint32_t end_c, + uint32_t &nb_colors, bool ordered, bool lossy, + Predictor &predictor, const weighted::Header &wp_header) { + JXL_QUIET_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c)); + JXL_ASSERT(begin_c >= input.nb_meta_channels); + uint32_t nb = end_c - begin_c + 1; + + size_t w = input.channel[begin_c].w; + size_t h = input.channel[begin_c].h; + + if (!lossy && 
nb == 1) { + // Channel palette special case + if (nb_colors == 0) return false; + std::vector lookup; + pixel_type minval, maxval; + compute_minmax(input.channel[begin_c], &minval, &maxval); + size_t lookup_table_size = + static_cast(maxval) - static_cast(minval) + 1; + if (lookup_table_size > palette_internal::kMaxPaletteLookupTableSize) { + return false; // too large lookup table + } + lookup.resize(lookup_table_size, 0); + pixel_type idx = 0; + for (size_t y = 0; y < h; y++) { + const pixel_type *p = input.channel[begin_c].Row(y); + for (size_t x = 0; x < w; x++) { + if (lookup[p[x] - minval] == 0) { + lookup[p[x] - minval] = 1; + idx++; + if (idx > (int)nb_colors) return false; + } + } + } + JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx); + Channel pch(idx, 1); + pch.hshift = -1; + nb_colors = idx; + idx = 0; + pixel_type *JXL_RESTRICT p_palette = pch.Row(0); + for (size_t i = 0; i < lookup_table_size; i++) { + if (lookup[i]) { + p_palette[idx] = i + minval; + lookup[i] = idx; + idx++; + } + } + for (size_t y = 0; y < h; y++) { + pixel_type *p = input.channel[begin_c].Row(y); + for (size_t x = 0; x < w; x++) p[x] = lookup[p[x] - minval]; + } + predictor = Predictor::Zero; + input.nb_meta_channels++; + input.channel.insert(input.channel.begin(), std::move(pch)); + return true; + } + + Image quantized_input; + if (lossy) { + quantized_input = Image(w, h, input.bitdepth, nb); + for (size_t c = 0; c < nb; c++) { + CopyImageTo(input.channel[begin_c + c].plane, + &quantized_input.channel[c].plane); + } + } + + JXL_DEBUG_V( + 7, "Trying to represent channels %i-%i using at most a %i-color palette.", + begin_c, end_c, nb_colors); + int nb_deltas = 0; + bool delta_used = false; + std::set> + candidate_palette; // ordered lexicographically + std::vector> candidate_palette_imageorder; + std::vector color(nb); + std::vector color_with_error(nb); + std::vector p_in(nb); + + if (lossy) { + // Count color frequency for colors that make a cross. 
+ std::map, size_t> color_freq_map; + for (size_t y = 1; y + 1 < h; y++) { + for (uint32_t c = 0; c < nb; c++) { + p_in[c] = input.channel[begin_c + c].Row(y); + } + for (size_t x = 1; x + 1 < w; x++) { + for (uint32_t c = 0; c < nb; c++) { + color[c] = p_in[c][x]; + } + int offsets[4][2] = {{1, 0}, {-1, 0}, {0, 1}, {0, -1}}; + bool makes_cross = true; + for (int i = 0; i < 4 && makes_cross; ++i) { + int dx = offsets[i][0]; + int dy = offsets[i][1]; + for (uint32_t c = 0; c < nb && makes_cross; c++) { + if (input.channel[begin_c + c].Row(y + dy)[x + dx] != color[c]) { + makes_cross = false; + } + } + } + if (makes_cross) color_freq_map[color] += 1; + } + } + // Add colors satisfying frequency condition to the palette. + constexpr float kImageFraction = 0.01f; + size_t color_frequency_lower_bound = 5 + input.h * input.w * kImageFraction; + for (const auto &color_freq : color_freq_map) { + if (color_freq.second > color_frequency_lower_bound) { + candidate_palette.insert(color_freq.first); + candidate_palette_imageorder.push_back(color_freq.first); + } + } + } + + for (size_t y = 0; y < h; y++) { + for (uint32_t c = 0; c < nb; c++) { + p_in[c] = input.channel[begin_c + c].Row(y); + } + for (size_t x = 0; x < w; x++) { + if (lossy && candidate_palette.size() >= nb_colors) break; + for (uint32_t c = 0; c < nb; c++) { + color[c] = p_in[c][x]; + } + const bool new_color = candidate_palette.insert(color).second; + if (new_color) { + candidate_palette_imageorder.push_back(color); + } + if (candidate_palette.size() > nb_colors) { + return false; // too many colors + } + } + } + + nb_colors = candidate_palette.size(); + JXL_DEBUG_V(6, "Channels %i-%i can be represented using a %i-color palette.", + begin_c, end_c, nb_colors); + + Channel pch(nb_colors, nb); + pch.hshift = -1; + int x = 0; + pixel_type *JXL_RESTRICT p_palette = pch.Row(0); + intptr_t onerow = pch.plane.PixelsPerRow(); + intptr_t onerow_image = input.channel[begin_c].plane.PixelsPerRow(); + const int bit_depth 
= input.bitdepth; + if (ordered) { + JXL_DEBUG_V(7, "Palette of %i colors, using lexicographic order", + nb_colors); + for (auto pcol : candidate_palette) { + JXL_DEBUG_V(9, " Color %i : ", x); + for (size_t i = 0; i < nb; i++) { + p_palette[i * onerow + x] = pcol[i]; + } + for (size_t i = 0; i < nb; i++) { + JXL_DEBUG_V(9, "%i ", pcol[i]); + } + x++; + } + } else { + JXL_DEBUG_V(7, "Palette of %i colors, using image order", nb_colors); + for (auto pcol : candidate_palette_imageorder) { + JXL_DEBUG_V(9, " Color %i : ", x); + for (size_t i = 0; i < nb; i++) p_palette[i * onerow + x] = pcol[i]; + for (size_t i = 0; i < nb; i++) JXL_DEBUG_V(9, "%i ", pcol[i]); + x++; + } + } + std::vector wp_states; + for (size_t c = 0; c < nb; c++) { + wp_states.emplace_back(wp_header, w, h); + } + std::vector p_quant(nb); + // Three rows of error for dithering: y to y + 2. + // Each row has two pixels of padding in the ends, which is + // beneficial for both precision and encoding speed. + std::vector> error_row[3]; + if (lossy) { + for (int i = 0; i < 3; ++i) { + error_row[i].resize(nb); + for (size_t c = 0; c < nb; ++c) { + error_row[i][c].resize(w + 4); + } + } + } + for (size_t y = 0; y < h; y++) { + for (size_t c = 0; c < nb; c++) { + p_in[c] = input.channel[begin_c + c].Row(y); + if (lossy) p_quant[c] = quantized_input.channel[c].Row(y); + } + pixel_type *JXL_RESTRICT p = input.channel[begin_c].Row(y); + for (size_t x = 0; x < w; x++) { + int index; + if (!lossy) { + for (size_t c = 0; c < nb; c++) color[c] = p_in[c][x]; + // Exact search. 
+ for (index = 0; static_cast(index) < nb_colors; index++) { + bool found = true; + for (size_t c = 0; c < nb; c++) { + if (color[c] != p_palette[c * onerow + index]) { + found = false; + break; + } + } + if (found) break; + } + if (index < nb_deltas) { + delta_used = true; + } + } else { + for (size_t c = 0; c < nb; c++) { + color_with_error[c] = p_in[c][x] + error_row[0][c][x + 2]; + color[c] = Clamp1(lroundf(color_with_error[c]), 0l, + (1l << input.bitdepth) - 1); + } + float best_distance = std::numeric_limits::infinity(); + int best_index = 0; + bool best_is_delta = false; + std::vector best_val(nb, 0); + std::vector quantized_val(nb); + std::vector predictions(nb); + for (size_t c = 0; c < nb; ++c) { + predictions[c] = PredictNoTreeWP(w, p_quant[c] + x, onerow_image, x, + y, predictor, &wp_states[c]) + .guess; + } + const auto TryIndex = [&](const int index) { + for (size_t c = 0; c < nb; c++) { + quantized_val[c] = palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/nb_colors, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + if (index < nb_deltas) { + quantized_val[c] += predictions[c]; + } + } + const float color_distance = + 32 * + palette_internal::ColorDistance(color_with_error, quantized_val); + float index_penalty = 0; + if (index == -1) { + index_penalty = -124; + } else if (index < static_cast(nb_colors)) { + index_penalty = 2 * std::abs(index); + } else if (index < static_cast(nb_colors) + + palette_internal::kLargeCubeOffset) { + index_penalty = 70; + } else { + index_penalty = 256; + } + index_penalty *= 1LL << std::max(2 * (bit_depth - 8), 0); + const float distance = color_distance + index_penalty; + if (distance < best_distance) { + best_distance = distance; + best_index = index; + best_is_delta = index < nb_deltas; + best_val.swap(quantized_val); + } + }; + for (index = palette_internal::kMinImplicitPaletteIndex; + index < static_cast(nb_colors); index++) { + TryIndex(index); + } + 
TryIndex(palette_internal::QuantizeColorToImplicitPaletteIndex( + color, nb_colors, bit_depth, + /*high_quality=*/false)); + if (palette_internal::kEncodeToHighQualityImplicitPalette) { + TryIndex(palette_internal::QuantizeColorToImplicitPaletteIndex( + color, nb_colors, bit_depth, + /*high_quality=*/true)); + } + index = best_index; + delta_used |= best_is_delta; + for (size_t c = 0; c < nb; ++c) { + wp_states[c].UpdateErrors(best_val[c], x, y, w); + p_quant[c][x] = best_val[c]; + } + float len_error = 0; + for (size_t c = 0; c < nb; ++c) { + float local_error = color_with_error[c] - best_val[c]; + len_error += local_error * local_error; + } + len_error = sqrt(len_error); + float modulate = 1.0; + int len_limit = 38 << std::max(0, bit_depth - 8); + if (len_error > len_limit) { + modulate *= len_limit / len_error; + } + for (size_t c = 0; c < nb; ++c) { + float local_error = (color_with_error[c] - best_val[c]); + float total_error = 0.65 * local_error; + + // If the neighboring pixels have some error in the opposite + // direction of total_error, cancel some or all of it out before + // spreading among them. 
+ constexpr int offsets[12][2] = {{1, 2}, {0, 3}, {0, 4}, {1, 1}, + {1, 3}, {2, 2}, {1, 0}, {1, 4}, + {2, 1}, {2, 3}, {2, 0}, {2, 4}}; + float total_available = 0; + int n = 0; + for (int i = 0; i < 11; ++i) { + const int row = offsets[i][0]; + const int col = offsets[i][1]; + if (std::signbit(error_row[row][c][x + col]) != + std::signbit(total_error)) { + total_available += error_row[row][c][x + col]; + n++; + } + } + float weight = + std::abs(total_error) / (std::abs(total_available) + 1e-3); + weight = std::min(weight, 1.0f); + for (int i = 0; i < 11; ++i) { + const int row = offsets[i][0]; + const int col = offsets[i][1]; + if (std::signbit(error_row[row][c][x + col]) != + std::signbit(total_error)) { + total_error += weight * error_row[row][c][x + col]; + error_row[row][c][x + col] *= (1 - weight); + } + } + total_error *= modulate; + const float remaining_error = (1.0f / 14.) * total_error; + error_row[0][c][x + 3] += 2 * remaining_error; + error_row[0][c][x + 4] += remaining_error; + error_row[1][c][x + 0] += remaining_error; + for (int i = 0; i < 5; ++i) { + error_row[1][c][x + i] += remaining_error; + error_row[2][c][x + i] += remaining_error; + } + } + } + p[x] = index; + } + if (lossy) { + for (size_t c = 0; c < nb; ++c) { + error_row[0][c].swap(error_row[1][c]); + error_row[1][c].swap(error_row[2][c]); + std::fill(error_row[2][c].begin(), error_row[2][c].end(), 0.f); + } + } + } + if (!delta_used) { + predictor = Predictor::Zero; + } + input.nb_meta_channels++; + input.channel.erase(input.channel.begin() + begin_c + 1, + input.channel.begin() + end_c + 1); + input.channel.insert(input.channel.begin(), std::move(pch)); + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.h new file mode 100644 index 0000000000..3a0dbd97dc --- /dev/null +++ 
b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_palette.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_ +#define LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_ + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +Status FwdPalette(Image &input, uint32_t begin_c, uint32_t end_c, + uint32_t &nb_colors, bool ordered, bool lossy, + Predictor &predictor, const weighted::Header &wp_header); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc new file mode 100644 index 0000000000..81ba7e6433 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.cc @@ -0,0 +1,71 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/enc_rct.h" + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" // CheckEqualChannels + +namespace jxl { + +Status FwdRCT(Image& input, size_t begin_c, size_t rct_type) { + JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2)); + if (rct_type == 0) { // noop + return false; + } + // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR + int permutation = rct_type / 7; + // 0-5 values have the low bit corresponding to Third and the high bits + // corresponding to Second. 6 corresponds to YCoCg. 
+ // + // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird + // + // Third: 0=nop, 1=SubtractFirst + int custom = rct_type % 7; + size_t m = begin_c; + size_t w = input.channel[m + 0].w; + size_t h = input.channel[m + 0].h; + int second = (custom % 7) >> 1; + int third = (custom % 7) & 1; + for (size_t y = 0; y < h; y++) { + const pixel_type* in0 = input.channel[m + (permutation % 3)].Row(y); + const pixel_type* in1 = + input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); + const pixel_type* in2 = + input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); + pixel_type* out0 = input.channel[m].Row(y); + pixel_type* out1 = input.channel[m + 1].Row(y); + pixel_type* out2 = input.channel[m + 2].Row(y); + for (size_t x = 0; x < w; x++) { + if (custom == 6) { + pixel_type R = in0[x]; + pixel_type G = in1[x]; + pixel_type B = in2[x]; + out1[x] = R - B; + pixel_type tmp = B + (out1[x] >> 1); + out2[x] = G - tmp; + out0[x] = tmp + (out2[x] >> 1); + } else { + pixel_type First = in0[x]; + pixel_type Second = in1[x]; + pixel_type Third = in2[x]; + if (second == 1) { + Second = Second - First; + } else if (second == 2) { + Second = Second - ((First + Third) >> 1); + } + if (third) Third = Third - First; + out0[x] = First; + out1[x] = Second; + out2[x] = Third; + } + } + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.h new file mode 100644 index 0000000000..8a412393d4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_rct.h @@ -0,0 +1,17 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_ +#define LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_ + +#include "lib/jxl/modular/modular_image.h" + +namespace jxl { + +Status FwdRCT(Image &input, size_t begin_c, size_t rct_type); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc new file mode 100644 index 0000000000..7a3219e677 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.cc @@ -0,0 +1,140 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/enc_squeeze.h" + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/squeeze.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +void FwdHSqueeze(Image &input, int c, int rc) { + const Channel &chin = input.channel[c]; + + JXL_DEBUG_V(4, "Doing horizontal squeeze of channel %i to new channel %i", c, + rc); + + Channel chout((chin.w + 1) / 2, chin.h, chin.hshift + 1, chin.vshift); + Channel chout_residual(chin.w - chout.w, chout.h, chin.hshift + 1, + chin.vshift); + + for (size_t y = 0; y < chout.h; y++) { + const pixel_type *JXL_RESTRICT p_in = chin.Row(y); + pixel_type *JXL_RESTRICT p_out = chout.Row(y); + pixel_type *JXL_RESTRICT p_res = chout_residual.Row(y); + for (size_t x = 0; x < chout_residual.w; x++) { + pixel_type A = p_in[x * 2]; + pixel_type B = p_in[x * 2 + 1]; + pixel_type avg = (A + B + (A > B)) >> 1; + p_out[x] = avg; + + pixel_type diff = A - B; + + pixel_type next_avg = avg; + if (x + 1 < chout_residual.w) { + next_avg = (p_in[x * 2 + 2] + p_in[x * 2 + 3] + + (p_in[x * 2 + 2] > p_in[x * 2 
+ 3])) >> + 1; // which will be chout.value(y,x+1) + } else if (chin.w & 1) + next_avg = p_in[x * 2 + 2]; + pixel_type left = (x > 0 ? p_in[x * 2 - 1] : avg); + pixel_type tendency = SmoothTendency(left, avg, next_avg); + + p_res[x] = diff - tendency; + } + if (chin.w & 1) { + int x = chout.w - 1; + p_out[x] = p_in[x * 2]; + } + } + input.channel[c] = std::move(chout); + input.channel.insert(input.channel.begin() + rc, std::move(chout_residual)); +} + +void FwdVSqueeze(Image &input, int c, int rc) { + const Channel &chin = input.channel[c]; + + JXL_DEBUG_V(4, "Doing vertical squeeze of channel %i to new channel %i", c, + rc); + + Channel chout(chin.w, (chin.h + 1) / 2, chin.hshift, chin.vshift + 1); + Channel chout_residual(chin.w, chin.h - chout.h, chin.hshift, + chin.vshift + 1); + intptr_t onerow_in = chin.plane.PixelsPerRow(); + for (size_t y = 0; y < chout_residual.h; y++) { + const pixel_type *JXL_RESTRICT p_in = chin.Row(y * 2); + pixel_type *JXL_RESTRICT p_out = chout.Row(y); + pixel_type *JXL_RESTRICT p_res = chout_residual.Row(y); + for (size_t x = 0; x < chout.w; x++) { + pixel_type A = p_in[x]; + pixel_type B = p_in[x + onerow_in]; + pixel_type avg = (A + B + (A > B)) >> 1; + p_out[x] = avg; + + pixel_type diff = A - B; + + pixel_type next_avg = avg; + if (y + 1 < chout_residual.h) { + next_avg = (p_in[x + 2 * onerow_in] + p_in[x + 3 * onerow_in] + + (p_in[x + 2 * onerow_in] > p_in[x + 3 * onerow_in])) >> + 1; // which will be chout.value(y+1,x) + } else if (chin.h & 1) { + next_avg = p_in[x + 2 * onerow_in]; + } + pixel_type top = + (y > 0 ? 
p_in[static_cast(x) - onerow_in] : avg); + pixel_type tendency = SmoothTendency(top, avg, next_avg); + + p_res[x] = diff - tendency; + } + } + if (chin.h & 1) { + size_t y = chout.h - 1; + const pixel_type *p_in = chin.Row(y * 2); + pixel_type *p_out = chout.Row(y); + for (size_t x = 0; x < chout.w; x++) { + p_out[x] = p_in[x]; + } + } + input.channel[c] = std::move(chout); + input.channel.insert(input.channel.begin() + rc, std::move(chout_residual)); +} + +Status FwdSqueeze(Image &input, std::vector parameters, + ThreadPool *pool) { + if (parameters.empty()) { + DefaultSqueezeParameters(¶meters, input); + } + + for (size_t i = 0; i < parameters.size(); i++) { + JXL_RETURN_IF_ERROR( + CheckMetaSqueezeParams(parameters[i], input.channel.size())); + bool horizontal = parameters[i].horizontal; + bool in_place = parameters[i].in_place; + uint32_t beginc = parameters[i].begin_c; + uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; + uint32_t offset; + if (in_place) { + offset = endc + 1; + } else { + offset = input.channel.size(); + } + for (uint32_t c = beginc; c <= endc; c++) { + if (horizontal) { + FwdHSqueeze(input, c, offset + c - beginc); + } else { + FwdVSqueeze(input, c, offset + c - beginc); + } + } + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.h new file mode 100644 index 0000000000..39b001017b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_squeeze.h @@ -0,0 +1,20 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_ +#define LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_ + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +Status FwdSqueeze(Image &input, std::vector parameters, + ThreadPool *pool); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc new file mode 100644 index 0000000000..2d7c2949e3 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.cc @@ -0,0 +1,46 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/enc_transform.h" + +#include "lib/jxl/modular/transform/enc_palette.h" +#include "lib/jxl/modular/transform/enc_rct.h" +#include "lib/jxl/modular/transform/enc_squeeze.h" + +namespace jxl { + +Status TransformForward(Transform &t, Image &input, + const weighted::Header &wp_header, ThreadPool *pool) { + switch (t.id) { + case TransformId::kRCT: + return FwdRCT(input, t.begin_c, t.rct_type); + case TransformId::kSqueeze: + return FwdSqueeze(input, t.squeezes, pool); + case TransformId::kPalette: + return FwdPalette(input, t.begin_c, t.begin_c + t.num_c - 1, t.nb_colors, + t.ordered_palette, t.lossy_palette, t.predictor, + wp_header); + default: + return JXL_FAILURE("Unknown transformation (ID=%u)", + static_cast(t.id)); + } +} + +void compute_minmax(const Channel &ch, pixel_type *min, pixel_type *max) { + pixel_type realmin = std::numeric_limits::max(); + pixel_type realmax = std::numeric_limits::min(); + for (size_t y = 0; y < ch.h; y++) { + const pixel_type *JXL_RESTRICT p = ch.Row(y); + for (size_t x = 0; x < ch.w; x++) { + if 
(p[x] < realmin) realmin = p[x]; + if (p[x] > realmax) realmax = p[x]; + } + } + + if (min) *min = realmin; + if (max) *max = realmax; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.h new file mode 100644 index 0000000000..07659e1b0a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/enc_transform.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_ +#define LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_ + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +Status TransformForward(Transform &t, Image &input, + const weighted::Header &wp_header, ThreadPool *pool); + +void compute_minmax(const Channel &ch, pixel_type *min, pixel_type *max); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc new file mode 100644 index 0000000000..e63013a38c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/jxl_transform.cc @@ -0,0 +1,92 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/modular/transform/transform.h" + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/palette.h" +#include "lib/jxl/modular/transform/rct.h" +#include "lib/jxl/modular/transform/squeeze.h" + +namespace jxl { + +SqueezeParams::SqueezeParams() { Bundle::Init(this); } +Transform::Transform(TransformId id) { + Bundle::Init(this); + this->id = id; +} + +Status Transform::Inverse(Image &input, const weighted::Header &wp_header, + ThreadPool *pool) { + switch (id) { + case TransformId::kRCT: + return InvRCT(input, begin_c, rct_type); + case TransformId::kSqueeze: + return InvSqueeze(input, squeezes, pool); + case TransformId::kPalette: + return InvPalette(input, begin_c, nb_colors, nb_deltas, predictor, + wp_header, pool); + default: + return JXL_FAILURE("Unknown transformation (ID=%u)", + static_cast(id)); + } +} + +Status Transform::MetaApply(Image &input) { + switch (id) { + case TransformId::kRCT: + JXL_DEBUG_V(2, "Transform: kRCT, rct_type=%" PRIu32, rct_type); + return CheckEqualChannels(input, begin_c, begin_c + 2); + case TransformId::kSqueeze: + JXL_DEBUG_V(2, "Transform: kSqueeze:"); +#if JXL_DEBUG_V_LEVEL >= 2 + { + auto squeezes_copy = squeezes; + if (squeezes_copy.empty()) { + DefaultSqueezeParameters(&squeezes_copy, input); + } + for (const auto ¶ms : squeezes_copy) { + JXL_DEBUG_V( + 2, + " squeeze params: horizontal=%d, in_place=%d, begin_c=%" PRIu32 + ", num_c=%" PRIu32, + params.horizontal, params.in_place, params.begin_c, params.num_c); + } + } +#endif + return MetaSqueeze(input, &squeezes); + case TransformId::kPalette: + JXL_DEBUG_V(2, + "Transform: kPalette, begin_c=%" PRIu32 ", num_c=%" PRIu32 + ", nb_colors=%" PRIu32 ", nb_deltas=%" PRIu32, + begin_c, num_c, nb_colors, nb_deltas); + return MetaPalette(input, begin_c, begin_c + num_c - 1, nb_colors, + nb_deltas, lossy_palette); + default: + return JXL_FAILURE("Unknown transformation (ID=%u)", + static_cast(id)); + } +} + 
+Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2) { + if (c1 > image.channel.size() || c2 >= image.channel.size() || c2 < c1) { + return JXL_FAILURE("Invalid channel range"); + } + if (c1 < image.nb_meta_channels && c2 >= image.nb_meta_channels) { + return JXL_FAILURE("Invalid: transforming mix of meta and nonmeta"); + } + const auto &ch1 = image.channel[c1]; + for (size_t c = c1 + 1; c <= c2; c++) { + const auto &ch2 = image.channel[c]; + if (ch1.w != ch2.w || ch1.h != ch2.h || ch1.hshift != ch2.hshift || + ch1.vshift != ch2.vshift) { + return false; + } + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/palette.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/palette.h new file mode 100644 index 0000000000..da5423afae --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/palette.h @@ -0,0 +1,311 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_ +#define LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_ + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" // CheckEqualChannels + +namespace jxl { + +namespace palette_internal { + +static constexpr int kMaxPaletteLookupTableSize = 1 << 16; + +static constexpr int kCubePow = 3; + +// 5x5x5 color cube for the larger cube. +static constexpr int kLargeCube = 5; + +// Smaller interleaved color cube to fill the holes of the larger cube. 
+static constexpr int kSmallCube = kLargeCube - 1; +// kSmallCube ** kCubePow +static constexpr int kLargeCubeOffset = kSmallCube * kSmallCube * kSmallCube; + +static constexpr pixel_type Scale(int value, int bit_depth, int denom) { + return (value * ((static_cast(1) << bit_depth) - 1)) / denom; +} + +// The purpose of this function is solely to extend the interpretation of +// palette indices to implicit values. If index < nb_deltas, indicating that the +// result is a delta palette entry, it is the responsibility of the caller to +// treat it as such. +static pixel_type GetPaletteValue(const pixel_type *const palette, int index, + const size_t c, const int palette_size, + const int onerow, const int bit_depth) { + if (index < 0) { + static constexpr std::array, 72> kDeltaPalette = { + { + {0, 0, 0}, {4, 4, 4}, {11, 0, 0}, {0, 0, -13}, + {0, -12, 0}, {-10, -10, -10}, {-18, -18, -18}, {-27, -27, -27}, + {-18, -18, 0}, {0, 0, -32}, {-32, 0, 0}, {-37, -37, -37}, + {0, -32, -32}, {24, 24, 45}, {50, 50, 50}, {-45, -24, -24}, + {-24, -45, -45}, {0, -24, -24}, {-34, -34, 0}, {-24, 0, -24}, + {-45, -45, -24}, {64, 64, 64}, {-32, 0, -32}, {0, -32, 0}, + {-32, 0, 32}, {-24, -45, -24}, {45, 24, 45}, {24, -24, -45}, + {-45, -24, 24}, {80, 80, 80}, {64, 0, 0}, {0, 0, -64}, + {0, -64, -64}, {-24, -24, 45}, {96, 96, 96}, {64, 64, 0}, + {45, -24, -24}, {34, -34, 0}, {112, 112, 112}, {24, -45, -45}, + {45, 45, -24}, {0, -32, 32}, {24, -24, 45}, {0, 96, 96}, + {45, -24, 24}, {24, -45, -24}, {-24, -45, 24}, {0, -64, 0}, + {96, 0, 0}, {128, 128, 128}, {64, 0, 64}, {144, 144, 144}, + {96, 96, 0}, {-36, -36, 36}, {45, -24, -45}, {45, -45, -24}, + {0, 0, -96}, {0, 128, 128}, {0, 96, 0}, {45, 24, -45}, + {-128, 0, 0}, {24, -45, 24}, {-45, 24, -45}, {64, 0, -64}, + {64, -64, -64}, {96, 0, 96}, {45, -45, 24}, {24, 45, -45}, + {64, 64, -64}, {128, 128, 0}, {0, 0, -128}, {-24, 45, -45}, + }}; + if (c >= kDeltaPalette[0].size()) { + return 0; + } + // Do not open the brackets, otherwise 
INT32_MIN negation could overflow. + index = -(index + 1); + index %= 1 + 2 * (kDeltaPalette.size() - 1); + static constexpr int kMultiplier[] = {-1, 1}; + pixel_type result = + kDeltaPalette[((index + 1) >> 1)][c] * kMultiplier[index & 1]; + if (bit_depth > 8) { + result *= static_cast(1) << (bit_depth - 8); + } + return result; + } else if (palette_size <= index && index < palette_size + kLargeCubeOffset) { + if (c >= kCubePow) return 0; + index -= palette_size; + if (c > 0) { + int divisor = kSmallCube; + for (size_t i = 1; i < c; ++i) { + divisor *= kSmallCube; + } + index /= divisor; + } + return Scale(index % kSmallCube, bit_depth, kSmallCube) + + (1 << (std::max(0, bit_depth - 3))); + } else if (palette_size + kLargeCubeOffset <= index) { + if (c >= kCubePow) return 0; + index -= palette_size + kLargeCubeOffset; + // TODO(eustas): should we take care of ambiguity created by + // index >= kLargeCube ** 3 ? + if (c > 0) { + int divisor = kLargeCube; + for (size_t i = 1; i < c; ++i) { + divisor *= kLargeCube; + } + index /= divisor; + } + return Scale(index % kLargeCube, bit_depth, kLargeCube - 1); + } + + return palette[c * onerow + static_cast(index)]; +} + +} // namespace palette_internal + +static Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors, + uint32_t nb_deltas, Predictor predictor, + const weighted::Header &wp_header, ThreadPool *pool) { + if (input.nb_meta_channels < 1) { + return JXL_FAILURE("Error: Palette transform without palette."); + } + std::atomic num_errors{0}; + int nb = input.channel[0].h; + uint32_t c0 = begin_c + 1; + if (c0 >= input.channel.size()) { + return JXL_FAILURE("Channel is out of range."); + } + size_t w = input.channel[c0].w; + size_t h = input.channel[c0].h; + if (nb < 1) return JXL_FAILURE("Corrupted transforms"); + for (int i = 1; i < nb; i++) { + input.channel.insert( + input.channel.begin() + c0 + 1, + Channel(w, h, input.channel[c0].hshift, input.channel[c0].vshift)); + } + const Channel &palette = 
input.channel[0]; + const pixel_type *JXL_RESTRICT p_palette = input.channel[0].Row(0); + intptr_t onerow = input.channel[0].plane.PixelsPerRow(); + intptr_t onerow_image = input.channel[c0].plane.PixelsPerRow(); + const int bit_depth = input.bitdepth; + + if (w == 0) { + // Nothing to do. + // Avoid touching "empty" channels with non-zero height. + } else if (nb_deltas == 0 && predictor == Predictor::Zero) { + if (nb == 1) { + RunOnPool( + pool, 0, h, ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t y = task; + pixel_type *p = input.channel[c0].Row(y); + for (size_t x = 0; x < w; x++) { + const int index = Clamp1(p[x], 0, (pixel_type)palette.w - 1); + p[x] = palette_internal::GetPaletteValue( + p_palette, index, /*c=*/0, + /*palette_size=*/palette.w, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + } + }, + "UndoChannelPalette"); + } else { + RunOnPool( + pool, 0, h, ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t y = task; + std::vector p_out(nb); + const pixel_type *p_index = input.channel[c0].Row(y); + for (int c = 0; c < nb; c++) + p_out[c] = input.channel[c0 + c].Row(y); + for (size_t x = 0; x < w; x++) { + const int index = p_index[x]; + for (int c = 0; c < nb; c++) { + p_out[c][x] = palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/palette.w, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + } + } + }, + "UndoPalette"); + } + } else { + // Parallelized per channel. 
+ ImageI indices = CopyImage(input.channel[c0].plane); + if (predictor == Predictor::Weighted) { + RunOnPool( + pool, 0, nb, ThreadPool::SkipInit(), + [&](size_t c, size_t _) { + Channel &channel = input.channel[c0 + c]; + weighted::State wp_state(wp_header, channel.w, channel.h); + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + const pixel_type *JXL_RESTRICT idx = indices.Row(y); + for (size_t x = 0; x < channel.w; x++) { + int index = idx[x]; + pixel_type_w val = 0; + const pixel_type palette_entry = + palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/palette.w, /*onerow=*/onerow, + /*bit_depth=*/bit_depth); + if (index < static_cast(nb_deltas)) { + PredictionResult pred = + PredictNoTreeWP(channel.w, p + x, onerow_image, x, y, + predictor, &wp_state); + val = pred.guess + palette_entry; + } else { + val = palette_entry; + } + p[x] = val; + wp_state.UpdateErrors(p[x], x, y, channel.w); + } + } + }, + "UndoDeltaPaletteWP"); + } else if (predictor == Predictor::Gradient) { + // Gradient is the most common predictor for now. This special case gives + // about 20% extra speed. + RunOnPool( + pool, 0, nb, ThreadPool::SkipInit(), + [&](size_t c, size_t _) { + Channel &channel = input.channel[c0 + c]; + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + const pixel_type *JXL_RESTRICT idx = indices.Row(y); + for (size_t x = 0; x < channel.w; x++) { + int index = idx[x]; + pixel_type val = 0; + const pixel_type palette_entry = + palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/palette.w, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + if (index < static_cast(nb_deltas)) { + pixel_type left = + x ? p[x - 1] : (y ? *(p + x - onerow_image) : 0); + pixel_type top = y ? *(p + x - onerow_image) : left; + pixel_type topleft = + x && y ? 
*(p + x - 1 - onerow_image) : left; + val = PixelAdd(ClampedGradient(left, top, topleft), + palette_entry); + } else { + val = palette_entry; + } + p[x] = val; + } + } + }, + "UndoDeltaPaletteGradient"); + } else { + RunOnPool( + pool, 0, nb, ThreadPool::SkipInit(), + [&](size_t c, size_t _) { + Channel &channel = input.channel[c0 + c]; + for (size_t y = 0; y < channel.h; y++) { + pixel_type *JXL_RESTRICT p = channel.Row(y); + const pixel_type *JXL_RESTRICT idx = indices.Row(y); + for (size_t x = 0; x < channel.w; x++) { + int index = idx[x]; + pixel_type_w val = 0; + const pixel_type palette_entry = + palette_internal::GetPaletteValue( + p_palette, index, /*c=*/c, + /*palette_size=*/palette.w, + /*onerow=*/onerow, /*bit_depth=*/bit_depth); + if (index < static_cast(nb_deltas)) { + PredictionResult pred = PredictNoTreeNoWP( + channel.w, p + x, onerow_image, x, y, predictor); + val = pred.guess + palette_entry; + } else { + val = palette_entry; + } + p[x] = val; + } + } + }, + "UndoDeltaPaletteNoWP"); + } + } + if (c0 >= input.nb_meta_channels) { + // Palette was done on normal channels + input.nb_meta_channels--; + } else { + // Palette was done on metachannels + JXL_ASSERT(static_cast(input.nb_meta_channels) >= 2 - nb); + input.nb_meta_channels -= 2 - nb; + JXL_ASSERT(begin_c + nb - 1 < input.nb_meta_channels); + } + input.channel.erase(input.channel.begin(), input.channel.begin() + 1); + return num_errors.load(std::memory_order_relaxed) == 0; +} + +static Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c, + uint32_t nb_colors, uint32_t nb_deltas, bool lossy) { + JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c)); + + size_t nb = end_c - begin_c + 1; + if (begin_c >= input.nb_meta_channels) { + // Palette was done on normal channels + input.nb_meta_channels++; + } else { + // Palette was done on metachannels + JXL_ASSERT(end_c < input.nb_meta_channels); + // we remove nb-1 metachannels and add one + input.nb_meta_channels += 2 - nb; + } 
+ input.channel.erase(input.channel.begin() + begin_c + 1, + input.channel.begin() + end_c + 1); + Channel pch(nb_colors + nb_deltas, nb); + pch.hshift = -1; + input.channel.insert(input.channel.begin(), std::move(pch)); + return true; +} + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/rct.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/rct.h new file mode 100644 index 0000000000..e6434de1d2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/rct.h @@ -0,0 +1,103 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_RCT_H_ +#define LIB_JXL_MODULAR_TRANSFORM_RCT_H_ + +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" // CheckEqualChannels + +namespace jxl { + +template +void InvRCTRow(const pixel_type* in0, const pixel_type* in1, + const pixel_type* in2, pixel_type* out0, pixel_type* out1, + pixel_type* out2, size_t w) { + static_assert(transform_type >= 0 && transform_type < 7, + "Invalid transform type"); + int second = transform_type >> 1; + int third = transform_type & 1; + for (size_t x = 0; x < w; x++) { + if (transform_type == 6) { + pixel_type Y = in0[x]; + pixel_type Co = in1[x]; + pixel_type Cg = in2[x]; + pixel_type tmp = PixelAdd(Y, -(Cg >> 1)); + pixel_type G = PixelAdd(Cg, tmp); + pixel_type B = PixelAdd(tmp, -(Co >> 1)); + pixel_type R = PixelAdd(B, Co); + out0[x] = R; + out1[x] = G; + out2[x] = B; + } else { + pixel_type First = in0[x]; + pixel_type Second = in1[x]; + pixel_type Third = in2[x]; + if (third) Third = PixelAdd(Third, First); + if (second == 1) { + Second = PixelAdd(Second, First); + } else if (second == 2) { + Second = 
PixelAdd(Second, (PixelAdd(First, Third) >> 1)); + } + out0[x] = First; + out1[x] = Second; + out2[x] = Third; + } + } +} + +Status InvRCT(Image& input, size_t begin_c, size_t rct_type) { + JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2)); + size_t m = begin_c; + Channel& c0 = input.channel[m + 0]; + size_t w = c0.w; + size_t h = c0.h; + if (rct_type == 0) { // noop + return true; + } + // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR + int permutation = rct_type / 7; + JXL_CHECK(permutation < 6); + // 0-5 values have the low bit corresponding to Third and the high bits + // corresponding to Second. 6 corresponds to YCoCg. + // + // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird + // + // Third: 0=nop, 1=SubtractFirst + int custom = rct_type % 7; + // Special case: permute-only. Swap channels around. + if (custom == 0) { + Channel ch0 = std::move(input.channel[m]); + Channel ch1 = std::move(input.channel[m + 1]); + Channel ch2 = std::move(input.channel[m + 2]); + input.channel[m + (permutation % 3)] = std::move(ch0); + input.channel[m + ((permutation + 1 + permutation / 3) % 3)] = + std::move(ch1); + input.channel[m + ((permutation + 2 - permutation / 3) % 3)] = + std::move(ch2); + return true; + } + constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = { + InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>, + InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>}; + for (size_t y = 0; y < h; y++) { + const pixel_type* in0 = input.channel[m].Row(y); + const pixel_type* in1 = input.channel[m + 1].Row(y); + const pixel_type* in2 = input.channel[m + 2].Row(y); + pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y); + pixel_type* out1 = + input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y); + pixel_type* out2 = + input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y); + inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w); + } + return true; +} + +} // namespace jxl + +#endif // 
LIB_JXL_MODULAR_TRANSFORM_RCT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc new file mode 100644 index 0000000000..3edbfc9cd1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.cc @@ -0,0 +1,329 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/modular/transform/squeeze.h" + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" + +namespace jxl { + +void InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { + JXL_ASSERT(c < input.channel.size()); + JXL_ASSERT(rc < input.channel.size()); + const Channel &chin = input.channel[c]; + const Channel &chin_residual = input.channel[rc]; + // These must be valid since we ran MetaApply already. + JXL_ASSERT(chin.w == DivCeil(chin.w + chin_residual.w, 2)); + JXL_ASSERT(chin.h == chin_residual.h); + + if (chin_residual.w == 0) { + // Short-circuit: output channel has same dimensions as input. + input.channel[c].hshift--; + return; + } + + // Note: chin.w >= chin_residual.w and at most 1 different. + Channel chout(chin.w + chin_residual.w, chin.h, chin.hshift - 1, chin.vshift); + JXL_DEBUG_V(4, + "Undoing horizontal squeeze of channel %i using residuals in " + "channel %i (going from width %zu to %zu)", + c, rc, chin.w, chout.w); + + if (chin_residual.h == 0) { + // Short-circuit: channel with no pixels. 
+ input.channel[c] = std::move(chout); + return; + } + + RunOnPool( + pool, 0, chin.h, ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t y = task; + const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); + const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); + pixel_type *JXL_RESTRICT p_out = chout.Row(y); + + // special case for x=0 so we don't have to check x>0 + pixel_type_w avg = p_avg[0]; + pixel_type_w next_avg = (1 < chin.w ? p_avg[1] : avg); + pixel_type_w tendency = SmoothTendency(avg, avg, next_avg); + pixel_type_w diff = p_residual[0] + tendency; + pixel_type_w A = + ((avg * 2) + diff + (diff > 0 ? -(diff & 1) : (diff & 1))) >> 1; + pixel_type_w B = A - diff; + p_out[0] = A; + p_out[1] = B; + + for (size_t x = 1; x < chin_residual.w; x++) { + pixel_type_w diff_minus_tendency = p_residual[x]; + pixel_type_w avg = p_avg[x]; + pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); + pixel_type_w left = p_out[(x << 1) - 1]; + pixel_type_w tendency = SmoothTendency(left, avg, next_avg); + pixel_type_w diff = diff_minus_tendency + tendency; + pixel_type_w A = + ((avg * 2) + diff + (diff > 0 ? -(diff & 1) : (diff & 1))) >> 1; + p_out[x << 1] = A; + pixel_type_w B = A - diff; + p_out[(x << 1) + 1] = B; + } + if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; + }, + "InvHorizontalSqueeze"); + input.channel[c] = std::move(chout); +} + +void InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { + JXL_ASSERT(c < input.channel.size()); + JXL_ASSERT(rc < input.channel.size()); + const Channel &chin = input.channel[c]; + const Channel &chin_residual = input.channel[rc]; + // These must be valid since we ran MetaApply already. + JXL_ASSERT(chin.h == DivCeil(chin.h + chin_residual.h, 2)); + JXL_ASSERT(chin.w == chin_residual.w); + + if (chin_residual.h == 0) { + // Short-circuit: output channel has same dimensions as input. 
+ input.channel[c].vshift--; + return; + } + + // Note: chin.h >= chin_residual.h and at most 1 different. + Channel chout(chin.w, chin.h + chin_residual.h, chin.hshift, chin.vshift - 1); + JXL_DEBUG_V( + 4, + "Undoing vertical squeeze of channel %i using residuals in channel " + "%i (going from height %zu to %zu)", + c, rc, chin.h, chout.h); + + if (chin_residual.w == 0) { + // Short-circuit: channel with no pixels. + input.channel[c] = std::move(chout); + return; + } + + intptr_t onerow_in = chin.plane.PixelsPerRow(); + intptr_t onerow_out = chout.plane.PixelsPerRow(); + constexpr int kColsPerThread = 64; + RunOnPool( + pool, 0, DivCeil(chin.w, kColsPerThread), ThreadPool::SkipInit(), + [&](const int task, const int thread) { + const size_t x0 = task * kColsPerThread; + const size_t x1 = std::min((size_t)(task + 1) * kColsPerThread, chin.w); + // We only iterate up to std::min(chin_residual.h, chin.h) which is + // always chin_residual.h. + for (size_t y = 0; y < chin_residual.h; y++) { + const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); + const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); + pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1); + for (size_t x = x0; x < x1; x++) { + pixel_type_w diff_minus_tendency = p_residual[x]; + pixel_type_w avg = p_avg[x]; + + pixel_type_w next_avg = avg; + if (y + 1 < chin.h) next_avg = p_avg[x + onerow_in]; + pixel_type_w top = + (y > 0 ? p_out[static_cast(x) - onerow_out] : avg); + pixel_type_w tendency = SmoothTendency(top, avg, next_avg); + pixel_type_w diff = diff_minus_tendency + tendency; + pixel_type_w out = + ((avg * 2) + diff + (diff > 0 ? -(diff & 1) : (diff & 1))) >> 1; + + p_out[x] = out; + // If the chin_residual.h == chin.h, the output has an even number + // of rows so the next line is fine. Otherwise, this loop won't + // write to the last output row which is handled separately. 
+ p_out[x + onerow_out] = p_out[x] - diff; + } + } + }, + "InvVertSqueeze"); + + if (chout.h & 1) { + size_t y = chin.h - 1; + const pixel_type *p_avg = chin.Row(y); + pixel_type *p_out = chout.Row(y << 1); + for (size_t x = 0; x < chin.w; x++) { + p_out[x] = p_avg[x]; + } + } + input.channel[c] = std::move(chout); +} + +void DefaultSqueezeParameters(std::vector *parameters, + const Image &image) { + int nb_channels = image.channel.size() - image.nb_meta_channels; + + parameters->clear(); + size_t w = image.channel[image.nb_meta_channels].w; + size_t h = image.channel[image.nb_meta_channels].h; + JXL_DEBUG_V(7, "Default squeeze parameters for %zux%zu image: ", w, h); + + // do horizontal first on wide images; vertical first on tall images + bool wide = (w > h); + + if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w && + image.channel[image.nb_meta_channels + 1].h == h) { + // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0 + // previews + JXL_DEBUG_V(7, "(4:2:0 chroma), %zux%zu image", w, h); + SqueezeParams params; + // horizontal chroma squeeze + params.horizontal = true; + params.in_place = false; + params.begin_c = image.nb_meta_channels + 1; + params.num_c = 2; + parameters->push_back(params); + params.horizontal = false; + // vertical chroma squeeze + parameters->push_back(params); + } + SqueezeParams params; + params.begin_c = image.nb_meta_channels; + params.num_c = nb_channels; + params.in_place = true; + + if (!wide) { + if (h > JXL_MAX_FIRST_PREVIEW_SIZE) { + params.horizontal = false; + parameters->push_back(params); + h = (h + 1) / 2; + JXL_DEBUG_V(7, "Vertical (%zux%zu), ", w, h); + } + } + while (w > JXL_MAX_FIRST_PREVIEW_SIZE || h > JXL_MAX_FIRST_PREVIEW_SIZE) { + if (w > JXL_MAX_FIRST_PREVIEW_SIZE) { + params.horizontal = true; + parameters->push_back(params); + w = (w + 1) / 2; + JXL_DEBUG_V(7, "Horizontal (%zux%zu), ", w, h); + } + if (h > JXL_MAX_FIRST_PREVIEW_SIZE) { + params.horizontal = false; + 
parameters->push_back(params); + h = (h + 1) / 2; + JXL_DEBUG_V(7, "Vertical (%zux%zu), ", w, h); + } + } + JXL_DEBUG_V(7, "that's it"); +} + +Status CheckMetaSqueezeParams(const SqueezeParams ¶meter, + int num_channels) { + int c1 = parameter.begin_c; + int c2 = parameter.begin_c + parameter.num_c - 1; + if (c1 < 0 || c1 >= num_channels || c2 < 0 || c2 >= num_channels || c2 < c1) { + return JXL_FAILURE("Invalid channel range"); + } + return true; +} + +Status MetaSqueeze(Image &image, std::vector *parameters) { + if (parameters->empty()) { + DefaultSqueezeParameters(parameters, image); + } + + for (size_t i = 0; i < parameters->size(); i++) { + JXL_RETURN_IF_ERROR( + CheckMetaSqueezeParams((*parameters)[i], image.channel.size())); + bool horizontal = (*parameters)[i].horizontal; + bool in_place = (*parameters)[i].in_place; + uint32_t beginc = (*parameters)[i].begin_c; + uint32_t endc = (*parameters)[i].begin_c + (*parameters)[i].num_c - 1; + + uint32_t offset; + if (beginc < image.nb_meta_channels) { + if (endc >= image.nb_meta_channels) { + return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels"); + } + if (!in_place) + return JXL_FAILURE( + "Invalid squeeze: meta channels require in-place residuals"); + image.nb_meta_channels += (*parameters)[i].num_c; + } + if (in_place) { + offset = endc + 1; + } else { + offset = image.channel.size(); + } + for (uint32_t c = beginc; c <= endc; c++) { + if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) { + return JXL_FAILURE("Too many squeezes: shift > 30"); + } + size_t w = image.channel[c].w; + size_t h = image.channel[c].h; + if (horizontal) { + image.channel[c].w = (w + 1) / 2; + image.channel[c].hshift++; + w = w - (w + 1) / 2; + } else { + image.channel[c].h = (h + 1) / 2; + image.channel[c].vshift++; + h = h - (h + 1) / 2; + } + image.channel[c].shrink(); + Channel dummy(w, h); + dummy.hshift = image.channel[c].hshift; + dummy.vshift = image.channel[c].vshift; + + 
image.channel.insert(image.channel.begin() + offset + (c - beginc), + std::move(dummy)); + } + } + return true; +} + +Status InvSqueeze(Image &input, std::vector parameters, + ThreadPool *pool) { + if (parameters.empty()) { + DefaultSqueezeParameters(¶meters, input); + } + + for (int i = parameters.size() - 1; i >= 0; i--) { + JXL_RETURN_IF_ERROR( + CheckMetaSqueezeParams(parameters[i], input.channel.size())); + bool horizontal = parameters[i].horizontal; + bool in_place = parameters[i].in_place; + uint32_t beginc = parameters[i].begin_c; + uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; + uint32_t offset; + if (in_place) { + offset = endc + 1; + } else { + offset = input.channel.size() + beginc - endc - 1; + } + if (beginc < input.nb_meta_channels) { + // This is checked in MetaSqueeze. + JXL_ASSERT(input.nb_meta_channels > parameters[i].num_c); + input.nb_meta_channels -= parameters[i].num_c; + } + + for (uint32_t c = beginc; c <= endc; c++) { + uint32_t rc = offset + c - beginc; + // MetaApply should imply that `rc` is within range, otherwise there's a + // programming bug. + JXL_ASSERT(rc < input.channel.size()); + if ((input.channel[c].w < input.channel[rc].w) || + (input.channel[c].h < input.channel[rc].h)) { + return JXL_FAILURE("Corrupted squeeze transform"); + } + if (horizontal) { + InvHSqueeze(input, c, rc, pool); + } else { + InvVSqueeze(input, c, rc, pool); + } + } + input.channel.erase(input.channel.begin() + offset, + input.channel.begin() + offset + (endc - beginc + 1)); + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.h new file mode 100644 index 0000000000..a2d3afdc6e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/squeeze.h @@ -0,0 +1,94 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_ +#define LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_ + +// Haar-like transform: halves the resolution in one direction +// A B -> (A+B)>>1 in one channel (average) -> same range as +// original channel +// A-B - tendency in a new channel ('residual' needed to make +// the transform reversible) +// -> theoretically range could be 2.5 +// times larger (2 times without the +// 'tendency'), but there should be lots +// of zeroes +// Repeated application (alternating horizontal and vertical squeezes) results +// in downscaling +// +// The default coefficient ordering is low-frequency to high-frequency, as in +// M. Antonini, M. Barlaud, P. Mathieu and I. Daubechies, "Image coding using +// wavelet transform", IEEE Transactions on Image Processing, vol. 1, no. 2, pp. +// 205-220, April 1992, doi: 10.1109/83.136597. + +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/common.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/transform.h" + +#define JXL_MAX_FIRST_PREVIEW_SIZE 8 + +namespace jxl { + +/* + int avg=(A+B)>>1; + int diff=(A-B); + int rA=(diff+(avg<<1)+(diff&1))>>1; + int rB=rA-diff; + +*/ +// |A B|C D|E F| +// p a n p=avg(A,B), a=avg(C,D), n=avg(E,F) +// +// Goal: estimate C-D (avoiding ringing artifacts) +// (ensuring that in smooth areas, a zero residual corresponds to a smooth +// gradient) + +// best estimate for C: (B + 2*a)/3 +// best estimate for D: (n + 3*a)/4 +// best estimate for C-D: 4*B - 3*n - a /12 + +// avoid ringing by 1) only doing this if B <= a <= n or B >= a >= n +// (otherwise, this is not a smooth area and we cannot really estimate C-D) +// 2) making sure that B <= C <= D <= n or B >= C >= D >= n + +inline pixel_type_w SmoothTendency(pixel_type_w B, pixel_type_w a, + pixel_type_w n) { + pixel_type_w diff = 0; + if (B >= a && a >= n) { + 
diff = (4 * B - 3 * n - a + 6) / 12; + // 2C = a<<1 + diff - diff&1 <= 2B so diff - diff&1 <= 2B - 2a + // 2D = a<<1 - diff - diff&1 >= 2n so diff + diff&1 <= 2a - 2n + if (diff - (diff & 1) > 2 * (B - a)) diff = 2 * (B - a) + 1; + if (diff + (diff & 1) > 2 * (a - n)) diff = 2 * (a - n); + } else if (B <= a && a <= n) { + diff = (4 * B - 3 * n - a - 6) / 12; + // 2C = a<<1 + diff + diff&1 >= 2B so diff + diff&1 >= 2B - 2a + // 2D = a<<1 - diff + diff&1 <= 2n so diff - diff&1 >= 2a - 2n + if (diff + (diff & 1) < 2 * (B - a)) diff = 2 * (B - a) - 1; + if (diff - (diff & 1) < 2 * (a - n)) diff = 2 * (a - n); + } + return diff; +} + +void InvHSqueeze(Image &input, int c, int rc, ThreadPool *pool); + +void InvVSqueeze(Image &input, int c, int rc, ThreadPool *pool); + +void DefaultSqueezeParameters(std::vector *parameters, + const Image &image); + +Status CheckMetaSqueezeParams(const SqueezeParams ¶meter, int num_channels); + +Status MetaSqueeze(Image &image, std::vector *parameters); + +Status InvSqueeze(Image &input, std::vector parameters, + ThreadPool *pool); + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/transform.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/transform.cc new file mode 100644 index 0000000000..e63013a38c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/transform.cc @@ -0,0 +1,92 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/modular/transform/transform.h" + +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/modular_image.h" +#include "lib/jxl/modular/transform/palette.h" +#include "lib/jxl/modular/transform/rct.h" +#include "lib/jxl/modular/transform/squeeze.h" + +namespace jxl { + +SqueezeParams::SqueezeParams() { Bundle::Init(this); } +Transform::Transform(TransformId id) { + Bundle::Init(this); + this->id = id; +} + +Status Transform::Inverse(Image &input, const weighted::Header &wp_header, + ThreadPool *pool) { + switch (id) { + case TransformId::kRCT: + return InvRCT(input, begin_c, rct_type); + case TransformId::kSqueeze: + return InvSqueeze(input, squeezes, pool); + case TransformId::kPalette: + return InvPalette(input, begin_c, nb_colors, nb_deltas, predictor, + wp_header, pool); + default: + return JXL_FAILURE("Unknown transformation (ID=%u)", + static_cast(id)); + } +} + +Status Transform::MetaApply(Image &input) { + switch (id) { + case TransformId::kRCT: + JXL_DEBUG_V(2, "Transform: kRCT, rct_type=%" PRIu32, rct_type); + return CheckEqualChannels(input, begin_c, begin_c + 2); + case TransformId::kSqueeze: + JXL_DEBUG_V(2, "Transform: kSqueeze:"); +#if JXL_DEBUG_V_LEVEL >= 2 + { + auto squeezes_copy = squeezes; + if (squeezes_copy.empty()) { + DefaultSqueezeParameters(&squeezes_copy, input); + } + for (const auto ¶ms : squeezes_copy) { + JXL_DEBUG_V( + 2, + " squeeze params: horizontal=%d, in_place=%d, begin_c=%" PRIu32 + ", num_c=%" PRIu32, + params.horizontal, params.in_place, params.begin_c, params.num_c); + } + } +#endif + return MetaSqueeze(input, &squeezes); + case TransformId::kPalette: + JXL_DEBUG_V(2, + "Transform: kPalette, begin_c=%" PRIu32 ", num_c=%" PRIu32 + ", nb_colors=%" PRIu32 ", nb_deltas=%" PRIu32, + begin_c, num_c, nb_colors, nb_deltas); + return MetaPalette(input, begin_c, begin_c + num_c - 1, nb_colors, + nb_deltas, lossy_palette); + default: + return JXL_FAILURE("Unknown transformation (ID=%u)", + static_cast(id)); + } +} + 
+Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2) { + if (c1 > image.channel.size() || c2 >= image.channel.size() || c2 < c1) { + return JXL_FAILURE("Invalid channel range"); + } + if (c1 < image.nb_meta_channels && c2 >= image.nb_meta_channels) { + return JXL_FAILURE("Invalid: transforming mix of meta and nonmeta"); + } + const auto &ch1 = image.channel[c1]; + for (size_t c = c1 + 1; c <= c2; c++) { + const auto &ch2 = image.channel[c]; + if (ch1.w != ch2.w || ch1.h != ch2.h || ch1.hshift != ch2.hshift || + ch1.vshift != ch2.vshift) { + return false; + } + } + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/transform.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/transform.h new file mode 100644 index 0000000000..0562d2fe3e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular/transform/transform.h @@ -0,0 +1,148 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_ +#define LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_ + +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/modular/encoding/context_predict.h" +#include "lib/jxl/modular/options.h" + +namespace jxl { + +enum class TransformId : uint32_t { + // G, R-G, B-G and variants (including YCoCg). + kRCT = 0, + + // Color palette. Parameters are: [begin_c] [end_c] [nb_colors] + kPalette = 1, + + // Squeezing (Haar-style) + kSqueeze = 2, + + // Invalid for now. 
+ kInvalid = 3, +}; + +struct SqueezeParams : public Fields { + const char *Name() const override { return "SqueezeParams"; } + bool horizontal; + bool in_place; + uint32_t begin_c; + uint32_t num_c; + SqueezeParams(); + Status VisitFields(Visitor *JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &horizontal)); + JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &in_place)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(3), BitsOffset(6, 8), + BitsOffset(10, 72), + BitsOffset(13, 1096), 0, &begin_c)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(2), Val(3), BitsOffset(4, 4), 2, &num_c)); + return true; + } +}; + +class Transform : public Fields { + public: + TransformId id; + // for Palette and RCT. + uint32_t begin_c; + // for RCT. 42 possible values starting from 0. + uint32_t rct_type; + // Only for Palette and NearLossless. + uint32_t num_c; + // Only for Palette. + uint32_t nb_colors; + uint32_t nb_deltas; + // for Squeeze. Default squeeze if empty. + std::vector squeezes; + // for NearLossless, not serialized. + int max_delta_error; + // Serialized for Palette. + Predictor predictor; + // for Palette, not serialized. + bool ordered_palette = true; + bool lossy_palette = false; + + explicit Transform(TransformId id); + // default constructor for bundles. 
+ Transform() : Transform(TransformId::kInvalid) {} + + Status VisitFields(Visitor *JXL_RESTRICT visitor) override { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + Val((uint32_t)TransformId::kRCT), Val((uint32_t)TransformId::kPalette), + Val((uint32_t)TransformId::kSqueeze), + Val((uint32_t)TransformId::kInvalid), (uint32_t)TransformId::kRCT, + reinterpret_cast(&id))); + if (id == TransformId::kInvalid) { + return JXL_FAILURE("Invalid transform ID"); + } + if (visitor->Conditional(id == TransformId::kRCT || + id == TransformId::kPalette)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Bits(3), BitsOffset(6, 8), BitsOffset(10, 72), + BitsOffset(13, 1096), 0, &begin_c)); + } + if (visitor->Conditional(id == TransformId::kRCT)) { + // 0-41, default YCoCg. + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(6), Bits(2), BitsOffset(4, 2), + BitsOffset(6, 10), 6, &rct_type)); + if (rct_type >= 42) { + return JXL_FAILURE("Invalid transform RCT type"); + } + } + if (visitor->Conditional(id == TransformId::kPalette)) { + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(1), Val(3), Val(4), BitsOffset(13, 1), 3, &num_c)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + BitsOffset(8, 0), BitsOffset(10, 256), BitsOffset(12, 1280), + BitsOffset(16, 5376), 256, &nb_colors)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), BitsOffset(8, 1), BitsOffset(10, 257), + BitsOffset(16, 1281), 0, &nb_deltas)); + JXL_QUIET_RETURN_IF_ERROR( + visitor->Bits(4, (uint32_t)Predictor::Zero, + reinterpret_cast(&predictor))); + if (predictor >= Predictor::Best) { + return JXL_FAILURE("Invalid predictor"); + } + } + + if (visitor->Conditional(id == TransformId::kSqueeze)) { + uint32_t num_squeezes = static_cast(squeezes.size()); + JXL_QUIET_RETURN_IF_ERROR( + visitor->U32(Val(0), BitsOffset(4, 1), BitsOffset(6, 9), + BitsOffset(8, 41), 0, &num_squeezes)); + if (visitor->IsReading()) squeezes.resize(num_squeezes); + for (size_t i = 0; i < num_squeezes; i++) { + 
JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&squeezes[i])); + } + } + return true; + } + + const char *Name() const override { return "Transform"; } + + Status Inverse(Image &input, const weighted::Header &wp_header, + ThreadPool *pool = nullptr); + Status MetaApply(Image &input); +}; + +Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2); + +static inline pixel_type PixelAdd(pixel_type a, pixel_type b) { + return static_cast(static_cast(a) + + static_cast(b)); +} + +} // namespace jxl + +#endif // LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular_test.cc new file mode 100644 index 0000000000..a528998971 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/modular_test.cc @@ -0,0 +1,171 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/dec_file.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/modular/encoding/enc_encoding.h" +#include "lib/jxl/modular/encoding/encoding.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" + +namespace jxl { +namespace { +using test::Roundtrip; + +void TestLosslessGroups(size_t group_size_shift) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CompressParams cparams; + cparams.modular_mode = true; + cparams.modular_group_size_shift = group_size_shift; + cparams.color_transform = jxl::ColorTransform::kNone; + DecompressParams dparams; + + CodecInOut io_out; + size_t compressed_size; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 4, io.ysize() / 4); + + compressed_size = Roundtrip(&io, cparams, dparams, pool, &io_out); + EXPECT_LE(compressed_size, 280000); + EXPECT_LE(ButteraugliDistance(io, io_out, cparams.ba_params, + /*distmap=*/nullptr, pool), + 0.0); +} + +TEST(ModularTest, RoundtripLosslessGroups128) { TestLosslessGroups(0); } + +TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups512)) { + TestLosslessGroups(2); +} + +TEST(ModularTest, 
JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups1024)) { + TestLosslessGroups(3); +} + +TEST(ModularTest, RoundtripLossy) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CompressParams cparams; + cparams.modular_mode = true; + cparams.quality_pair = {80.0f, 80.0f}; + DecompressParams dparams; + + CodecInOut io_out; + size_t compressed_size; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + compressed_size = Roundtrip(&io, cparams, dparams, pool, &io_out); + EXPECT_LE(compressed_size, 40000); + cparams.ba_params.intensity_target = 80.0f; + EXPECT_LE(ButteraugliDistance(io, io_out, cparams.ba_params, + /*distmap=*/nullptr, pool), + 3.0); +} + +TEST(ModularTest, RoundtripLossy16) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("raw.pixls/DJI-FC6310-16bit_709_v4_krita.png"); + CompressParams cparams; + cparams.modular_mode = true; + cparams.quality_pair = {80.0f, 80.0f}; + DecompressParams dparams; + + CodecInOut io_out; + size_t compressed_size; + + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + JXL_CHECK(io.TransformTo(ColorEncoding::SRGB(), pool)); + io.metadata.m.color_encoding = ColorEncoding::SRGB(); + + compressed_size = Roundtrip(&io, cparams, dparams, pool, &io_out); + EXPECT_LE(compressed_size, 400); + cparams.ba_params.intensity_target = 80.0f; + EXPECT_LE(ButteraugliDistance(io, io_out, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.5); +} + +TEST(ModularTest, RoundtripExtraProperties) { + constexpr size_t kSize = 250; + Image image(kSize, kSize, /*bitdepth=*/8, 3); + ModularOptions options; + options.max_properties = 4; + options.predictor = Predictor::Zero; + std::mt19937 rng(0); + std::uniform_int_distribution<> dist(0, 8); + for (size_t y = 0; y < kSize; y++) { + for (size_t x = 0; x < kSize; x++) { + image.channel[0].plane.Row(y)[x] = image.channel[2].plane.Row(y)[x] = + dist(rng); + } + } + 
ZeroFillImage(&image.channel[1].plane); + BitWriter writer; + ASSERT_TRUE(ModularGenericCompress(image, options, &writer)); + writer.ZeroPadToByte(); + Image decoded(kSize, kSize, /*bitdepth=*/8, image.channel.size()); + for (size_t i = 0; i < image.channel.size(); i++) { + const Channel& ch = image.channel[i]; + decoded.channel[i] = Channel(ch.w, ch.h, ch.hshift, ch.vshift); + } + Status status = true; + { + BitReader reader(writer.GetSpan()); + BitReaderScopedCloser closer(&reader, &status); + ASSERT_TRUE(ModularGenericDecompress(&reader, decoded, /*header=*/nullptr, + /*group_id=*/0, &options)); + } + ASSERT_TRUE(status); + ASSERT_EQ(image.channel.size(), decoded.channel.size()); + for (size_t c = 0; c < image.channel.size(); c++) { + for (size_t y = 0; y < image.channel[c].plane.ysize(); y++) { + for (size_t x = 0; x < image.channel[c].plane.xsize(); x++) { + EXPECT_EQ(image.channel[c].plane.Row(y)[x], + decoded.channel[c].plane.Row(y)[x]) + << "c = " << c << ", x = " << x << ", y = " << y; + } + } + } +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/noise.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/noise.h new file mode 100644 index 0000000000..329b325f1c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/noise.h @@ -0,0 +1,60 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_NOISE_H_ +#define LIB_JXL_NOISE_H_ + +// Noise parameters shared by encoder/decoder. 
+ +#include + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +const float kNoisePrecision = 1 << 10; + +struct NoiseParams { + // LUT index is an intensity of pixel / mean intensity of patch + static constexpr size_t kNumNoisePoints = 8; + float lut[kNumNoisePoints]; + + void Clear() { + for (float& i : lut) i = 0; + } + bool HasAny() const { + for (float i : lut) { + if (std::abs(i) > 1e-3f) return true; + } + return false; + } +}; + +static inline std::pair IndexAndFrac(float x) { + constexpr size_t kScaleNumerator = NoiseParams::kNumNoisePoints - 2; + // TODO: instead of 1, this should be a proper Y range. + constexpr float kScale = kScaleNumerator / 1; + float scaled_x = std::max(0.f, x * kScale); + float floor_x; + float frac_x = std::modf(scaled_x, &floor_x); + if (JXL_UNLIKELY(scaled_x >= kScaleNumerator)) { + floor_x = kScaleNumerator - 1; + frac_x = 1; + } + return std::make_pair(static_cast(static_cast(floor_x)), frac_x); +} + +struct NoiseLevel { + float noise_level; + float intensity; +}; + +} // namespace jxl + +#endif // LIB_JXL_NOISE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/noise_distributions.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/noise_distributions.h new file mode 100644 index 0000000000..65a61cc6ef --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/noise_distributions.h @@ -0,0 +1,138 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_NOISE_DISTRIBUTIONS_H_ +#define LIB_JXL_NOISE_DISTRIBUTIONS_H_ + +// Noise distributions for testing partial_derivatives and robust_statistics. 
+ +#include +#include + +#include // distributions +#include + +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// Unmodified input +struct NoiseNone { + std::string Name() const { return "None"; } + + template + float operator()(const float in, Random* rng) const { + return in; + } +}; + +// Salt+pepper +class NoiseImpulse { + public: + explicit NoiseImpulse(const uint32_t threshold) : threshold_(threshold) {} + std::string Name() const { return "Impulse" + ToString(threshold_); } + + // Sets pixels to 0 if rand < threshold or 1 if rand > ~threshold. + template + float operator()(const float in, Random* rng) const { + const uint32_t rand = (*rng)(); + float out = 0.0f; + if (rand > ~threshold_) { + out = 1.0f; + } + if (rand > threshold_) { + out = in; + } + return out; + } + + private: + const uint32_t threshold_; +}; + +class NoiseUniform { + public: + NoiseUniform(const float min, const float max_exclusive) + : dist_(min, max_exclusive) {} + std::string Name() const { return "Uniform" + ToString(dist_.b()); } + + template + float operator()(const float in, Random* rng) const { + return in + dist_(*rng); + } + + private: + mutable std::uniform_real_distribution dist_; +}; + +// Additive, zero-mean Gaussian. +class NoiseGaussian { + public: + explicit NoiseGaussian(const float stddev) : dist_(0.0f, stddev) {} + std::string Name() const { return "Gaussian" + ToString(dist_.stddev()); } + + template + float operator()(const float in, Random* rng) const { + return in + dist_(*rng); + } + + private: + mutable std::normal_distribution dist_; +}; + +// Integer noise is scaled by 1E-3. 
+class NoisePoisson { + public: + explicit NoisePoisson(const double mean) : dist_(mean) {} + std::string Name() const { return "Poisson" + ToString(dist_.mean()); } + + template + float operator()(const float in, Random* rng) const { + return in + dist_(*rng) * 1E-3f; + } + + private: + mutable std::poisson_distribution dist_; +}; + +// Returns the result of applying the randomized "noise" function to each pixel. +template +ImageF AddNoise(const ImageF& in, const NoiseType& noise, Random* rng) { + const size_t xsize = in.xsize(); + const size_t ysize = in.ysize(); + ImageF out(xsize, ysize); + for (size_t y = 0; y < ysize; ++y) { + const float* JXL_RESTRICT in_row = in.ConstRow(y); + float* JXL_RESTRICT out_row = out.Row(y); + for (size_t x = 0; x < xsize; ++x) { + out_row[x] = noise(in_row[x], rng); + } + } + return out; +} + +template +Image3F AddNoise(const Image3F& in, const NoiseType& noise, Random* rng) { + const size_t xsize = in.xsize(); + const size_t ysize = in.ysize(); + Image3F out(xsize, ysize); + // noise_estimator_test requires this loop order. + for (size_t c = 0; c < 3; ++c) { + for (size_t y = 0; y < ysize; ++y) { + const float* JXL_RESTRICT in_row = in.ConstPlaneRow(c, y); + float* JXL_RESTRICT out_row = out.PlaneRow(c, y); + + for (size_t x = 0; x < xsize; ++x) { + out_row[x] = noise(in_row[x], rng); + } + } + } + return out; +} + +} // namespace jxl + +#endif // LIB_JXL_NOISE_DISTRIBUTIONS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_image_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_image_test.cc new file mode 100644 index 0000000000..d79c7cf479 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_image_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/image.h" +#include "lib/jxl/linalg.h" +#include "lib/jxl/opsin_params.h" + +namespace jxl { +namespace { + +class OpsinImageTargetTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(OpsinImageTargetTest); + +TEST_P(OpsinImageTargetTest, MaxCubeRootError) { TestCubeRoot(); } + +// Convert a single linear sRGB color to xyb, using the exact image conversion +// procedure that jpeg xl uses. +void LinearSrgbToOpsin(float rgb_r, float rgb_g, float rgb_b, + float* JXL_RESTRICT xyb_x, float* JXL_RESTRICT xyb_y, + float* JXL_RESTRICT xyb_b) { + Image3F linear(1, 1); + linear.PlaneRow(0, 0)[0] = rgb_r; + linear.PlaneRow(1, 0)[0] = rgb_g; + linear.PlaneRow(2, 0)[0] = rgb_b; + + ImageMetadata metadata; + metadata.SetFloat32Samples(); + metadata.color_encoding = ColorEncoding::LinearSRGB(); + ImageBundle ib(&metadata); + ib.SetFromImage(std::move(linear), metadata.color_encoding); + Image3F opsin(1, 1); + (void)ToXYB(ib, /*pool=*/nullptr, &opsin); + + *xyb_x = opsin.PlaneRow(0, 0)[0]; + *xyb_y = opsin.PlaneRow(1, 0)[0]; + *xyb_b = opsin.PlaneRow(2, 0)[0]; +} + +// Convert a single XYB color to linear sRGB, using the exact image conversion +// procedure that jpeg xl uses. 
+void OpsinToLinearSrgb(float xyb_x, float xyb_y, float xyb_b, + float* JXL_RESTRICT rgb_r, float* JXL_RESTRICT rgb_g, + float* JXL_RESTRICT rgb_b) { + Image3F opsin(1, 1); + opsin.PlaneRow(0, 0)[0] = xyb_x; + opsin.PlaneRow(1, 0)[0] = xyb_y; + opsin.PlaneRow(2, 0)[0] = xyb_b; + Image3F linear(1, 1); + OpsinParams opsin_params; + opsin_params.Init(/*intensity_target=*/255.0f); + OpsinToLinear(opsin, Rect(opsin), nullptr, &linear, opsin_params); + *rgb_r = linear.PlaneRow(0, 0)[0]; + *rgb_g = linear.PlaneRow(1, 0)[0]; + *rgb_b = linear.PlaneRow(2, 0)[0]; +} + +void OpsinRoundtripTestRGB(float r, float g, float b) { + float xyb_x, xyb_y, xyb_b; + LinearSrgbToOpsin(r, g, b, &xyb_x, &xyb_y, &xyb_b); + float r2, g2, b2; + OpsinToLinearSrgb(xyb_x, xyb_y, xyb_b, &r2, &g2, &b2); + EXPECT_NEAR(r, r2, 1e-3); + EXPECT_NEAR(g, g2, 1e-3); + EXPECT_NEAR(b, b2, 1e-3); +} + +TEST(OpsinImageTest, VerifyOpsinAbsorbanceInverseMatrix) { + float matrix[9]; // writable copy + for (int i = 0; i < 9; i++) { + matrix[i] = GetOpsinAbsorbanceInverseMatrix()[i]; + } + EXPECT_TRUE(Inv3x3Matrix(matrix)); + for (int i = 0; i < 9; i++) { + EXPECT_NEAR(matrix[i], kOpsinAbsorbanceMatrix[i], 1e-6); + } +} + +TEST(OpsinImageTest, OpsinRoundtrip) { + OpsinRoundtripTestRGB(0, 0, 0); + OpsinRoundtripTestRGB(1. / 255, 1. / 255, 1. / 255); + OpsinRoundtripTestRGB(128. / 255, 128. / 255, 128. / 255); + OpsinRoundtripTestRGB(1, 1, 1); + + OpsinRoundtripTestRGB(0, 0, 1. / 255); + OpsinRoundtripTestRGB(0, 0, 128. / 255); + OpsinRoundtripTestRGB(0, 0, 1); + + OpsinRoundtripTestRGB(0, 1. / 255, 0); + OpsinRoundtripTestRGB(0, 128. / 255, 0); + OpsinRoundtripTestRGB(0, 1, 0); + + OpsinRoundtripTestRGB(1. / 255, 0, 0); + OpsinRoundtripTestRGB(128. / 255, 0, 0); + OpsinRoundtripTestRGB(1, 0, 0); +} + +TEST(OpsinImageTest, VerifyZero) { + // Test that black color (zero energy) is 0,0,0 in xyb. 
+ float x, y, b; + LinearSrgbToOpsin(0, 0, 0, &x, &y, &b); + EXPECT_NEAR(0, x, 1e-9); + EXPECT_NEAR(0, y, 1e-7); + EXPECT_NEAR(0, b, 1e-7); +} + +TEST(OpsinImageTest, VerifyGray) { + // Test that grayscale colors have a fixed y/b ratio and x==0. + for (size_t i = 1; i < 255; i++) { + float x, y, b; + LinearSrgbToOpsin(i / 255., i / 255., i / 255., &x, &y, &b); + EXPECT_NEAR(0, x, 1e-6); + EXPECT_NEAR(kYToBRatio, b / y, 3e-5); + } +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_inverse_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_inverse_test.cc new file mode 100644 index 0000000000..b7c1964259 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_inverse_test.cc @@ -0,0 +1,55 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "gtest/gtest.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/color_management.h" +#include "lib/jxl/dec_xyb.h" +#include "lib/jxl/enc_xyb.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_test_utils.h" + +namespace jxl { +namespace { + +TEST(OpsinInverseTest, LinearInverseInverts) { + Image3F linear(128, 128); + RandomFillImage(&linear, 1.0f); + + CodecInOut io; + io.metadata.m.SetFloat32Samples(); + io.metadata.m.color_encoding = ColorEncoding::LinearSRGB(); + io.SetFromImage(CopyImage(linear), io.metadata.m.color_encoding); + ThreadPool* null_pool = nullptr; + Image3F opsin(io.xsize(), io.ysize()); + (void)ToXYB(io.Main(), null_pool, &opsin); + + OpsinParams opsin_params; + opsin_params.Init(/*intensity_target=*/255.0f); + OpsinToLinearInplace(&opsin, /*pool=*/nullptr, opsin_params); + + VerifyRelativeError(linear, opsin, 3E-3, 2E-4); +} + +TEST(OpsinInverseTest, YcbCrInverts) { + 
Image3F rgb(128, 128); + RandomFillImage(&rgb, 1.0f); + + ThreadPool* null_pool = nullptr; + Image3F ycbcr(rgb.xsize(), rgb.ysize()); + RgbToYcbcr(rgb.Plane(0), rgb.Plane(1), rgb.Plane(2), &ycbcr.Plane(1), + &ycbcr.Plane(0), &ycbcr.Plane(2), null_pool); + + Image3F rgb2(rgb.xsize(), rgb.ysize()); + YcbcrToRgb(ycbcr, &rgb2, Rect(rgb)); + + VerifyRelativeError(rgb, rgb2, 4E-5, 4E-7); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc new file mode 100644 index 0000000000..f80a18af8c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.cc @@ -0,0 +1,44 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/opsin_params.h" + +#include + +#include "lib/jxl/linalg.h" + +namespace jxl { + +#define INVERSE_OPSIN_FROM_SPEC 1 + +const float* GetOpsinAbsorbanceInverseMatrix() { +#if INVERSE_OPSIN_FROM_SPEC + return DefaultInverseOpsinAbsorbanceMatrix(); +#else // INVERSE_OPSIN_FROM_SPEC + // Compute the inverse opsin matrix from the forward matrix. Less precise + // than taking the values from the specification, but must be used if the + // forward transform is changed and the spec will require updating. 
+ static const float* const kInverse = [] { + static float inverse[9]; + for (int i = 0; i < 9; i++) { + inverse[i] = kOpsinAbsorbanceMatrix[i]; + } + Inv3x3Matrix(inverse); + return inverse; + }(); + return kInverse; +#endif // INVERSE_OPSIN_FROM_SPEC +} + +void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse, + float* JXL_RESTRICT simd_inverse, + float intensity_target) { + for (size_t i = 0; i < 9; ++i) { + simd_inverse[4 * i] = simd_inverse[4 * i + 1] = simd_inverse[4 * i + 2] = + simd_inverse[4 * i + 3] = inverse[i] * (255.0f / intensity_target); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.h new file mode 100644 index 0000000000..e8e2e4331e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/opsin_params.h @@ -0,0 +1,74 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_OPSIN_PARAMS_H_ +#define LIB_JXL_OPSIN_PARAMS_H_ + +// Constants that define the XYB color space. + +#include + +#include + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +// Parameters for opsin absorbance. 
+static const float kM02 = 0.078f; +static const float kM00 = 0.30f; +static const float kM01 = 1.0f - kM02 - kM00; + +static const float kM12 = 0.078f; +static const float kM10 = 0.23f; +static const float kM11 = 1.0f - kM12 - kM10; + +static const float kM20 = 0.24342268924547819f; +static const float kM21 = 0.20476744424496821f; +static const float kM22 = 1.0f - kM20 - kM21; + +static const float kBScale = 1.0f; +static const float kYToBRatio = 1.0f; // works better with 0.50017729543783418 +static const float kBToYRatio = 1.0f / kYToBRatio; + +static const float kB0 = 0.0037930732552754493f; +static const float kB1 = kB0; +static const float kB2 = kB0; + +// Opsin absorbance matrix is now frozen. +static const float kOpsinAbsorbanceMatrix[9] = { + kM00, kM01, kM02, kM10, kM11, kM12, kM20, kM21, kM22, +}; + +// Must be the inverse matrix of kOpsinAbsorbanceMatrix and match the spec. +static inline const float* DefaultInverseOpsinAbsorbanceMatrix() { + static float kDefaultInverseOpsinAbsorbanceMatrix[9] = { + 11.031566901960783f, -9.866943921568629f, -0.16462299647058826f, + -3.254147380392157f, 4.418770392156863f, -0.16462299647058826f, + -3.6588512862745097f, 2.7129230470588235f, 1.9459282392156863f}; + return kDefaultInverseOpsinAbsorbanceMatrix; +} + +// Returns 3x3 row-major matrix inverse of kOpsinAbsorbanceMatrix. +// opsin_image_test verifies this is actually the inverse. 
+const float* GetOpsinAbsorbanceInverseMatrix(); + +void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse, + float* JXL_RESTRICT simd_inverse, + float intensity_target); + +static const float kOpsinAbsorbanceBias[3] = { + kB0, + kB1, + kB2, +}; + +static const float kNegOpsinAbsorbanceBiasRGB[4] = { + -kOpsinAbsorbanceBias[0], -kOpsinAbsorbanceBias[1], + -kOpsinAbsorbanceBias[2], 1.0f}; + +} // namespace jxl + +#endif // LIB_JXL_OPSIN_PARAMS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize.cc new file mode 100644 index 0000000000..0816596365 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize.cc @@ -0,0 +1,163 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/optimize.h" + +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { + +namespace optimize { + +namespace { + +// simplex vector must be sorted by first element of its elements +std::vector Midpoint(const std::vector>& simplex) { + JXL_CHECK(!simplex.empty()); + JXL_CHECK(simplex.size() == simplex[0].size()); + int dim = simplex.size() - 1; + std::vector result(dim + 1, 0); + for (int i = 0; i < dim; i++) { + for (int k = 0; k < dim; k++) { + result[i + 1] += simplex[k][i + 1]; + } + result[i + 1] /= dim; + } + return result; +} + +// first element ignored +std::vector Subtract(const std::vector& a, + const std::vector& b) { + JXL_CHECK(a.size() == b.size()); + std::vector result(a.size()); + result[0] = 0; + for (size_t i = 1; i < result.size(); i++) { + result[i] = a[i] - b[i]; + } + return result; +} + +// first element ignored +std::vector Add(const std::vector& a, + const std::vector& b) { + JXL_CHECK(a.size() == b.size()); + std::vector result(a.size()); + result[0] = 0; + for (size_t i = 1; i < result.size(); i++) { + result[i] = a[i] + b[i]; 
+ } + return result; +} + +// first element ignored +std::vector Average(const std::vector& a, + const std::vector& b) { + JXL_CHECK(a.size() == b.size()); + std::vector result(a.size()); + result[0] = 0; + for (size_t i = 1; i < result.size(); i++) { + result[i] = 0.5 * (a[i] + b[i]); + } + return result; +} + +// vec: [0] will contain the objective function, [1:] will +// contain the vector position for the objective function. +// fun: the function evaluates the value. +void Eval(std::vector* vec, + const std::function&)>& fun) { + std::vector args(vec->begin() + 1, vec->end()); + (*vec)[0] = fun(args); +} + +void Sort(std::vector>* simplex) { + std::sort(simplex->begin(), simplex->end()); +} + +// Main iteration step of Nelder-Mead like optimization. +void Reflect(std::vector>* simplex, + const std::function&)>& fun) { + Sort(simplex); + const std::vector& last = simplex->back(); + std::vector mid = Midpoint(*simplex); + std::vector diff = Subtract(mid, last); + std::vector mirrored = Add(mid, diff); + Eval(&mirrored, fun); + if (mirrored[0] > (*simplex)[simplex->size() - 2][0]) { + // Still the worst, shrink towards the best. + std::vector shrinking = Average(simplex->back(), (*simplex)[0]); + Eval(&shrinking, fun); + simplex->back() = shrinking; + } else if (mirrored[0] < (*simplex)[0][0]) { + // new best + std::vector even_further = Add(mirrored, diff); + Eval(&even_further, fun); + if (even_further[0] < mirrored[0]) { + mirrored = even_further; + } + simplex->back() = mirrored; + } else { + // not a best, not a worst point + simplex->back() = mirrored; + } +} + +// Initialize the simplex at origin. 
+std::vector> InitialSimplex( + int dim, double amount, const std::vector& init, + const std::function&)>& fun) { + std::vector best(1 + dim, 0); + std::copy(init.begin(), init.end(), best.begin() + 1); + Eval(&best, fun); + std::vector> result{best}; + for (int i = 0; i < dim; i++) { + best = result[0]; + best[i + 1] += amount; + Eval(&best, fun); + result.push_back(best); + Sort(&result); + } + return result; +} + +// For comparing the same with the python tool +/*void RunSimplexExternal( + int dim, double amount, int max_iterations, + const std::function&))>& fun) { + vector vars; + for (int i = 0; i < dim; i++) { + vars.push_back(atof(getenv(StrCat("VAR", i).c_str()))); + } + double result = fun(vars); + std::cout << "Result=" << result; +}*/ + +} // namespace + +std::vector RunSimplex( + int dim, double amount, int max_iterations, const std::vector& init, + const std::function&)>& fun) { + std::vector> simplex = + InitialSimplex(dim, amount, init, fun); + for (int i = 0; i < max_iterations; i++) { + Sort(&simplex); + Reflect(&simplex, fun); + } + return simplex[0]; +} + +std::vector RunSimplex( + int dim, double amount, int max_iterations, + const std::function&)>& fun) { + std::vector init(dim, 0.0); + return RunSimplex(dim, amount, max_iterations, init, fun); +} + +} // namespace optimize + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize.h new file mode 100644 index 0000000000..0a60198214 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize.h @@ -0,0 +1,218 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Utility functions for optimizing multi-dimensional nonlinear functions. 
+ +#ifndef LIB_JXL_OPTIMIZE_H_ +#define LIB_JXL_OPTIMIZE_H_ + +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/status.h" + +namespace jxl { +namespace optimize { + +// An array type of numeric values that supports math operations with operator-, +// operator+, etc. +template +class Array { + public: + Array() = default; + explicit Array(T v) { + for (size_t i = 0; i < N; i++) v_[i] = v; + } + + size_t size() const { return N; } + + T& operator[](size_t index) { + JXL_DASSERT(index < N); + return v_[index]; + } + T operator[](size_t index) const { + JXL_DASSERT(index < N); + return v_[index]; + } + + private: + // The values used by this Array. + T v_[N]; +}; + +template +Array operator+(const Array& x, const Array& y) { + Array z; + for (size_t i = 0; i < N; ++i) { + z[i] = x[i] + y[i]; + } + return z; +} + +template +Array operator-(const Array& x, const Array& y) { + Array z; + for (size_t i = 0; i < N; ++i) { + z[i] = x[i] - y[i]; + } + return z; +} + +template +Array operator*(T v, const Array& x) { + Array y; + for (size_t i = 0; i < N; ++i) { + y[i] = v * x[i]; + } + return y; +} + +template +T operator*(const Array& x, const Array& y) { + T r = 0.0; + for (size_t i = 0; i < N; ++i) { + r += x[i] * y[i]; + } + return r; +} + +// Runs Nelder-Mead like optimization. Runs for max_iterations times, +// fun gets called with a vector of size dim as argument, and returns the score +// based on those parameters (lower is better). Returns a vector of dim+1 +// dimensions, where the first value is the optimal value of the function and +// the rest is the argmin value. Use init to pass an initial guess or where +// the optimal value is. 
+// +// Usage example: +// +// RunSimplex(2, 0.1, 100, [](const vector& v) { +// return (v[0] - 5) * (v[0] - 5) + (v[1] - 7) * (v[1] - 7); +// }); +// +// Returns (0.0, 5, 7) +std::vector RunSimplex( + int dim, double amount, int max_iterations, + const std::function&)>& fun); +std::vector RunSimplex( + int dim, double amount, int max_iterations, const std::vector& init, + const std::function&)>& fun); + +// Implementation of the Scaled Conjugate Gradient method described in the +// following paper: +// Moller, M. "A Scaled Conjugate Gradient Algorithm for Fast Supervised +// Learning", Neural Networks, Vol. 6. pp. 525-533, 1993 +// http://sci2s.ugr.es/keel/pdf/algorithm/articulo/moller1990.pdf +// +// The Function template parameter is a class that has the following method: +// +// // Returns the value of the function at point w and sets *df to be the +// // negative gradient vector of the function at point w. +// double Compute(const optimize::Array& w, +// optimize::Array* df) const; +// +// Returns a vector w, such that |df(w)| < grad_norm_threshold. 
+template +Array OptimizeWithScaledConjugateGradientMethod( + const Function& f, const Array& w0, const T grad_norm_threshold, + size_t max_iters) { + const size_t n = w0.size(); + const T rsq_threshold = grad_norm_threshold * grad_norm_threshold; + const T sigma0 = static_cast(0.0001); + const T l_min = static_cast(1.0e-15); + const T l_max = static_cast(1.0e15); + + Array w = w0; + Array wp; + Array r; + Array rt; + Array e; + Array p; + T psq; + T fp; + T D; + T d; + T m; + T a; + T b; + T s; + T t; + + T fw = f.Compute(w, &r); + T rsq = r * r; + e = r; + p = r; + T l = static_cast(1.0); + bool success = true; + size_t n_success = 0; + size_t k = 0; + + while (k++ < max_iters) { + if (success) { + m = -(p * r); + if (m >= 0) { + p = r; + m = -(p * r); + } + psq = p * p; + s = sigma0 / std::sqrt(psq); + f.Compute(w + (s * p), &rt); + t = (p * (r - rt)) / s; + } + + d = t + l * psq; + if (d <= 0) { + d = l * psq; + l = l - t / psq; + } + + a = -m / d; + wp = w + a * p; + fp = f.Compute(wp, &rt); + + D = 2.0 * (fp - fw) / (a * m); + if (D >= 0.0) { + success = true; + n_success++; + w = wp; + } else { + success = false; + } + + if (success) { + e = r; + r = rt; + rsq = r * r; + fw = fp; + if (rsq <= rsq_threshold) { + break; + } + } + + if (D < 0.25) { + l = std::min(4.0 * l, l_max); + } else if (D > 0.75) { + l = std::max(0.25 * l, l_min); + } + + if ((n_success % n) == 0) { + p = r; + l = 1.0; + } else if (success) { + b = ((e - r) * r) / m; + p = b * p + r; + } + } + + return w; +} + +} // namespace optimize +} // namespace jxl + +#endif // LIB_JXL_OPTIMIZE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize_test.cc new file mode 100644 index 0000000000..5d5b5a8365 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/optimize_test.cc @@ -0,0 +1,109 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/optimize.h" + +#include + +#include "gtest/gtest.h" + +namespace jxl { +namespace optimize { +namespace { + +// The maximum number of iterations for the test. +static const size_t kMaxTestIter = 100000; + +// F(w) = (w - w_min)^2. +struct SimpleQuadraticFunction { + typedef Array ArrayType; + explicit SimpleQuadraticFunction(const ArrayType& w0) : w_min(w0) {} + + double Compute(const ArrayType& w, ArrayType* df) const { + ArrayType dw = w - w_min; + *df = -2.0 * dw; + return dw * dw; + } + + ArrayType w_min; +}; + +// F(alpha, beta, gamma| x,y) = \sum_i(y_i - (alpha x_i ^ gamma + beta))^2. +struct PowerFunction { + explicit PowerFunction(const std::vector& x0, + const std::vector& y0) + : x(x0), y(y0) {} + + typedef Array ArrayType; + double Compute(const ArrayType& w, ArrayType* df) const { + double loss_function = 0; + (*df)[0] = 0; + (*df)[1] = 0; + (*df)[2] = 0; + for (size_t ind = 0; ind < y.size(); ++ind) { + if (x[ind] != 0) { + double l_f = y[ind] - (w[0] * pow(x[ind], w[1]) + w[2]); + (*df)[0] += 2.0 * l_f * pow(x[ind], w[1]); + (*df)[1] += 2.0 * l_f * w[0] * pow(x[ind], w[1]) * log(x[ind]); + (*df)[2] += 2.0 * l_f * 1; + loss_function += l_f * l_f; + } + } + return loss_function; + } + + std::vector x; + std::vector y; +}; + +TEST(OptimizeTest, SimpleQuadraticFunction) { + SimpleQuadraticFunction::ArrayType w_min; + w_min[0] = 1.0; + w_min[1] = 2.0; + SimpleQuadraticFunction f(w_min); + SimpleQuadraticFunction::ArrayType w(0.); + static const double kPrecision = 1e-8; + w = optimize::OptimizeWithScaledConjugateGradientMethod(f, w, kPrecision, + kMaxTestIter); + EXPECT_NEAR(w[0], 1.0, kPrecision); + EXPECT_NEAR(w[1], 2.0, kPrecision); +} + +TEST(OptimizeTest, PowerFunction) { + std::vector x(10); + std::vector y(10); + for (int ind = 0; ind < 10; ++ind) { + x[ind] = 1. * ind; + y[ind] = 2. 
* pow(x[ind], 3) + 5.; + } + PowerFunction f(x, y); + PowerFunction::ArrayType w(0.); + + static const double kPrecision = 0.01; + w = optimize::OptimizeWithScaledConjugateGradientMethod(f, w, kPrecision, + kMaxTestIter); + EXPECT_NEAR(w[0], 2.0, kPrecision); + EXPECT_NEAR(w[1], 3.0, kPrecision); + EXPECT_NEAR(w[2], 5.0, kPrecision); +} + +TEST(OptimizeTest, SimplexOptTest) { + auto f = [](const std::vector& x) -> double { + double t1 = x[0] - 1.0; + double t2 = x[1] + 1.5; + return 2.0 + t1 * t1 + t2 * t2; + }; + auto opt = RunSimplex(2, 0.01, 100, f); + EXPECT_EQ(opt.size(), 3); + + static const double kPrecision = 0.01; + EXPECT_NEAR(opt[0], 2.0, kPrecision); + EXPECT_NEAR(opt[1], 1.0, kPrecision); + EXPECT_NEAR(opt[2], -1.5, kPrecision); +} + +} // namespace +} // namespace optimize +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/padded_bytes_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/padded_bytes_test.cc new file mode 100644 index 0000000000..1f4786fbcf --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/padded_bytes_test.cc @@ -0,0 +1,126 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/padded_bytes.h" + +#include // iota +#include + +#include "gtest/gtest.h" + +namespace jxl { +namespace { + +TEST(PaddedBytesTest, TestNonEmptyFirstByteZero) { + PaddedBytes pb(1); + EXPECT_EQ(0, pb[0]); + // Even after resizing.. + pb.resize(20); + EXPECT_EQ(0, pb[0]); + // And reserving. + pb.reserve(200); + EXPECT_EQ(0, pb[0]); +} + +TEST(PaddedBytesTest, TestEmptyFirstByteZero) { + PaddedBytes pb(0); + // After resizing - new zero is written despite there being nothing to copy. 
+ pb.resize(20); + EXPECT_EQ(0, pb[0]); +} + +TEST(PaddedBytesTest, TestFillWithoutReserve) { + PaddedBytes pb; + for (size_t i = 0; i < 170; ++i) { + pb.push_back(i); + } + EXPECT_EQ(170, pb.size()); + EXPECT_GE(pb.capacity(), 170); +} + +TEST(PaddedBytesTest, TestFillWithExactReserve) { + PaddedBytes pb; + pb.reserve(170); + for (size_t i = 0; i < 170; ++i) { + pb.push_back(i); + } + EXPECT_EQ(170, pb.size()); + EXPECT_EQ(pb.capacity(), 170); +} + +TEST(PaddedBytesTest, TestFillWithMoreReserve) { + PaddedBytes pb; + pb.reserve(171); + for (size_t i = 0; i < 170; ++i) { + pb.push_back(i); + } + EXPECT_EQ(170, pb.size()); + EXPECT_GT(pb.capacity(), 170); +} + +// Can assign() a subset of the valid data. +TEST(PaddedBytesTest, TestAssignFromWithin) { + PaddedBytes pb; + pb.reserve(256); + for (size_t i = 0; i < 256; ++i) { + pb.push_back(i); + } + pb.assign(pb.data() + 64, pb.data() + 192); + EXPECT_EQ(128, pb.size()); + for (size_t i = 0; i < 128; ++i) { + EXPECT_EQ(i + 64, pb[i]); + } +} + +// Can assign() a range with both valid and previously-allocated data. +TEST(PaddedBytesTest, TestAssignReclaim) { + PaddedBytes pb; + pb.reserve(256); + for (size_t i = 0; i < 256; ++i) { + pb.push_back(i); + } + + const uint8_t* mem = pb.data(); + pb.resize(200); + // Just shrank without reallocating + EXPECT_EQ(mem, pb.data()); + EXPECT_EQ(256, pb.capacity()); + + // Reclaim part of initial allocation + pb.assign(pb.data() + 100, pb.data() + 240); + EXPECT_EQ(140, pb.size()); + + for (size_t i = 0; i < 140; ++i) { + EXPECT_EQ(i + 100, pb[i]); + } +} + +// Can assign() smaller and larger ranges outside the current allocation. 
+TEST(PaddedBytesTest, TestAssignOutside) { + PaddedBytes pb; + pb.resize(400); + std::iota(pb.begin(), pb.end(), 1); + + std::vector small(64); + std::iota(small.begin(), small.end(), 500); + + pb.assign(small.data(), small.data() + small.size()); + EXPECT_EQ(64, pb.size()); + for (size_t i = 0; i < 64; ++i) { + EXPECT_EQ((i + 500) & 0xFF, pb[i]); + } + + std::vector large(1000); + std::iota(large.begin(), large.end(), 600); + + pb.assign(large.data(), large.data() + large.size()); + EXPECT_EQ(1000, pb.size()); + for (size_t i = 0; i < 1000; ++i) { + EXPECT_EQ((i + 600) & 0xFF, pb[i]); + } +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc new file mode 100644 index 0000000000..a0cc1983fe --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.cc @@ -0,0 +1,68 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/passes_state.h" + +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/coeff_order.h" +#include "lib/jxl/common.h" + +namespace jxl { + +Status InitializePassesSharedState(const FrameHeader& frame_header, + PassesSharedState* JXL_RESTRICT shared, + bool encoder) { + JXL_ASSERT(frame_header.nonserialized_metadata != nullptr); + shared->frame_header = frame_header; + shared->metadata = frame_header.nonserialized_metadata; + shared->frame_dim = frame_header.ToFrameDimensions(); + shared->image_features.patches.SetPassesSharedState(shared); + + const FrameDimensions& frame_dim = shared->frame_dim; + + shared->ac_strategy = + AcStrategyImage(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + shared->raw_quant_field = + ImageI(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + shared->epf_sharpness = + ImageB(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + shared->cmap = ColorCorrelationMap(frame_dim.xsize, frame_dim.ysize); + + // In the decoder, we allocate coeff orders afterwards, when we know how many + // we will actually need. 
+ shared->coeff_order_size = kCoeffOrderMaxSize; + if (encoder && + shared->coeff_orders.size() < + frame_header.passes.num_passes * kCoeffOrderMaxSize && + frame_header.encoding == FrameEncoding::kVarDCT) { + shared->coeff_orders.resize(frame_header.passes.num_passes * + kCoeffOrderMaxSize); + } + + shared->quant_dc = ImageB(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + if (!(frame_header.flags & FrameHeader::kUseDcFrame) || encoder) { + shared->dc_storage = + Image3F(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + } else { + if (frame_header.dc_level == 4) { + return JXL_FAILURE("Invalid DC level for kUseDcFrame: %u", + frame_header.dc_level); + } + shared->dc = &shared->dc_frames[frame_header.dc_level]; + if (shared->dc->xsize() == 0) { + return JXL_FAILURE( + "kUseDcFrame specified for dc_level %u, but no frame was decoded " + "with level %u", + frame_header.dc_level, frame_header.dc_level + 1); + } + ZeroFillImage(&shared->quant_dc); + } + + shared->dc_storage = Image3F(frame_dim.xsize_blocks, frame_dim.ysize_blocks); + + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.h new file mode 100644 index 0000000000..069d7acdf0 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_state.h @@ -0,0 +1,138 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_PASSES_STATE_H_ +#define LIB_JXL_PASSES_STATE_H_ + +#include "lib/jxl/ac_context.h" +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/loop_filter.h" +#include "lib/jxl/noise.h" +#include "lib/jxl/quant_weights.h" +#include "lib/jxl/quantizer.h" +#include "lib/jxl/splines.h" + +// Structures that hold the (en/de)coder state for a JPEG XL kVarDCT +// (en/de)coder. + +namespace jxl { + +struct ImageFeatures { + NoiseParams noise_params; + PatchDictionary patches; + Splines splines; +}; + +// State common to both encoder and decoder. +// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) +struct PassesSharedState { + PassesSharedState() : frame_header(nullptr) {} + + // Headers and metadata. + const CodecMetadata* metadata; + FrameHeader frame_header; + + FrameDimensions frame_dim; + + // Control fields and parameters. + AcStrategyImage ac_strategy; + + // Dequant matrices + quantizer. + DequantMatrices matrices; + Quantizer quantizer{&matrices}; + ImageI raw_quant_field; + + // Per-block side information for EPF detail preservation. + ImageB epf_sharpness; + + ColorCorrelationMap cmap; + + ImageFeatures image_features; + + // Memory area for storing coefficient orders. + // `coeff_order_size` is the size used by *one* set of coefficient orders (at + // most kMaxCoeffOrderSize). A set of coefficient orders is present for each + // pass. + size_t coeff_order_size = 0; + std::vector coeff_orders; + + // Decoder-side DC and quantized DC. 
+ ImageB quant_dc; + Image3F dc_storage; + const Image3F* JXL_RESTRICT dc = &dc_storage; + + BlockCtxMap block_ctx_map; + + Image3F dc_frames[4]; + + struct { + ImageBundle storage; + // Can either point to `storage`, if this is a frame that is not stored in + // the CodecInOut, or can point to an existing ImageBundle. + // TODO(veluca): pointing to ImageBundles in CodecInOut is not possible for + // now, as they are stored in a vector and thus may be moved. Fix this. + ImageBundle* JXL_RESTRICT frame = &storage; + // ImageBundle doesn't yet have a simple way to state it is in XYB. + bool ib_is_in_xyb = false; + } reference_frames[4] = {}; + + // Number of pre-clustered set of histograms (with the same ctx map), per + // pass. Encoded as num_histograms_ - 1. + size_t num_histograms = 0; + + bool IsGrayscale() const { return metadata->m.color_encoding.IsGray(); } + + Rect GroupRect(size_t group_index) const { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim, + frame_dim.group_dim, frame_dim.group_dim, frame_dim.xsize, + frame_dim.ysize); + return rect; + } + + Rect PaddedGroupRect(size_t group_index) const { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim, + frame_dim.group_dim, frame_dim.group_dim, + frame_dim.xsize_padded, frame_dim.ysize_padded); + return rect; + } + + Rect BlockGroupRect(size_t group_index) const { + const size_t gx = group_index % frame_dim.xsize_groups; + const size_t gy = group_index / frame_dim.xsize_groups; + const Rect rect(gx * (frame_dim.group_dim >> 3), + gy * (frame_dim.group_dim >> 3), frame_dim.group_dim >> 3, + frame_dim.group_dim >> 3, frame_dim.xsize_blocks, + frame_dim.ysize_blocks); + return rect; + } + + Rect DCGroupRect(size_t group_index) const { + const 
size_t gx = group_index % frame_dim.xsize_dc_groups; + const size_t gy = group_index / frame_dim.xsize_dc_groups; + const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim, + frame_dim.group_dim, frame_dim.group_dim, + frame_dim.xsize_blocks, frame_dim.ysize_blocks); + return rect; + } +}; + +// Initialized the state information that is shared between encoder and decoder. +Status InitializePassesSharedState(const FrameHeader& frame_header, + PassesSharedState* JXL_RESTRICT shared, + bool encoder = false); + +} // namespace jxl + +#endif // LIB_JXL_PASSES_STATE_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_test.cc new file mode 100644 index 0000000000..9ed3a6aba5 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/passes_test.cc @@ -0,0 +1,389 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include + +#include +#include + +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_file.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" + +namespace jxl { +namespace { +using test::Roundtrip; + +TEST(PassesTest, RoundtripSmallPasses) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + + CompressParams cparams; + cparams.butteraugli_distance = 1.0; + cparams.progressive_mode = true; + DecompressParams dparams; + + CodecInOut io2; + Roundtrip(&io, cparams, dparams, pool, &io2); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.5); +} + +TEST(PassesTest, RoundtripUnalignedPasses) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 12, io.ysize() / 7); + + CompressParams cparams; + cparams.butteraugli_distance = 2.0; + cparams.progressive_mode = true; + DecompressParams dparams; + + CodecInOut io2; + Roundtrip(&io, cparams, dparams, pool, &io2); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 3.2); +} + +TEST(PassesTest, 
RoundtripMultiGroupPasses) { + ThreadPoolInternal pool(4); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + io.ShrinkTo(600, 1024); // partial X, full Y group + + CompressParams cparams; + DecompressParams dparams; + + cparams.butteraugli_distance = 1.0f; + cparams.progressive_mode = true; + CodecInOut io2; + Roundtrip(&io, cparams, dparams, &pool, &io2); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool), + 1.99f); + + cparams.butteraugli_distance = 2.0f; + CodecInOut io3; + Roundtrip(&io, cparams, dparams, &pool, &io3); + EXPECT_LE(ButteraugliDistance(io, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool), + 3.0f); +} + +TEST(PassesTest, RoundtripLargeFastPasses) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_mode = true; + DecompressParams dparams; + + CodecInOut io2; + Roundtrip(&io, cparams, dparams, &pool, &io2); +} + +// Checks for differing size/distance in two consecutive runs of distance 2, +// which involves additional processing including adaptive reconstruction. +// Failing this may be a sign of race conditions or invalid memory accesses. +TEST(PassesTest, RoundtripProgressiveConsistent) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_mode = true; + cparams.butteraugli_distance = 2.0; + DecompressParams dparams; + + // Try each xsize mod kBlockDim to verify right border handling. 
+ for (size_t xsize = 48; xsize > 40; --xsize) { + io.ShrinkTo(xsize, 15); + + CodecInOut io2; + const size_t size2 = Roundtrip(&io, cparams, dparams, &pool, &io2); + + CodecInOut io3; + const size_t size3 = Roundtrip(&io, cparams, dparams, &pool, &io3); + + // Exact same compressed size. + EXPECT_EQ(size2, size3); + + // Exact same distance. + const float dist2 = ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, &pool); + const float dist3 = ButteraugliDistance(io, io3, cparams.ba_params, + /*distmap=*/nullptr, &pool); + EXPECT_EQ(dist2, dist3); + } +} + +TEST(PassesTest, AllDownsampleFeasible) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_mode = true; + cparams.butteraugli_distance = 1.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, &aux, &pool)); + + EXPECT_LE(compressed.size(), 240000); + float target_butteraugli[9] = {}; + target_butteraugli[1] = 2.5f; + target_butteraugli[2] = 14.5f; + target_butteraugli[4] = 20.0f; + target_butteraugli[8] = 80.0f; + + // The default progressive encoding scheme should make all these downsampling + // factors achievable. + // TODO(veluca): re-enable downsampling 16. 
+ std::vector downsamplings = {1, 2, 4, 8}; //, 16}; + + auto check = [&](uint32_t task, uint32_t /* thread */) -> void { + const size_t downsampling = downsamplings[task]; + DecompressParams dparams; + dparams.max_downsampling = downsampling; + CodecInOut output; + ASSERT_TRUE(DecodeFile(dparams, compressed, &output, nullptr)); + EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling; + EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling; + EXPECT_LE(ButteraugliDistance(io, output, cparams.ba_params, + /*distmap=*/nullptr, nullptr), + target_butteraugli[downsampling]) + << "downsampling: " << downsampling; + }; + pool.Run(0, downsamplings.size(), ThreadPool::SkipInit(), check); +} + +TEST(PassesTest, AllDownsampleFeasibleQProgressive) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.qprogressive_mode = true; + cparams.butteraugli_distance = 1.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, &aux, &pool)); + + EXPECT_LE(compressed.size(), 220000); + + float target_butteraugli[9] = {}; + target_butteraugli[1] = 3.0f; + target_butteraugli[2] = 6.0f; + target_butteraugli[4] = 10.0f; + target_butteraugli[8] = 80.0f; + + // The default progressive encoding scheme should make all these downsampling + // factors achievable. 
+ std::vector downsamplings = {1, 2, 4, 8}; + + auto check = [&](uint32_t task, uint32_t /* thread */) -> void { + const size_t downsampling = downsamplings[task]; + DecompressParams dparams; + dparams.max_downsampling = downsampling; + CodecInOut output; + ASSERT_TRUE(DecodeFile(dparams, compressed, &output, nullptr)); + EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling; + EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling; + EXPECT_LE(ButteraugliDistance(io, output, cparams.ba_params, + /*distmap=*/nullptr, nullptr), + target_butteraugli[downsampling]) + << "downsampling: " << downsampling; + }; + pool.Run(0, downsamplings.size(), ThreadPool::SkipInit(), check); +} + +TEST(PassesTest, ProgressiveDownsample2DegradesCorrectlyGrayscale) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("wesaturate/500px/cvo9xd_keong_macan_grayscale.png"); + CodecInOut io_orig; + ASSERT_TRUE(SetFromBytes(Span(orig), &io_orig, &pool)); + Rect rect(0, 0, io_orig.xsize(), 128); + // need 2 DC groups for the DC frame to actually be progressive. 
+ Image3F large(4242, rect.ysize()); + ZeroFillImage(&large); + CopyImageTo(rect, *io_orig.Main().color(), rect, &large); + CodecInOut io; + io.metadata = io_orig.metadata; + io.SetFromImage(std::move(large), io_orig.Main().c_current()); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_dc = 1; + cparams.responsive = true; + cparams.qprogressive_mode = true; + cparams.butteraugli_distance = 1.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, &aux, &pool)); + + EXPECT_LE(compressed.size(), 10000); + + DecompressParams dparams; + dparams.max_downsampling = 1; + CodecInOut output; + ASSERT_TRUE(DecodeFile(dparams, compressed, &output, nullptr)); + + dparams.max_downsampling = 2; + CodecInOut output_d2; + ASSERT_TRUE(DecodeFile(dparams, compressed, &output_d2, nullptr)); + + // 0 if reading all the passes, ~15 if skipping the 8x pass. + float butteraugli_distance_down2_full = + ButteraugliDistance(output, output_d2, cparams.ba_params, + /*distmap=*/nullptr, nullptr); + + EXPECT_LE(butteraugli_distance_down2_full, 3.2f); + EXPECT_GE(butteraugli_distance_down2_full, 1.0f); +} + +TEST(PassesTest, ProgressiveDownsample2DegradesCorrectly) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io_orig; + ASSERT_TRUE(SetFromBytes(Span(orig), &io_orig, &pool)); + Rect rect(0, 0, io_orig.xsize(), 128); + // need 2 DC groups for the DC frame to actually be progressive. 
+ Image3F large(4242, rect.ysize()); + ZeroFillImage(&large); + CopyImageTo(rect, *io_orig.Main().color(), rect, &large); + CodecInOut io; + io.SetFromImage(std::move(large), io_orig.Main().c_current()); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_dc = 1; + cparams.responsive = true; + cparams.qprogressive_mode = true; + cparams.butteraugli_distance = 1.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, &aux, &pool)); + + EXPECT_LE(compressed.size(), 220000); + + DecompressParams dparams; + dparams.max_downsampling = 1; + CodecInOut output; + ASSERT_TRUE(DecodeFile(dparams, compressed, &output, nullptr)); + + dparams.max_downsampling = 2; + CodecInOut output_d2; + ASSERT_TRUE(DecodeFile(dparams, compressed, &output_d2, nullptr)); + + // 0 if reading all the passes, ~15 if skipping the 8x pass. + float butteraugli_distance_down2_full = + ButteraugliDistance(output, output_d2, cparams.ba_params, + /*distmap=*/nullptr, nullptr); + + EXPECT_LE(butteraugli_distance_down2_full, 3.0f); + EXPECT_GE(butteraugli_distance_down2_full, 1.0f); +} + +TEST(PassesTest, NonProgressiveDCImage) { + ThreadPoolInternal pool(8); + const PaddedBytes orig = + ReadTestData("imagecompression.info/flower_foveon.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + PaddedBytes compressed; + AuxOut aux; + + CompressParams cparams; + cparams.speed_tier = SpeedTier::kSquirrel; + cparams.progressive_mode = false; + cparams.butteraugli_distance = 2.0; + PassesEncoderState enc_state; + ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, &aux, &pool)); + + // Even in non-progressive mode, it should be possible to return a DC-only + // image. 
+ DecompressParams dparams; + dparams.max_downsampling = 100; + CodecInOut output; + ASSERT_TRUE(DecodeFile(dparams, compressed, &output, &pool)); + EXPECT_EQ(output.xsize(), io.xsize()); + EXPECT_EQ(output.ysize(), io.ysize()); +} + +TEST(PassesTest, RoundtripSmallNoGaborishPasses) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + + CompressParams cparams; + cparams.gaborish = Override::kOff; + cparams.butteraugli_distance = 1.0; + cparams.progressive_mode = true; + DecompressParams dparams; + + CodecInOut io2; + Roundtrip(&io, cparams, dparams, pool, &io2); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.7); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/patch_dictionary_internal.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/patch_dictionary_internal.h new file mode 100644 index 0000000000..e4172f6db6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/patch_dictionary_internal.h @@ -0,0 +1,31 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_ +#define LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_ + +#include "lib/jxl/dec_patch_dictionary.h" +#include "lib/jxl/passes_state.h" // for PassesSharedState + +namespace jxl { + +// Context numbers as specified in Section C.4.5, Listing C.2: +enum Contexts { + kNumRefPatchContext = 0, + kReferenceFrameContext = 1, + kPatchSizeContext = 2, + kPatchReferencePositionContext = 3, + kPatchPositionContext = 4, + kPatchBlendModeContext = 5, + kPatchOffsetContext = 6, + kPatchCountContext = 7, + kPatchAlphaChannelContext = 8, + kPatchClampContext = 9, + kNumPatchDictionaryContexts +}; + +} // namespace jxl + +#endif // LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/patch_dictionary_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/patch_dictionary_test.cc new file mode 100644 index 0000000000..3bcc1351e6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/patch_dictionary_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" + +namespace jxl { +namespace { + +using ::jxl::test::Roundtrip; + +TEST(PatchDictionaryTest, GrayscaleModular) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = ReadTestData("jxl/grayscale_patches.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + CompressParams cparams; + cparams.color_transform = jxl::ColorTransform::kNone; + cparams.modular_mode = true; + cparams.patches = jxl::Override::kOn; + DecompressParams dparams; + + CodecInOut io2; + // Without patches: ~25k + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 8000); + VerifyRelativeError(*io.Main().color(), *io2.Main().color(), 1e-7f, 0); +} + +TEST(PatchDictionaryTest, GrayscaleVarDCT) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = ReadTestData("jxl/grayscale_patches.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + + CompressParams cparams; + cparams.patches = jxl::Override::kOn; + DecompressParams dparams; + + CodecInOut io2; + // Without patches: ~47k + EXPECT_LE(Roundtrip(&io, cparams, dparams, pool, &io2), 14000); + // Without patches: ~1.2 + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 1.1); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/preview_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/preview_test.cc new file mode 100644 index 0000000000..0a36be8c7b --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/preview_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include + +#include + +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/override.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/dec_file.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/headers.h" +#include "lib/jxl/image_bundle.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" + +namespace jxl { +namespace { +using test::Roundtrip; + +TEST(PreviewTest, RoundtripGivenPreview) { + ThreadPool* pool = nullptr; + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ASSERT_TRUE(SetFromBytes(Span(orig), &io, pool)); + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + // Same as main image + io.preview_frame = io.Main().Copy(); + const size_t preview_xsize = 15; + const size_t preview_ysize = 27; + io.preview_frame.ShrinkTo(preview_xsize, preview_ysize); + io.metadata.m.have_preview = true; + ASSERT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(), + io.preview_frame.ysize())); + + CompressParams cparams; + cparams.butteraugli_distance = 2.0; + cparams.speed_tier = SpeedTier::kSquirrel; + DecompressParams dparams; + + dparams.preview = Override::kOff; + + CodecInOut io2; + Roundtrip(&io, cparams, dparams, pool, &io2); + EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, pool), + 2.5); + EXPECT_EQ(0, io2.preview_frame.xsize()); + + dparams.preview = Override::kOn; + + CodecInOut io3; + Roundtrip(&io, cparams, dparams, pool, &io3); + EXPECT_EQ(preview_xsize, io3.metadata.m.preview_size.xsize()); + EXPECT_EQ(preview_ysize, io3.metadata.m.preview_size.ysize()); + 
EXPECT_EQ(preview_xsize, io3.preview_frame.xsize()); + EXPECT_EQ(preview_ysize, io3.preview_frame.ysize()); + + EXPECT_LE(ButteraugliDistance(io.preview_frame, io3.preview_frame, + cparams.ba_params, + /*distmap=*/nullptr, pool), + 2.5); + EXPECT_LE(ButteraugliDistance(io.Main(), io3.Main(), cparams.ba_params, + /*distmap=*/nullptr, pool), + 2.5); +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc new file mode 100644 index 0000000000..d0a16b915a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.cc @@ -0,0 +1,128 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/progressive_split.h" + +#include + +#include +#include + +#include "lib/jxl/common.h" +#include "lib/jxl/image.h" + +namespace jxl { + +bool ProgressiveSplitter::SuperblockIsSalient(size_t row_start, + size_t col_start, size_t num_rows, + size_t num_cols) const { + if (saliency_map_ == nullptr || saliency_map_->xsize() == 0 || + saliency_threshold_ == 0.0) { + // If we do not have a saliency-map, or the threshold says to include + // every block, we straightaway classify the superblock as 'salient'. + return true; + } + const size_t row_end = std::min(saliency_map_->ysize(), row_start + num_rows); + const size_t col_end = std::min(saliency_map_->xsize(), col_start + num_cols); + for (size_t num_row = row_start; num_row < row_end; num_row++) { + const float* JXL_RESTRICT map_row = saliency_map_->ConstRow(num_row); + for (size_t num_col = col_start; num_col < col_end; num_col++) { + if (map_row[num_col] >= saliency_threshold_) { + // One of the blocks covered by this superblock is above the saliency + // threshold. 
+ return true; + } + } + } + // We did not see any block above the saliency threshold. + return false; +} + +template +void ProgressiveSplitter::SplitACCoefficients( + const T* JXL_RESTRICT block, size_t size, const AcStrategy& acs, size_t bx, + size_t by, size_t offset, T* JXL_RESTRICT output[kMaxNumPasses][3]) { + auto shift_right_round0 = [&](T v, int shift) { + T one_if_negative = static_cast(v) >> 31; + T add = (one_if_negative << shift) - one_if_negative; + return (v + add) >> shift; + }; + // Early quit for the simple case of only one pass. + if (mode_.num_passes == 1) { + for (size_t c = 0; c < 3; c++) { + memcpy(output[0][c] + offset, block + c * size, sizeof(T) * size); + } + return; + } + size_t ncoeffs_all_done_from_earlier_passes = 1; + size_t previous_pass_salient_only = false; + + int previous_pass_shift = 0; + for (size_t num_pass = 0; num_pass < mode_.num_passes; num_pass++) { // pass + // Zero out output block. + for (size_t c = 0; c < 3; c++) { + memset(output[num_pass][c] + offset, 0, size * sizeof(T)); + } + const bool current_pass_salient_only = mode_.passes[num_pass].salient_only; + const int pass_shift = mode_.passes[num_pass].shift; + size_t frame_ncoeffs = mode_.passes[num_pass].num_coefficients; + for (size_t c = 0; c < 3; c++) { // color-channel + size_t xsize = acs.covered_blocks_x(); + size_t ysize = acs.covered_blocks_y(); + CoefficientLayout(&ysize, &xsize); + if (current_pass_salient_only || previous_pass_salient_only) { + // Current or previous pass is salient-only. + const bool superblock_is_salient = + SuperblockIsSalient(by, bx, ysize, xsize); + if (current_pass_salient_only != superblock_is_salient) { + // Current pass is salient-only, but block is not salient, + // OR last pass was salient-only, and block is salient + // (hence was already included in last pass). 
+ continue; + } + } + for (size_t y = 0; y < ysize * frame_ncoeffs; y++) { // superblk-y + for (size_t x = 0; x < xsize * frame_ncoeffs; x++) { // superblk-x + size_t pos = y * xsize * kBlockDim + x; + if (x < xsize * ncoeffs_all_done_from_earlier_passes && + y < ysize * ncoeffs_all_done_from_earlier_passes) { + // This coefficient was already included in an earlier pass, + // which included a genuinely smaller set of coefficients + // (= is not about saliency-splitting). + continue; + } + T v = block[c * size + pos]; + // Previous pass discarded some bits: do not encode them again. + if (previous_pass_shift != 0) { + T previous_v = shift_right_round0(v, previous_pass_shift) * + (1 << previous_pass_shift); + v -= previous_v; + } + output[num_pass][c][offset + pos] = shift_right_round0(v, pass_shift); + } // superblk-x + } // superblk-y + } // color-channel + if (!current_pass_salient_only) { + // We just finished a non-salient pass. + // Hence, we are now guaranteed to have included all coeffs up to + // frame_ncoeffs in every block, unless the current pass is shifted. 
+ if (mode_.passes[num_pass].shift == 0) { + ncoeffs_all_done_from_earlier_passes = frame_ncoeffs; + } + } + previous_pass_salient_only = current_pass_salient_only; + previous_pass_shift = mode_.passes[num_pass].shift; + } // num_pass +} + +template void ProgressiveSplitter::SplitACCoefficients( + const int32_t* JXL_RESTRICT, size_t, const AcStrategy&, size_t, size_t, + size_t, int32_t* JXL_RESTRICT[kMaxNumPasses][3]); + +template void ProgressiveSplitter::SplitACCoefficients( + const int16_t* JXL_RESTRICT, size_t, const AcStrategy&, size_t, size_t, + size_t, int16_t* JXL_RESTRICT[kMaxNumPasses][3]); + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.h new file mode 100644 index 0000000000..68ab7bc9dc --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/progressive_split.h @@ -0,0 +1,149 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_PROGRESSIVE_SPLIT_H_ +#define LIB_JXL_PROGRESSIVE_SPLIT_H_ + +#include +#include + +#include +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/frame_header.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/splines.h" + +// Functions to split DCT coefficients in multiple passes. All the passes of a +// single frame are added together. + +namespace jxl { + +constexpr size_t kNoDownsamplingFactor = std::numeric_limits::max(); + +struct PassDefinition { + // Side of the square of the coefficients that should be kept in each 8x8 + // block. Must be greater than 1, and at most 8. 
Should be in non-decreasing + // order. + size_t num_coefficients; + + // How much to shift the encoded values by, with rounding. + size_t shift; + + // Whether or not we should include only salient blocks. + // TODO(veluca): ignored for now. + bool salient_only; + + // If specified, this indicates that if the requested downsampling factor is + // sufficiently high, then it is fine to stop decoding after this pass. + // By default, passes are not marked as being suitable for any downsampling. + size_t suitable_for_downsampling_of_at_least; +}; + +struct ProgressiveMode { + size_t num_passes = 1; + PassDefinition passes[kMaxNumPasses] = {PassDefinition{ + /*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/false, + /*suitable_for_downsampling_of_at_least=*/1}}; + + ProgressiveMode() = default; + + template + explicit ProgressiveMode(const PassDefinition (&p)[nump]) { + JXL_ASSERT(nump <= kMaxNumPasses); + num_passes = nump; + PassDefinition previous_pass{ + /*num_coefficients=*/1, /*shift=*/0, + /*salient_only=*/false, + /*suitable_for_downsampling_of_at_least=*/kNoDownsamplingFactor}; + size_t last_downsampling_factor = kNoDownsamplingFactor; + for (size_t i = 0; i < nump; i++) { + JXL_ASSERT(p[i].num_coefficients > previous_pass.num_coefficients || + (p[i].num_coefficients == previous_pass.num_coefficients && + !p[i].salient_only && previous_pass.salient_only) || + (p[i].num_coefficients == previous_pass.num_coefficients && + p[i].shift < previous_pass.shift)); + JXL_ASSERT(p[i].suitable_for_downsampling_of_at_least == + kNoDownsamplingFactor || + p[i].suitable_for_downsampling_of_at_least <= + last_downsampling_factor); + if (p[i].suitable_for_downsampling_of_at_least != kNoDownsamplingFactor) { + last_downsampling_factor = p[i].suitable_for_downsampling_of_at_least; + } + previous_pass = passes[i] = p[i]; + } + } +}; + +class ProgressiveSplitter { + public: + void SetProgressiveMode(ProgressiveMode mode) { mode_ = mode; } + + void SetSaliencyMap(const ImageF* 
saliency_map) { + saliency_map_ = saliency_map; + } + + void SetSaliencyThreshold(float threshold) { + saliency_threshold_ = threshold; + } + + size_t GetNumPasses() const { return mode_.num_passes; } + + void InitPasses(Passes* JXL_RESTRICT passes) const { + passes->num_passes = static_cast(GetNumPasses()); + passes->num_downsample = 0; + JXL_ASSERT(passes->num_passes != 0); + passes->shift[passes->num_passes - 1] = 0; + if (passes->num_passes == 1) return; // Done, arrays are empty + + for (uint32_t i = 0; i < mode_.num_passes - 1; ++i) { + const size_t min_downsampling_factor = + mode_.passes[i].suitable_for_downsampling_of_at_least; + passes->shift[i] = mode_.passes[i].shift; + if (1 < min_downsampling_factor && + min_downsampling_factor != kNoDownsamplingFactor) { + passes->downsample[passes->num_downsample] = min_downsampling_factor; + passes->last_pass[passes->num_downsample] = i; + passes->num_downsample += 1; + } + } + } + + template + void SplitACCoefficients(const T* JXL_RESTRICT block, size_t size, + const AcStrategy& acs, size_t bx, size_t by, + size_t offset, + T* JXL_RESTRICT output[kMaxNumPasses][3]); + + private: + bool SuperblockIsSalient(size_t row_start, size_t col_start, size_t num_rows, + size_t num_cols) const; + ProgressiveMode mode_; + + // Not owned, must remain valid. 
+ const ImageF* saliency_map_ = nullptr; + float saliency_threshold_ = 0.0; +}; + +extern template void ProgressiveSplitter::SplitACCoefficients( + const int32_t* JXL_RESTRICT, size_t, const AcStrategy&, size_t, size_t, + size_t, int32_t* JXL_RESTRICT[kMaxNumPasses][3]); + +extern template void ProgressiveSplitter::SplitACCoefficients( + const int16_t* JXL_RESTRICT, size_t, const AcStrategy&, size_t, size_t, + size_t, int16_t* JXL_RESTRICT[kMaxNumPasses][3]); + +} // namespace jxl + +#endif // LIB_JXL_PROGRESSIVE_SPLIT_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc new file mode 100644 index 0000000000..f7adc0a838 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.cc @@ -0,0 +1,1184 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +#include "lib/jxl/quant_weights.h" + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/dec_modular.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" + +namespace jxl { + +// kQuantWeights[N * N * c + N * y + x] is the relative weight of the (x, y) +// coefficient in component c. Higher weights correspond to finer quantization +// intervals and more bits spent in encoding. 
+ +namespace { + +static constexpr const float kAlmostZero = 1e-8f; + +void GetQuantWeightsDCT2(const QuantEncoding::DCT2Weights& dct2weights, + float* weights) { + for (size_t c = 0; c < 3; c++) { + size_t start = c * 64; + weights[start] = 0xBAD; + weights[start + 1] = weights[start + 8] = dct2weights[c][0]; + weights[start + 9] = dct2weights[c][1]; + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + weights[start + y * 8 + x + 2] = dct2weights[c][2]; + weights[start + (y + 2) * 8 + x] = dct2weights[c][2]; + } + } + for (size_t y = 0; y < 2; y++) { + for (size_t x = 0; x < 2; x++) { + weights[start + (y + 2) * 8 + x + 2] = dct2weights[c][3]; + } + } + for (size_t y = 0; y < 4; y++) { + for (size_t x = 0; x < 4; x++) { + weights[start + y * 8 + x + 4] = dct2weights[c][4]; + weights[start + (y + 4) * 8 + x] = dct2weights[c][4]; + } + } + for (size_t y = 0; y < 4; y++) { + for (size_t x = 0; x < 4; x++) { + weights[start + (y + 4) * 8 + x + 4] = dct2weights[c][5]; + } + } + } +} + +void GetQuantWeightsIdentity(const QuantEncoding::IdWeights& idweights, + float* weights) { + for (size_t c = 0; c < 3; c++) { + for (int i = 0; i < 64; i++) { + weights[64 * c + i] = idweights[c][0]; + } + weights[64 * c + 1] = idweights[c][1]; + weights[64 * c + 8] = idweights[c][1]; + weights[64 * c + 9] = idweights[c][2]; + } +} + +float Mult(float v) { + if (v > 0) return 1 + v; + return 1 / (1 - v); +} + +float Interpolate(float pos, float max, const float* array, size_t len) { + float scaled_pos = pos * (len - 1) / max; + size_t idx = scaled_pos; + JXL_ASSERT(idx + 1 < len); + float a = array[idx]; + float b = array[idx + 1]; + return a * pow(b / a, scaled_pos - idx); +} + +// Computes quant weights for a COLS*ROWS-sized transform, using num_bands +// eccentricity bands and num_ebands eccentricity bands. If print_mode is 1, +// prints the resulting matrix; if print_mode is 2, prints the matrix in a +// format suitable for a 3d plot with gnuplot. 
+template +Status GetQuantWeights( + size_t ROWS, size_t COLS, + const DctQuantWeightParams::DistanceBandsArray& distance_bands, + size_t num_bands, float* out) { + for (size_t c = 0; c < 3; c++) { + if (print_mode) { + fprintf(stderr, "Channel %zu\n", c); + } + float bands[DctQuantWeightParams::kMaxDistanceBands] = { + distance_bands[c][0]}; + if (bands[0] < kAlmostZero) return JXL_FAILURE("Invalid distance bands"); + for (size_t i = 1; i < num_bands; i++) { + bands[i] = bands[i - 1] * Mult(distance_bands[c][i]); + if (bands[i] < kAlmostZero) return JXL_FAILURE("Invalid distance bands"); + } + for (size_t y = 0; y < ROWS; y++) { + for (size_t x = 0; x < COLS; x++) { + float dx = 1.0f * x / (COLS - 1); + float dy = 1.0f * y / (ROWS - 1); + float distance = std::sqrt(dx * dx + dy * dy); + float weight = + num_bands == 1 + ? bands[0] + : Interpolate(distance, std::sqrt(2) + 1e-6f, bands, num_bands); + + if (print_mode == 1) { + fprintf(stderr, "%15.12f, ", weight); + } + if (print_mode == 2) { + fprintf(stderr, "%zu %zu %15.12f\n", x, y, weight); + } + out[c * COLS * ROWS + y * COLS + x] = weight; + } + if (print_mode) fprintf(stderr, "\n"); + if (print_mode == 1) fprintf(stderr, "\n"); + } + if (print_mode) fprintf(stderr, "\n"); + } + return true; +} + +Status DecodeDctParams(BitReader* br, DctQuantWeightParams* params) { + params->num_distance_bands = + br->ReadFixedBits() + 1; + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < params->num_distance_bands; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, ¶ms->distance_bands[c][i])); + } + if (params->distance_bands[c][0] < kAlmostZero) { + return JXL_FAILURE("Distance band seed is too small"); + } + params->distance_bands[c][0] *= 64.0f; + } + return true; +} + +Status Decode(BitReader* br, QuantEncoding* encoding, size_t required_size_x, + size_t required_size_y, size_t idx, + ModularFrameDecoder* modular_frame_decoder) { + size_t required_size = required_size_x * required_size_y; + required_size_x *= 
kBlockDim; + required_size_y *= kBlockDim; + int mode = br->ReadFixedBits(); + switch (mode) { + case QuantEncoding::kQuantModeLibrary: { + encoding->predefined = br->ReadFixedBits(); + if (encoding->predefined >= kNumPredefinedTables) { + return JXL_FAILURE("Invalid predefined table"); + } + break; + } + case QuantEncoding::kQuantModeID: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 3; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->idweights[c][i])); + if (std::abs(encoding->idweights[c][i]) < kAlmostZero) { + return JXL_FAILURE("ID Quantizer is too small"); + } + encoding->idweights[c][i] *= 64; + } + } + break; + } + case QuantEncoding::kQuantModeDCT2: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 6; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->dct2weights[c][i])); + if (std::abs(encoding->dct2weights[c][i]) < kAlmostZero) { + return JXL_FAILURE("Quantizer is too small"); + } + encoding->dct2weights[c][i] *= 64; + } + } + break; + } + case QuantEncoding::kQuantModeDCT4X8: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR( + F16Coder::Read(br, &encoding->dct4x8multipliers[c])); + if (std::abs(encoding->dct4x8multipliers[c]) < kAlmostZero) { + return JXL_FAILURE("DCT4X8 multiplier is too small"); + } + } + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params)); + break; + } + case QuantEncoding::kQuantModeDCT4: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 2; i++) { + JXL_RETURN_IF_ERROR( + F16Coder::Read(br, &encoding->dct4multipliers[c][i])); + if (std::abs(encoding->dct4multipliers[c][i]) < kAlmostZero) { + return JXL_FAILURE("DCT4 multiplier is too small"); + } + } + } + JXL_RETURN_IF_ERROR(DecodeDctParams(br, 
&encoding->dct_params)); + break; + } + case QuantEncoding::kQuantModeAFV: { + if (required_size != 1) return JXL_FAILURE("Invalid mode"); + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < 9; i++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->afv_weights[c][i])); + } + for (size_t i = 0; i < 6; i++) { + encoding->afv_weights[c][i] *= 64; + } + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params)); + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params_afv_4x4)); + } + break; + } + case QuantEncoding::kQuantModeDCT: { + JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params)); + break; + } + case QuantEncoding::kQuantModeRAW: { + // Set mode early, to avoid mem-leak. + encoding->mode = QuantEncoding::kQuantModeRAW; + JXL_RETURN_IF_ERROR(ModularFrameDecoder::DecodeQuantTable( + required_size_x, required_size_y, br, encoding, idx, + modular_frame_decoder)); + break; + } + default: + return JXL_FAILURE("Invalid quantization table encoding"); + } + encoding->mode = QuantEncoding::Mode(mode); + return true; +} + +// TODO(veluca): SIMD-fy. With 256x256, this is actually slow. +Status ComputeQuantTable(const QuantEncoding& encoding, + float* JXL_RESTRICT table, + float* JXL_RESTRICT inv_table, size_t table_num, + DequantMatrices::QuantTable kind, size_t* pos) { + std::vector weights(3 * kMaxQuantTableSize); + + constexpr size_t N = kBlockDim; + size_t wrows = 8 * DequantMatrices::required_size_x[kind], + wcols = 8 * DequantMatrices::required_size_y[kind]; + size_t num = wrows * wcols; + + switch (encoding.mode) { + case QuantEncoding::kQuantModeLibrary: { + // Library and copy quant encoding should get replaced by the actual + // parameters by the caller. 
+ JXL_ASSERT(false); + break; + } + case QuantEncoding::kQuantModeID: { + JXL_ASSERT(num == kDCTBlockSize); + GetQuantWeightsIdentity(encoding.idweights, weights.data()); + break; + } + case QuantEncoding::kQuantModeDCT2: { + JXL_ASSERT(num == kDCTBlockSize); + GetQuantWeightsDCT2(encoding.dct2weights, weights.data()); + break; + } + case QuantEncoding::kQuantModeDCT4: { + JXL_ASSERT(num == kDCTBlockSize); + float weights4x4[3 * 4 * 4]; + // Always use 4x4 GetQuantWeights for DCT4 quantization tables. + JXL_RETURN_IF_ERROR( + GetQuantWeights(4, 4, encoding.dct_params.distance_bands, + encoding.dct_params.num_distance_bands, weights4x4)); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < kBlockDim; y++) { + for (size_t x = 0; x < kBlockDim; x++) { + weights[c * num + y * kBlockDim + x] = + weights4x4[c * 16 + (y / 2) * 4 + (x / 2)]; + } + } + weights[c * num + 1] /= encoding.dct4multipliers[c][0]; + weights[c * num + N] /= encoding.dct4multipliers[c][0]; + weights[c * num + N + 1] /= encoding.dct4multipliers[c][1]; + } + break; + } + case QuantEncoding::kQuantModeDCT4X8: { + JXL_ASSERT(num == kDCTBlockSize); + float weights4x8[3 * 4 * 8]; + // Always use 4x8 GetQuantWeights for DCT4X8 quantization tables. 
+ JXL_RETURN_IF_ERROR( + GetQuantWeights(4, 8, encoding.dct_params.distance_bands, + encoding.dct_params.num_distance_bands, weights4x8)); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < kBlockDim; y++) { + for (size_t x = 0; x < kBlockDim; x++) { + weights[c * num + y * kBlockDim + x] = + weights4x8[c * 32 + (y / 2) * 8 + x]; + } + } + weights[c * num + N] /= encoding.dct4x8multipliers[c]; + } + break; + } + case QuantEncoding::kQuantModeDCT: { + JXL_RETURN_IF_ERROR(GetQuantWeights( + wrows, wcols, encoding.dct_params.distance_bands, + encoding.dct_params.num_distance_bands, weights.data())); + break; + } + case QuantEncoding::kQuantModeRAW: { + if (!encoding.qraw.qtable || encoding.qraw.qtable->size() != 3 * num) { + return JXL_FAILURE("Invalid table encoding"); + } + for (size_t i = 0; i < 3 * num; i++) { + weights[i] = + 1.f / (encoding.qraw.qtable_den * (*encoding.qraw.qtable)[i]); + } + break; + } + case QuantEncoding::kQuantModeAFV: { + constexpr float kFreqs[] = { + 0xBAD, + 0xBAD, + 0.8517778890324296, + 5.37778436506804, + 0xBAD, + 0xBAD, + 4.734747904497923, + 5.449245381693219, + 1.6598270267479331, + 4, + 7.275749096817861, + 10.423227632456525, + 2.662932286148962, + 7.630657783650829, + 8.962388608184032, + 12.97166202570235, + }; + + float weights4x8[3 * 4 * 8]; + JXL_RETURN_IF_ERROR(( + GetQuantWeights(4, 8, encoding.dct_params.distance_bands, + encoding.dct_params.num_distance_bands, weights4x8))); + float weights4x4[3 * 4 * 4]; + JXL_RETURN_IF_ERROR((GetQuantWeights( + 4, 4, encoding.dct_params_afv_4x4.distance_bands, + encoding.dct_params_afv_4x4.num_distance_bands, weights4x4))); + + constexpr float lo = 0.8517778890324296; + constexpr float hi = 12.97166202570235 - lo + 1e-6; + for (size_t c = 0; c < 3; c++) { + float bands[4]; + bands[0] = encoding.afv_weights[c][5]; + if (bands[0] < kAlmostZero) return JXL_FAILURE("Invalid AFV bands"); + for (size_t i = 1; i < 4; i++) { + bands[i] = bands[i - 1] * Mult(encoding.afv_weights[c][i + 
5]); + if (bands[i] < kAlmostZero) return JXL_FAILURE("Invalid AFV bands"); + } + size_t start = c * 64; + auto set_weight = [&start, &weights](size_t x, size_t y, float val) { + weights[start + y * 8 + x] = val; + }; + weights[start] = 1; // Not used, but causes MSAN error otherwise. + // Weights for (0, 1) and (1, 0). + set_weight(0, 1, encoding.afv_weights[c][0]); + set_weight(1, 0, encoding.afv_weights[c][1]); + // AFV special weights for 3-pixel corner. + set_weight(0, 2, encoding.afv_weights[c][2]); + set_weight(2, 0, encoding.afv_weights[c][3]); + set_weight(2, 2, encoding.afv_weights[c][4]); + + // All other AFV weights. + for (size_t y = 0; y < 4; y++) { + for (size_t x = 0; x < 4; x++) { + if (x < 2 && y < 2) continue; + float val = Interpolate(kFreqs[y * 4 + x] - lo, hi, bands, 4); + set_weight(2 * x, 2 * y, val); + } + } + + // Put 4x8 weights in odd rows, except (1, 0). + for (size_t y = 0; y < kBlockDim / 2; y++) { + for (size_t x = 0; x < kBlockDim; x++) { + if (x == 0 && y == 0) continue; + weights[c * num + (2 * y + 1) * kBlockDim + x] = + weights4x8[c * 32 + y * 8 + x]; + } + } + // Put 4x4 weights in even rows / odd columns, except (0, 1). + for (size_t y = 0; y < kBlockDim / 2; y++) { + for (size_t x = 0; x < kBlockDim / 2; x++) { + if (x == 0 && y == 0) continue; + weights[c * num + (2 * y) * kBlockDim + 2 * x + 1] = + weights4x4[c * 16 + y * 4 + x]; + } + } + } + break; + } + } + size_t prev_pos = *pos; + for (size_t c = 0; c < 3; c++) { + for (size_t i = 0; i < num; i++) { + float inv_val = weights[c * num + i]; + if (inv_val > 1.0f / kAlmostZero || inv_val < kAlmostZero) { + return JXL_FAILURE("Invalid quantization table"); + } + float val = 1.0f / inv_val; + table[*pos] = val; + inv_table[*pos] = inv_val; + (*pos)++; + } + } + // Ensure that the lowest frequencies have a 0 inverse table. + // This does not affect en/decoding, but allows AC strategy selection to be + // slightly simpler. 
+ size_t xs = DequantMatrices::required_size_x[kind]; + size_t ys = DequantMatrices::required_size_y[kind]; + CoefficientLayout(&ys, &xs); + for (size_t c = 0; c < 3; c++) { + for (size_t y = 0; y < ys; y++) { + for (size_t x = 0; x < xs; x++) { + inv_table[prev_pos + c * ys * xs * kDCTBlockSize + y * kBlockDim * xs + + x] = 0; + } + } + } + return true; +} + +} // namespace + +// These definitions are needed before C++17. +constexpr size_t DequantMatrices::required_size_[]; +constexpr size_t DequantMatrices::required_size_x[]; +constexpr size_t DequantMatrices::required_size_y[]; +constexpr DequantMatrices::QuantTable DequantMatrices::kQuantTable[]; + +Status DequantMatrices::Decode(BitReader* br, + ModularFrameDecoder* modular_frame_decoder) { + size_t all_default = br->ReadBits(1); + size_t num_tables = all_default ? 0 : static_cast(kNum); + encodings_.clear(); + encodings_.resize(kNum, QuantEncoding::Library(0)); + for (size_t i = 0; i < num_tables; i++) { + JXL_RETURN_IF_ERROR( + jxl::Decode(br, &encodings_[i], required_size_x[i % kNum], + required_size_y[i % kNum], i, modular_frame_decoder)); + } + return DequantMatrices::Compute(); +} + +Status DequantMatrices::DecodeDC(BitReader* br) { + bool all_default = br->ReadBits(1); + if (!all_default) { + for (size_t c = 0; c < 3; c++) { + JXL_RETURN_IF_ERROR(F16Coder::Read(br, &dc_quant_[c])); + dc_quant_[c] *= 1.0f / 128.0f; + // Negative values and nearly zero are invalid values. 
+ if (dc_quant_[c] < kAlmostZero) { + return JXL_FAILURE("Invalid dc_quant: coefficient is too small."); + } + inv_dc_quant_[c] = 1.0f / dc_quant_[c]; + } + } + return true; +} + +constexpr float V(float v) { return static_cast(v); } + +namespace { +struct DequantMatricesLibraryDef { + // DCT8 + static constexpr const QuantEncodingInternal DCT() { + return QuantEncodingInternal::DCT(DctQuantWeightParams({{{ + V(3150.0), + V(0.0), + V(-0.4), + V(-0.4), + V(-0.4), + V(-2.0), + }, + { + V(560.0), + V(0.0), + V(-0.3), + V(-0.3), + V(-0.3), + V(-0.3), + }, + { + V(512.0), + V(-2.0), + V(-1.0), + V(0.0), + V(-1.0), + V(-2.0), + }}}, + 6)); + } + + // Identity + static constexpr const QuantEncodingInternal IDENTITY() { + return QuantEncodingInternal::Identity({{{ + V(280.0), + V(3160.0), + V(3160.0), + }, + { + V(60.0), + V(864.0), + V(864.0), + }, + { + V(18.0), + V(200.0), + V(200.0), + }}}); + } + + // DCT2 + static constexpr const QuantEncodingInternal DCT2X2() { + return QuantEncodingInternal::DCT2({{{ + V(3840.0), + V(2560.0), + V(1280.0), + V(640.0), + V(480.0), + V(300.0), + }, + { + V(960.0), + V(640.0), + V(320.0), + V(180.0), + V(140.0), + V(120.0), + }, + { + V(640.0), + V(320.0), + V(128.0), + V(64.0), + V(32.0), + V(16.0), + }}}); + } + + // DCT4 (quant_kind 3) + static constexpr const QuantEncodingInternal DCT4X4() { + return QuantEncodingInternal::DCT4(DctQuantWeightParams({{{ + V(2200.0), + V(0.0), + V(0.0), + V(0.0), + }, + { + V(392.0), + V(0.0), + V(0.0), + V(0.0), + }, + { + V(112.0), + V(-0.25), + V(-0.25), + V(-0.5), + }}}, + 4), + /* kMul */ + {{{ + V(1.0), + V(1.0), + }, + { + V(1.0), + V(1.0), + }, + { + V(1.0), + V(1.0), + }}}); + } + + // DCT16 + static constexpr const QuantEncodingInternal DCT16X16() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(8996.8725711814115328), + V(-1.3000777393353804), + V(-0.49424529824571225), + V(-0.439093774457103443), + V(-0.6350101832695744), + V(-0.90177264050827612), + 
V(-1.6162099239887414), + }, + { + V(3191.48366296844234752), + V(-0.67424582104194355), + V(-0.80745813428471001), + V(-0.44925837484843441), + V(-0.35865440981033403), + V(-0.31322389111877305), + V(-0.37615025315725483), + }, + { + V(1157.50408145487200256), + V(-2.0531423165804414), + V(-1.4), + V(-0.50687130033378396), + V(-0.42708730624733904), + V(-1.4856834539296244), + V(-4.9209142884401604), + }}}, + 7)); + } + + // DCT32 + static constexpr const QuantEncodingInternal DCT32X32() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(15718.40830982518931456), + V(-1.025), + V(-0.98), + V(-0.9012), + V(-0.4), + V(-0.48819395464), + V(-0.421064), + V(-0.27), + }, + { + V(7305.7636810695983104), + V(-0.8041958212306401), + V(-0.7633036457487539), + V(-0.55660379990111464), + V(-0.49785304658857626), + V(-0.43699592683512467), + V(-0.40180866526242109), + V(-0.27321683125358037), + }, + { + V(3803.53173721215041536), + V(-3.060733579805728), + V(-2.0413270132490346), + V(-2.0235650159727417), + V(-0.5495389509954993), + V(-0.4), + V(-0.4), + V(-0.3), + }}}, + 8)); + } + + // DCT16X8 + static constexpr const QuantEncodingInternal DCT8X16() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(7240.7734393502), + V(-0.7), + V(-0.7), + V(-0.2), + V(-0.2), + V(-0.2), + V(-0.5), + }, + { + V(1448.15468787004), + V(-0.5), + V(-0.5), + V(-0.5), + V(-0.2), + V(-0.2), + V(-0.2), + }, + { + V(506.854140754517), + V(-1.4), + V(-0.2), + V(-0.5), + V(-0.5), + V(-1.5), + V(-3.6), + }}}, + 7)); + } + + // DCT32X8 + static constexpr const QuantEncodingInternal DCT8X32() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(16283.2494710648897), + V(-1.7812845336559429), + V(-1.6309059012653515), + V(-1.0382179034313539), + V(-0.85), + V(-0.7), + V(-0.9), + V(-1.2360638576849587), + }, + { + V(5089.15750884921511936), + V(-0.320049391452786891), + V(-0.35362849922161446), + V(-0.30340000000000003), + V(-0.61), + V(-0.5), + 
V(-0.5), + V(-0.6), + }, + { + V(3397.77603275308720128), + V(-0.321327362693153371), + V(-0.34507619223117997), + V(-0.70340000000000003), + V(-0.9), + V(-1.0), + V(-1.0), + V(-1.1754605576265209), + }}}, + 8)); + } + + // DCT32X16 + static constexpr const QuantEncodingInternal DCT16X32() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(13844.97076442300573), + V(-0.97113799999999995), + V(-0.658), + V(-0.42026), + V(-0.22712), + V(-0.2206), + V(-0.226), + V(-0.6), + }, + { + V(4798.964084220744293), + V(-0.61125308982767057), + V(-0.83770786552491361), + V(-0.79014862079498627), + V(-0.2692727459704829), + V(-0.38272769465388551), + V(-0.22924222653091453), + V(-0.20719098826199578), + }, + { + V(1807.236946760964614), + V(-1.2), + V(-1.2), + V(-0.7), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}, + 8)); + } + + // DCT4X8 and 8x4 + static constexpr const QuantEncodingInternal DCT4X8() { + return QuantEncodingInternal::DCT4X8( + DctQuantWeightParams({{ + { + V(2198.050556016380522), + V(-0.96269623020744692), + V(-0.76194253026666783), + V(-0.6551140670773547), + }, + { + V(764.3655248643528689), + V(-0.92630200888366945), + V(-0.9675229603596517), + V(-0.27845290869168118), + }, + { + V(527.107573587542228), + V(-1.4594385811273854), + V(-1.450082094097871593), + V(-1.5843722511996204), + }, + }}, + 4), + /* kMuls */ + {{ + V(1.0), + V(1.0), + V(1.0), + }}); + } + // AFV + static const QuantEncodingInternal AFV0() { + return QuantEncodingInternal::AFV(DCT4X8().dct_params, DCT4X4().dct_params, + {{{ + // 4x4/4x8 DC tendency. + V(3072.0), + V(3072.0), + // AFV corner. + V(256.0), + V(256.0), + V(256.0), + // AFV high freqs. + V(414.0), + V(0.0), + V(0.0), + V(0.0), + }, + { + // 4x4/4x8 DC tendency. + V(1024.0), + V(1024.0), + // AFV corner. + V(50), + V(50), + V(50), + // AFV high freqs. + V(58.0), + V(0.0), + V(0.0), + V(0.0), + }, + { + // 4x4/4x8 DC tendency. + V(384.0), + V(384.0), + // AFV corner. 
+ V(12.0), + V(12.0), + V(12.0), + // AFV high freqs. + V(22.0), + V(-0.25), + V(-0.25), + V(-0.25), + }}}); + } + + // DCT64 + static const QuantEncodingInternal DCT64X64() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(0.9 * 26629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }, + { + V(0.9 * 9311.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }, + { + V(0.9 * 4992.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}, + 8)); + } + + // DCT64X32 + static const QuantEncodingInternal DCT32X64() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(0.65 * 23629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }, + { + V(0.65 * 8611.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }, + { + V(0.65 * 4492.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}, + 8)); + } + // DCT128X128 + static const QuantEncodingInternal DCT128X128() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(1.8 * 26629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }, + { + V(1.8 * 9311.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }, + { + V(1.8 * 4992.2486445538634), + V(-1.2), + 
V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}, + 8)); + } + + // DCT128X64 + static const QuantEncodingInternal DCT64X128() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(1.3 * 23629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }, + { + V(1.3 * 8611.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }, + { + V(1.3 * 4492.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}, + 8)); + } + // DCT256X256 + static const QuantEncodingInternal DCT256X256() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(3.6 * 26629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }, + { + V(3.6 * 9311.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }, + { + V(3.6 * 4992.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + V(-0.7), + V(-0.4), + V(-0.5), + }}}, + 8)); + } + + // DCT256X128 + static const QuantEncodingInternal DCT128X256() { + return QuantEncodingInternal::DCT( + DctQuantWeightParams({{{ + V(2.6 * 23629.073922049845), + V(-1.025), + V(-0.78), + V(-0.65012), + V(-0.19041574084286472), + V(-0.20819395464), + V(-0.421064), + V(-0.32733845535848671), + }, + { + V(2.6 * 8611.3238710010046), + V(-0.3041958212306401), + V(-0.3633036457487539), + V(-0.35660379990111464), + V(-0.3443074455424403), + V(-0.33699592683512467), + V(-0.30180866526242109), + V(-0.27321683125358037), + }, + { + V(2.6 * 4492.2486445538634), + V(-1.2), + V(-1.2), + V(-0.8), + V(-0.7), + 
V(-0.7), + V(-0.4), + V(-0.5), + }}}, + 8)); + } +}; +} // namespace + +const DequantMatrices::DequantLibraryInternal DequantMatrices::LibraryInit() { + static_assert(kNum == 17, + "Update this function when adding new quantization kinds."); + static_assert(kNumPredefinedTables == 1, + "Update this function when adding new quantization matrices to " + "the library."); + + // The library and the indices need to be kept in sync manually. + static_assert(0 == DCT, "Update the DequantLibrary array below."); + static_assert(1 == IDENTITY, "Update the DequantLibrary array below."); + static_assert(2 == DCT2X2, "Update the DequantLibrary array below."); + static_assert(3 == DCT4X4, "Update the DequantLibrary array below."); + static_assert(4 == DCT16X16, "Update the DequantLibrary array below."); + static_assert(5 == DCT32X32, "Update the DequantLibrary array below."); + static_assert(6 == DCT8X16, "Update the DequantLibrary array below."); + static_assert(7 == DCT8X32, "Update the DequantLibrary array below."); + static_assert(8 == DCT16X32, "Update the DequantLibrary array below."); + static_assert(9 == DCT4X8, "Update the DequantLibrary array below."); + static_assert(10 == AFV0, "Update the DequantLibrary array below."); + static_assert(11 == DCT64X64, "Update the DequantLibrary array below."); + static_assert(12 == DCT32X64, "Update the DequantLibrary array below."); + static_assert(13 == DCT128X128, "Update the DequantLibrary array below."); + static_assert(14 == DCT64X128, "Update the DequantLibrary array below."); + static_assert(15 == DCT256X256, "Update the DequantLibrary array below."); + static_assert(16 == DCT128X256, "Update the DequantLibrary array below."); + return DequantMatrices::DequantLibraryInternal{ + DequantMatricesLibraryDef::DCT(), + DequantMatricesLibraryDef::IDENTITY(), + DequantMatricesLibraryDef::DCT2X2(), + DequantMatricesLibraryDef::DCT4X4(), + DequantMatricesLibraryDef::DCT16X16(), + DequantMatricesLibraryDef::DCT32X32(), + 
DequantMatricesLibraryDef::DCT8X16(), + DequantMatricesLibraryDef::DCT8X32(), + DequantMatricesLibraryDef::DCT16X32(), + DequantMatricesLibraryDef::DCT4X8(), + DequantMatricesLibraryDef::AFV0(), + DequantMatricesLibraryDef::DCT64X64(), + DequantMatricesLibraryDef::DCT32X64(), + // Same default for large transforms (128+) as for 64x* transforms. + DequantMatricesLibraryDef::DCT128X128(), + DequantMatricesLibraryDef::DCT64X128(), + DequantMatricesLibraryDef::DCT256X256(), + DequantMatricesLibraryDef::DCT128X256(), + }; +} + +const QuantEncoding* DequantMatrices::Library() { + static const DequantMatrices::DequantLibraryInternal kDequantLibrary = + DequantMatrices::LibraryInit(); + // Downcast the result to a const QuantEncoding* from QuantEncodingInternal* + // since the subclass (QuantEncoding) doesn't add any new members and users + // will need to upcast to QuantEncodingInternal to access the members of that + // class. This allows to have kDequantLibrary as a constexpr value while still + // allowing to create QuantEncoding::RAW() instances that use std::vector in + // C++11. 
+ return reinterpret_cast(kDequantLibrary.data()); +} + +Status DequantMatrices::Compute() { + size_t pos = 0; + + struct DefaultMatrices { + DefaultMatrices() { + const QuantEncoding* library = Library(); + size_t pos = 0; + for (size_t i = 0; i < kNum; i++) { + JXL_CHECK(ComputeQuantTable(library[i], table, inv_table, i, + QuantTable(i), &pos)); + } + JXL_CHECK(pos == kTotalTableSize); + } + HWY_ALIGN_MAX float table[kTotalTableSize]; + HWY_ALIGN_MAX float inv_table[kTotalTableSize]; + }; + + static const DefaultMatrices& default_matrices = + *hwy::MakeUniqueAligned().release(); + + JXL_ASSERT(encodings_.size() == kNum); + + bool has_nondefault_matrix = false; + for (const auto& enc : encodings_) { + if (enc.mode != QuantEncoding::kQuantModeLibrary) { + has_nondefault_matrix = true; + } + } + if (has_nondefault_matrix) { + table_storage_ = hwy::AllocateAligned(2 * kTotalTableSize); + table_ = table_storage_.get(); + inv_table_ = table_storage_.get() + kTotalTableSize; + for (size_t table = 0; table < kNum; table++) { + size_t prev_pos = pos; + if (encodings_[table].mode == QuantEncoding::kQuantModeLibrary) { + size_t num = required_size_[table] * kDCTBlockSize; + memcpy(table_storage_.get() + prev_pos, + default_matrices.table + prev_pos, num * sizeof(float) * 3); + memcpy(table_storage_.get() + kTotalTableSize + prev_pos, + default_matrices.inv_table + prev_pos, num * sizeof(float) * 3); + pos += num * 3; + } else { + JXL_RETURN_IF_ERROR( + ComputeQuantTable(encodings_[table], table_storage_.get(), + table_storage_.get() + kTotalTableSize, table, + QuantTable(table), &pos)); + } + } + JXL_ASSERT(pos == kTotalTableSize); + } else { + table_ = default_matrices.table; + inv_table_ = default_matrices.inv_table; + } + + return true; +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.h new file mode 100644 index 0000000000..816362f81c --- /dev/null +++ 
b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights.h @@ -0,0 +1,469 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_QUANT_WEIGHTS_H_ +#define LIB_JXL_QUANT_WEIGHTS_H_ + +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/image.h" + +namespace jxl { + +template +constexpr T ArraySum(T (&a)[N], size_t i = N - 1) { + static_assert(N > 0, "Trying to compute the sum of an empty array"); + return i == 0 ? a[0] : a[i] + ArraySum(a, i - 1); +} + +static constexpr size_t kMaxQuantTableSize = AcStrategy::kMaxCoeffArea; +static constexpr size_t kNumPredefinedTables = 1; +static constexpr size_t kCeilLog2NumPredefinedTables = 0; +static constexpr size_t kLog2NumQuantModes = 3; + +struct DctQuantWeightParams { + static constexpr size_t kLog2MaxDistanceBands = 4; + static constexpr size_t kMaxDistanceBands = 1 + (1 << kLog2MaxDistanceBands); + typedef std::array, 3> + DistanceBandsArray; + + size_t num_distance_bands = 0; + DistanceBandsArray distance_bands = {}; + + constexpr DctQuantWeightParams() : num_distance_bands(0) {} + + constexpr DctQuantWeightParams(const DistanceBandsArray& dist_bands, + size_t num_dist_bands) + : num_distance_bands(num_dist_bands), distance_bands(dist_bands) {} + + template + explicit DctQuantWeightParams(const float dist_bands[3][num_dist_bands]) { + num_distance_bands = num_dist_bands; + for (size_t c = 0; c < 3; c++) { + memcpy(distance_bands[c].data(), dist_bands[c], + sizeof(float) * num_dist_bands); + } + } +}; + +// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) 
+struct QuantEncodingInternal { + enum Mode { + kQuantModeLibrary, + kQuantModeID, + kQuantModeDCT2, + kQuantModeDCT4, + kQuantModeDCT4X8, + kQuantModeAFV, + kQuantModeDCT, + kQuantModeRAW, + }; + + template + struct Tag {}; + + typedef std::array, 3> IdWeights; + typedef std::array, 3> DCT2Weights; + typedef std::array, 3> DCT4Multipliers; + typedef std::array, 3> AFVWeights; + typedef std::array DCT4x8Multipliers; + + static constexpr QuantEncodingInternal Library(uint8_t predefined) { + return ((predefined < kNumPredefinedTables) || + JXL_ABORT("Assert predefined < kNumPredefinedTables")), + QuantEncodingInternal(Tag(), predefined); + } + constexpr QuantEncodingInternal(Tag /* tag */, + uint8_t predefined) + : mode(kQuantModeLibrary), predefined(predefined) {} + + // Identity + // xybweights is an array of {xweights, yweights, bweights}. + static constexpr QuantEncodingInternal Identity(const IdWeights& xybweights) { + return QuantEncodingInternal(Tag(), xybweights); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const IdWeights& xybweights) + : mode(kQuantModeID), idweights(xybweights) {} + + // DCT2 + static constexpr QuantEncodingInternal DCT2(const DCT2Weights& xybweights) { + return QuantEncodingInternal(Tag(), xybweights); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DCT2Weights& xybweights) + : mode(kQuantModeDCT2), dct2weights(xybweights) {} + + // DCT4 + static constexpr QuantEncodingInternal DCT4( + const DctQuantWeightParams& params, const DCT4Multipliers& xybmul) { + return QuantEncodingInternal(Tag(), params, xybmul); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DctQuantWeightParams& params, + const DCT4Multipliers& xybmul) + : mode(kQuantModeDCT4), dct_params(params), dct4multipliers(xybmul) {} + + // DCT4x8 + static constexpr QuantEncodingInternal DCT4X8( + const DctQuantWeightParams& params, const DCT4x8Multipliers& xybmul) { + return QuantEncodingInternal(Tag(), params, xybmul); + } + constexpr 
QuantEncodingInternal(Tag /* tag */, + const DctQuantWeightParams& params, + const DCT4x8Multipliers& xybmul) + : mode(kQuantModeDCT4X8), dct_params(params), dct4x8multipliers(xybmul) {} + + // DCT + static constexpr QuantEncodingInternal DCT( + const DctQuantWeightParams& params) { + return QuantEncodingInternal(Tag(), params); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DctQuantWeightParams& params) + : mode(kQuantModeDCT), dct_params(params) {} + + // AFV + static constexpr QuantEncodingInternal AFV( + const DctQuantWeightParams& params4x8, + const DctQuantWeightParams& params4x4, const AFVWeights& weights) { + return QuantEncodingInternal(Tag(), params4x8, params4x4, + weights); + } + constexpr QuantEncodingInternal(Tag /* tag */, + const DctQuantWeightParams& params4x8, + const DctQuantWeightParams& params4x4, + const AFVWeights& weights) + : mode(kQuantModeAFV), + dct_params(params4x8), + afv_weights(weights), + dct_params_afv_4x4(params4x4) {} + + // This constructor is not constexpr so it can't be used in any of the + // constexpr cases above. + explicit QuantEncodingInternal(Mode mode) : mode(mode) {} + + Mode mode; + + // Weights for DCT4+ tables. + DctQuantWeightParams dct_params; + + union { + // Weights for identity. + IdWeights idweights; + + // Weights for DCT2. + DCT2Weights dct2weights; + + // Extra multipliers for coefficients 01/10 and 11 for DCT4 and AFV. + DCT4Multipliers dct4multipliers; + + // Weights for AFV. {0, 1} are used directly for coefficients (0, 1) and (1, + // 0); {2, 3, 4} are used directly corner DC, (1,0) - (0,1) and (0, 1) + + // (1, 0) - (0, 0) inside the AFV block. Values from 5 to 8 are interpolated + // as in GetQuantWeights for DC and are used for other coefficients. + AFVWeights afv_weights = {}; + + // Extra multipliers for coefficients 01 or 10 for DCT4X8 and DCT8X4. + DCT4x8Multipliers dct4x8multipliers; + + // Only used in kQuantModeRAW mode. 
+ struct { + // explicit quantization table (like in JPEG) + std::vector* qtable = nullptr; + float qtable_den = 1.f / (8 * 255); + } qraw; + }; + + // Weights for 4x4 sub-block in AFV. + DctQuantWeightParams dct_params_afv_4x4; + + union { + // Which predefined table to use. Only used if mode is kQuantModeLibrary. + uint8_t predefined = 0; + + // Which other quant table to copy; must copy from a table that comes before + // the current one. Only used if mode is kQuantModeCopy. + uint8_t source; + }; +}; + +class QuantEncoding final : public QuantEncodingInternal { + public: + QuantEncoding(const QuantEncoding& other) + : QuantEncodingInternal( + static_cast(other)) { + if (mode == kQuantModeRAW && qraw.qtable) { + // Need to make a copy of the passed *qtable. + qraw.qtable = new std::vector(*other.qraw.qtable); + } + } + QuantEncoding(QuantEncoding&& other) noexcept + : QuantEncodingInternal( + static_cast(other)) { + // Steal the qtable from the other object if any. + if (mode == kQuantModeRAW) { + other.qraw.qtable = nullptr; + } + } + QuantEncoding& operator=(const QuantEncoding& other) { + if (mode == kQuantModeRAW && qraw.qtable) { + delete qraw.qtable; + } + *static_cast(this) = + QuantEncodingInternal(static_cast(other)); + if (mode == kQuantModeRAW && qraw.qtable) { + // Need to make a copy of the passed *qtable. + qraw.qtable = new std::vector(*other.qraw.qtable); + } + return *this; + } + + ~QuantEncoding() { + if (mode == kQuantModeRAW && qraw.qtable) { + delete qraw.qtable; + } + } + + // Wrappers of the QuantEncodingInternal:: static functions that return a + // QuantEncoding instead. This is using the explicit and private cast from + // QuantEncodingInternal to QuantEncoding, which would be inlined anyway. + // In general, you should use this wrappers. The only reason to directly + // create a QuantEncodingInternal instance is if you need a constexpr version + // of this class. 
Note that RAW() is not supported in that case since it uses + // a std::vector. + static QuantEncoding Library(uint8_t predefined) { + return QuantEncoding(QuantEncodingInternal::Library(predefined)); + } + static QuantEncoding Identity(const IdWeights& xybweights) { + return QuantEncoding(QuantEncodingInternal::Identity(xybweights)); + } + static QuantEncoding DCT2(const DCT2Weights& xybweights) { + return QuantEncoding(QuantEncodingInternal::DCT2(xybweights)); + } + static QuantEncoding DCT4(const DctQuantWeightParams& params, + const DCT4Multipliers& xybmul) { + return QuantEncoding(QuantEncodingInternal::DCT4(params, xybmul)); + } + static QuantEncoding DCT4X8(const DctQuantWeightParams& params, + const DCT4x8Multipliers& xybmul) { + return QuantEncoding(QuantEncodingInternal::DCT4X8(params, xybmul)); + } + static QuantEncoding DCT(const DctQuantWeightParams& params) { + return QuantEncoding(QuantEncodingInternal::DCT(params)); + } + static QuantEncoding AFV(const DctQuantWeightParams& params4x8, + const DctQuantWeightParams& params4x4, + const AFVWeights& weights) { + return QuantEncoding( + QuantEncodingInternal::AFV(params4x8, params4x4, weights)); + } + + // RAW, note that this one is not a constexpr one. + static QuantEncoding RAW(const std::vector& qtable, int shift = 0) { + QuantEncoding encoding(kQuantModeRAW); + encoding.qraw.qtable = new std::vector(); + *encoding.qraw.qtable = qtable; + encoding.qraw.qtable_den = (1 << shift) * (1.f / (8 * 255)); + return encoding; + } + + private: + explicit QuantEncoding(const QuantEncodingInternal& other) + : QuantEncodingInternal(other) {} + + explicit QuantEncoding(QuantEncodingInternal::Mode mode) + : QuantEncodingInternal(mode) {} +}; + +// A constexpr QuantEncodingInternal instance is often downcasted to the +// QuantEncoding subclass even if the instance wasn't an instance of the +// subclass. This is safe because user will upcast to QuantEncodingInternal to +// access any of its members. 
+static_assert(sizeof(QuantEncoding) == sizeof(QuantEncodingInternal), + "Don't add any members to QuantEncoding"); + +// Let's try to keep these 2**N for possible future simplicity. +const float kInvDCQuant[3] = { + 4096.0f, + 512.0f, + 256.0f, +}; + +const float kDCQuant[3] = { + 1.0f / kInvDCQuant[0], + 1.0f / kInvDCQuant[1], + 1.0f / kInvDCQuant[2], +}; + +class ModularFrameEncoder; +class ModularFrameDecoder; + +class DequantMatrices { + public: + enum QuantTable : size_t { + DCT = 0, + IDENTITY, + DCT2X2, + DCT4X4, + DCT16X16, + DCT32X32, + // DCT16X8 + DCT8X16, + // DCT32X8 + DCT8X32, + // DCT32X16 + DCT16X32, + DCT4X8, + // DCT8X4 + AFV0, + // AFV1 + // AFV2 + // AFV3 + DCT64X64, + // DCT64X32, + DCT32X64, + DCT128X128, + // DCT128X64, + DCT64X128, + DCT256X256, + // DCT256X128, + DCT128X256, + kNum + }; + + static constexpr QuantTable kQuantTable[] = { + QuantTable::DCT, QuantTable::IDENTITY, QuantTable::DCT2X2, + QuantTable::DCT4X4, QuantTable::DCT16X16, QuantTable::DCT32X32, + QuantTable::DCT8X16, QuantTable::DCT8X16, QuantTable::DCT8X32, + QuantTable::DCT8X32, QuantTable::DCT16X32, QuantTable::DCT16X32, + QuantTable::DCT4X8, QuantTable::DCT4X8, QuantTable::AFV0, + QuantTable::AFV0, QuantTable::AFV0, QuantTable::AFV0, + QuantTable::DCT64X64, QuantTable::DCT32X64, QuantTable::DCT32X64, + QuantTable::DCT128X128, QuantTable::DCT64X128, QuantTable::DCT64X128, + QuantTable::DCT256X256, QuantTable::DCT128X256, QuantTable::DCT128X256, + }; + static_assert(AcStrategy::kNumValidStrategies == + sizeof(kQuantTable) / sizeof *kQuantTable, + "Update this array when adding or removing AC strategies."); + + DequantMatrices() { + encodings_.resize(size_t(QuantTable::kNum), QuantEncoding::Library(0)); + size_t pos = 0; + size_t offsets[kNum * 3]; + for (size_t i = 0; i < size_t(QuantTable::kNum); i++) { + encodings_[i] = QuantEncoding::Library(0); + size_t num = required_size_[i] * kDCTBlockSize; + for (size_t c = 0; c < 3; c++) { + offsets[3 * i + c] = pos + c * num; + 
} + pos += 3 * num; + } + for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) { + for (size_t c = 0; c < 3; c++) { + table_offsets_[i * 3 + c] = offsets[kQuantTable[i] * 3 + c]; + } + } + // Default quantization tables need to be valid. + JXL_CHECK(Compute()); + } + + static const QuantEncoding* Library(); + + typedef std::array + DequantLibraryInternal; + // Return the array of library kNumPredefinedTables QuantEncoding entries as + // a constexpr array. Use Library() to obtain a pointer to the copy in the + // .cc file. + static const DequantLibraryInternal LibraryInit(); + + JXL_INLINE size_t MatrixOffset(size_t quant_kind, size_t c) const { + JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies); + return table_offsets_[quant_kind * 3 + c]; + } + + // Returns aligned memory. + JXL_INLINE const float* Matrix(size_t quant_kind, size_t c) const { + JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies); + return &table_[MatrixOffset(quant_kind, c)]; + } + + JXL_INLINE const float* InvMatrix(size_t quant_kind, size_t c) const { + JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies); + return &inv_table_[MatrixOffset(quant_kind, c)]; + } + + // DC quants are used in modular mode for XYB multipliers. + JXL_INLINE float DCQuant(size_t c) const { return dc_quant_[c]; } + JXL_INLINE const float* DCQuants() const { return dc_quant_; } + + JXL_INLINE float InvDCQuant(size_t c) const { return inv_dc_quant_[c]; } + + // For encoder. + void SetEncodings(const std::vector& encodings) { + encodings_ = encodings; + } + + // For encoder. 
+ void SetDCQuant(const float dc[3]) { + for (size_t c = 0; c < 3; c++) { + dc_quant_[c] = 1.0f / dc[c]; + inv_dc_quant_[c] = dc[c]; + } + } + + Status Decode(BitReader* br, + ModularFrameDecoder* modular_frame_decoder = nullptr); + Status DecodeDC(BitReader* br); + + const std::vector& encodings() const { return encodings_; } + + static constexpr size_t required_size_x[] = {1, 1, 1, 1, 2, 4, 1, 1, 2, + 1, 1, 8, 4, 16, 8, 32, 16}; + static_assert(kNum == sizeof(required_size_x) / sizeof(*required_size_x), + "Update this array when adding or removing quant tables."); + + static constexpr size_t required_size_y[] = {1, 1, 1, 1, 2, 4, 2, 4, 4, + 1, 1, 8, 8, 16, 16, 32, 32}; + static_assert(kNum == sizeof(required_size_y) / sizeof(*required_size_y), + "Update this array when adding or removing quant tables."); + + private: + Status Compute(); + + static constexpr size_t required_size_[] = { + 1, 1, 1, 1, 4, 16, 2, 4, 8, 1, 1, 64, 32, 256, 128, 1024, 512}; + static_assert(kNum == sizeof(required_size_) / sizeof(*required_size_), + "Update this array when adding or removing quant tables."); + static constexpr size_t kTotalTableSize = + ArraySum(required_size_) * kDCTBlockSize * 3; + + // kTotalTableSize entries followed by kTotalTableSize for inv_table + hwy::AlignedFreeUniquePtr table_storage_; + const float* table_; + const float* inv_table_; + float dc_quant_[3] = {kDCQuant[0], kDCQuant[1], kDCQuant[2]}; + float inv_dc_quant_[3] = {kInvDCQuant[0], kInvDCQuant[1], kInvDCQuant[2]}; + size_t table_offsets_[AcStrategy::kNumValidStrategies * 3]; + std::vector encodings_; +}; + +} // namespace jxl + +#endif // LIB_JXL_QUANT_WEIGHTS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights_test.cc new file mode 100644 index 0000000000..2392c74cc6 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quant_weights_test.cc @@ -0,0 +1,240 @@ +// Copyright (c) the JPEG XL Project Authors. 
All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +#include "lib/jxl/quant_weights.h" + +#include + +#include +#include +#include // HWY_ALIGN_MAX +#include +#include +#include + +#include "lib/jxl/dct_for_test.h" +#include "lib/jxl/dec_transforms_testonly.h" +#include "lib/jxl/enc_modular.h" +#include "lib/jxl/enc_quant_weights.h" +#include "lib/jxl/enc_transforms.h" + +namespace jxl { +namespace { + +template +void CheckSimilar(T a, T b) { + EXPECT_EQ(a, b); +} +// minimum exponent = -15. +template <> +void CheckSimilar(float a, float b) { + float m = std::max(std::abs(a), std::abs(b)); + // 10 bits of precision are used in the format. Relative error should be + // below 2^-10. + EXPECT_LE(std::abs(a - b), m / 1024.0f) << "a: " << a << " b: " << b; +} + +TEST(QuantWeightsTest, DC) { + DequantMatrices mat; + float dc_quant[3] = {1e+5, 1e+3, 1e+1}; + DequantMatricesSetCustomDC(&mat, dc_quant); + for (size_t c = 0; c < 3; c++) { + CheckSimilar(mat.InvDCQuant(c), dc_quant[c]); + } +} + +void RoundtripMatrices(const std::vector& encodings) { + ASSERT_TRUE(encodings.size() == DequantMatrices::kNum); + DequantMatrices mat; + CodecMetadata metadata; + FrameHeader frame_header(&metadata); + ModularFrameEncoder encoder(frame_header, CompressParams{}); + DequantMatricesSetCustom(&mat, encodings, &encoder); + const std::vector& encodings_dec = mat.encodings(); + for (size_t i = 0; i < encodings.size(); i++) { + const QuantEncoding& e = encodings[i]; + const QuantEncoding& d = encodings_dec[i]; + // Check values roundtripped correctly. 
+ EXPECT_EQ(e.mode, d.mode); + EXPECT_EQ(e.predefined, d.predefined); + EXPECT_EQ(e.source, d.source); + + EXPECT_EQ(static_cast(e.dct_params.num_distance_bands), + static_cast(d.dct_params.num_distance_bands)); + for (size_t c = 0; c < 3; c++) { + for (size_t j = 0; j < DctQuantWeightParams::kMaxDistanceBands; j++) { + CheckSimilar(e.dct_params.distance_bands[c][j], + d.dct_params.distance_bands[c][j]); + } + } + + if (e.mode == QuantEncoding::kQuantModeRAW) { + EXPECT_FALSE(!e.qraw.qtable); + EXPECT_FALSE(!d.qraw.qtable); + EXPECT_EQ(e.qraw.qtable->size(), d.qraw.qtable->size()); + for (size_t j = 0; j < e.qraw.qtable->size(); j++) { + EXPECT_EQ((*e.qraw.qtable)[j], (*d.qraw.qtable)[j]); + } + EXPECT_NEAR(e.qraw.qtable_den, d.qraw.qtable_den, 1e-7f); + } else { + // modes different than kQuantModeRAW use one of the other fields used + // here, which all happen to be arrays of floats. + for (size_t c = 0; c < 3; c++) { + for (size_t j = 0; j < 3; j++) { + CheckSimilar(e.idweights[c][j], d.idweights[c][j]); + } + for (size_t j = 0; j < 6; j++) { + CheckSimilar(e.dct2weights[c][j], d.dct2weights[c][j]); + } + for (size_t j = 0; j < 2; j++) { + CheckSimilar(e.dct4multipliers[c][j], d.dct4multipliers[c][j]); + } + CheckSimilar(e.dct4x8multipliers[c], d.dct4x8multipliers[c]); + for (size_t j = 0; j < 9; j++) { + CheckSimilar(e.afv_weights[c][j], d.afv_weights[c][j]); + } + for (size_t j = 0; j < DctQuantWeightParams::kMaxDistanceBands; j++) { + CheckSimilar(e.dct_params_afv_4x4.distance_bands[c][j], + d.dct_params_afv_4x4.distance_bands[c][j]); + } + } + } + } +} + +TEST(QuantWeightsTest, AllDefault) { + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::Library(0)); + RoundtripMatrices(encodings); +} + +void TestSingleQuantMatrix(DequantMatrices::QuantTable kind) { + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::Library(0)); + encodings[kind] = DequantMatrices::Library()[kind]; + RoundtripMatrices(encodings); +} + +// Ensure we can 
reasonably represent default quant tables. +TEST(QuantWeightsTest, DCT) { TestSingleQuantMatrix(DequantMatrices::DCT); } +TEST(QuantWeightsTest, IDENTITY) { + TestSingleQuantMatrix(DequantMatrices::IDENTITY); +} +TEST(QuantWeightsTest, DCT2X2) { + TestSingleQuantMatrix(DequantMatrices::DCT2X2); +} +TEST(QuantWeightsTest, DCT4X4) { + TestSingleQuantMatrix(DequantMatrices::DCT4X4); +} +TEST(QuantWeightsTest, DCT16X16) { + TestSingleQuantMatrix(DequantMatrices::DCT16X16); +} +TEST(QuantWeightsTest, DCT32X32) { + TestSingleQuantMatrix(DequantMatrices::DCT32X32); +} +TEST(QuantWeightsTest, DCT8X16) { + TestSingleQuantMatrix(DequantMatrices::DCT8X16); +} +TEST(QuantWeightsTest, DCT8X32) { + TestSingleQuantMatrix(DequantMatrices::DCT8X32); +} +TEST(QuantWeightsTest, DCT16X32) { + TestSingleQuantMatrix(DequantMatrices::DCT16X32); +} +TEST(QuantWeightsTest, DCT4X8) { + TestSingleQuantMatrix(DequantMatrices::DCT4X8); +} +TEST(QuantWeightsTest, AFV0) { TestSingleQuantMatrix(DequantMatrices::AFV0); } +TEST(QuantWeightsTest, RAW) { + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::Library(0)); + std::vector matrix(3 * 32 * 32); + std::mt19937 rng; + std::uniform_int_distribution dist(1, 255); + for (size_t i = 0; i < matrix.size(); i++) matrix[i] = dist(rng); + encodings[DequantMatrices::kQuantTable[AcStrategy::DCT32X32]] = + QuantEncoding::RAW(matrix, 2); + RoundtripMatrices(encodings); +} + +class QuantWeightsTargetTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(QuantWeightsTargetTest); + +TEST_P(QuantWeightsTargetTest, DCTUniform) { + constexpr float kUniformQuant = 4; + float weights[3][2] = {{1.0f / kUniformQuant, 0}, + {1.0f / kUniformQuant, 0}, + {1.0f / kUniformQuant, 0}}; + DctQuantWeightParams dct_params(weights); + std::vector encodings(DequantMatrices::kNum, + QuantEncoding::DCT(dct_params)); + DequantMatrices dequant_matrices; + CodecMetadata metadata; + FrameHeader frame_header(&metadata); + ModularFrameEncoder 
encoder(frame_header, CompressParams{}); + DequantMatricesSetCustom(&dequant_matrices, encodings, &encoder); + + const float dc_quant[3] = {1.0f / kUniformQuant, 1.0f / kUniformQuant, + 1.0f / kUniformQuant}; + DequantMatricesSetCustomDC(&dequant_matrices, dc_quant); + + HWY_ALIGN_MAX float scratch_space[16 * 16 * 2]; + + // DCT8 + { + HWY_ALIGN_MAX float pixels[64]; + std::iota(std::begin(pixels), std::end(pixels), 0); + HWY_ALIGN_MAX float coeffs[64]; + const AcStrategy::Type dct = AcStrategy::DCT; + TransformFromPixels(dct, pixels, 8, coeffs, scratch_space); + HWY_ALIGN_MAX double slow_coeffs[64]; + for (size_t i = 0; i < 64; i++) slow_coeffs[i] = pixels[i]; + DCTSlow<8>(slow_coeffs); + + for (size_t i = 0; i < 64; i++) { + // DCTSlow doesn't multiply/divide by 1/N, so we do it manually. + slow_coeffs[i] = roundf(slow_coeffs[i] / kUniformQuant) * kUniformQuant; + coeffs[i] = roundf(coeffs[i] / dequant_matrices.Matrix(dct, 0)[i]) * + dequant_matrices.Matrix(dct, 0)[i]; + } + IDCTSlow<8>(slow_coeffs); + TransformToPixels(dct, coeffs, pixels, 8, scratch_space); + for (size_t i = 0; i < 64; i++) { + EXPECT_NEAR(pixels[i], slow_coeffs[i], 1e-4); + } + } + + // DCT16 + { + HWY_ALIGN_MAX float pixels[64 * 4]; + std::iota(std::begin(pixels), std::end(pixels), 0); + HWY_ALIGN_MAX float coeffs[64 * 4]; + const AcStrategy::Type dct = AcStrategy::DCT16X16; + TransformFromPixels(dct, pixels, 16, coeffs, scratch_space); + HWY_ALIGN_MAX double slow_coeffs[64 * 4]; + for (size_t i = 0; i < 64 * 4; i++) slow_coeffs[i] = pixels[i]; + DCTSlow<16>(slow_coeffs); + + for (size_t i = 0; i < 64 * 4; i++) { + slow_coeffs[i] = roundf(slow_coeffs[i] / kUniformQuant) * kUniformQuant; + coeffs[i] = roundf(coeffs[i] / dequant_matrices.Matrix(dct, 0)[i]) * + dequant_matrices.Matrix(dct, 0)[i]; + } + + IDCTSlow<16>(slow_coeffs); + TransformToPixels(dct, coeffs, pixels, 16, scratch_space); + for (size_t i = 0; i < 64 * 4; i++) { + EXPECT_NEAR(pixels[i], slow_coeffs[i], 1e-4); + } + } + + // 
Check that all matrices have the same DC quantization, i.e. that they all + // have the same scaling. + for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) { + EXPECT_NEAR(dequant_matrices.Matrix(i, 0)[0], kUniformQuant, 1e-6); + } +} + +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer-inl.h new file mode 100644 index 0000000000..2627148dc2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer-inl.h @@ -0,0 +1,73 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#if defined(LIB_JXL_QUANTIZER_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_QUANTIZER_INL_H_ +#undef LIB_JXL_QUANTIZER_INL_H_ +#else +#define LIB_JXL_QUANTIZER_INL_H_ +#endif + +#include + +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::Vec; + +template +HWY_INLINE HWY_MAYBE_UNUSED Vec> AdjustQuantBias( + DI di, const size_t c, const Vec quant_i, + const float* HWY_RESTRICT biases) { + const Rebind df; + +#if JXL_HIGH_PRECISION + const auto quant = ConvertTo(df, quant_i); + + // Compare |quant|, keep sign bit for negating result. + const auto kSign = BitCast(df, Set(di, INT32_MIN)); + const auto sign = And(quant, kSign); // TODO(janwas): = abs ^ orig + const auto abs_quant = AndNot(kSign, quant); + + // If |x| is 1, kZeroBias creates a different bias for each channel. + // We're implementing the following: + // if (quant == 0) return 0; + // if (quant == 1) return biases[c]; + // if (quant == -1) return -biases[c]; + // return quant - biases[3] / quant; + + // Integer comparison is not helpful because Clang incurs bypass penalties + // from unnecessarily mixing integer and float. 
+ const auto is_01 = abs_quant < Set(df, 1.125f); + const auto not_0 = abs_quant > Zero(df); + + // Bitwise logic is faster than quant * biases[c]. + const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign)); + + // About 2E-5 worse than ReciprocalNR or division. + const auto bias = + NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant); + + return IfThenElse(is_01, one_bias, bias); +#else + auto sign = IfThenElseZero(quant_i < Zero(di), Set(di, INT32_MIN)); + return BitCast(df, IfThenElse(Abs(quant_i) == Set(di, 1), + sign | BitCast(di, Set(df, biases[c])), + BitCast(di, ConvertTo(df, quant_i)))); +#endif +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_QUANTIZER_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc new file mode 100644 index 0000000000..2a7480f175 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.cc @@ -0,0 +1,146 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/quantizer.h" + +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/robust_statistics.h" +#include "lib/jxl/field_encodings.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/quant_weights.h" + +namespace jxl { + +static const int kDefaultQuant = 64; + +constexpr int Quantizer::kQuantMax; + +Quantizer::Quantizer(const DequantMatrices* dequant) + : Quantizer(dequant, kDefaultQuant, kGlobalScaleDenom / kDefaultQuant) {} + +Quantizer::Quantizer(const DequantMatrices* dequant, int quant_dc, + int global_scale) + : global_scale_(global_scale), quant_dc_(quant_dc), dequant_(dequant) { + JXL_ASSERT(dequant_ != nullptr); + RecomputeFromGlobalScale(); + inv_quant_dc_ = inv_global_scale_ / quant_dc_; + + memcpy(zero_bias_, kZeroBiasDefault, sizeof(kZeroBiasDefault)); +} + +void Quantizer::ComputeGlobalScaleAndQuant(float quant_dc, float quant_median, + float quant_median_absd) { + // Target value for the median value in the quant field. + const float kQuantFieldTarget = 3.80987740592518214386f; + // We reduce the median of the quant field by the median absolute deviation: + // higher resolution on highly varying quant fields. + float scale = kGlobalScaleDenom * (quant_median - quant_median_absd) / + kQuantFieldTarget; + // Ensure that new_global_scale is positive and no more than 1<<15. + if (scale < 1) scale = 1; + if (scale > (1 << 15)) scale = 1 << 15; + int new_global_scale = static_cast(scale); + // Ensure that quant_dc_ will always be at least + // kGlobalScaleDenom/kGlobalScaleNumerator. + const int scaled_quant_dc = + static_cast(quant_dc * kGlobalScaleNumerator); + if (new_global_scale > scaled_quant_dc) { + new_global_scale = scaled_quant_dc; + if (new_global_scale <= 0) new_global_scale = 1; + } + global_scale_ = new_global_scale; + // Code below uses inv_global_scale_. 
+ RecomputeFromGlobalScale(); + + float fval = quant_dc * inv_global_scale_ + 0.5f; + fval = std::min(1 << 16, fval); + const int new_quant_dc = static_cast(fval); + quant_dc_ = new_quant_dc; + + // quant_dc_ was updated, recompute values. + RecomputeFromGlobalScale(); +} + +void Quantizer::SetQuantFieldRect(const ImageF& qf, const Rect& rect, + ImageI* JXL_RESTRICT raw_quant_field) { + for (size_t y = 0; y < rect.ysize(); ++y) { + const float* JXL_RESTRICT row_qf = rect.ConstRow(qf, y); + int32_t* JXL_RESTRICT row_qi = rect.Row(raw_quant_field, y); + for (size_t x = 0; x < rect.xsize(); ++x) { + int val = ClampVal(row_qf[x] * inv_global_scale_ + 0.5f); + row_qi[x] = val; + } + } +} + +void Quantizer::SetQuantField(const float quant_dc, const ImageF& qf, + ImageI* JXL_RESTRICT raw_quant_field) { + JXL_CHECK(SameSize(*raw_quant_field, qf)); + std::vector data(qf.xsize() * qf.ysize()); + for (size_t y = 0; y < qf.ysize(); ++y) { + const float* JXL_RESTRICT row_qf = qf.Row(y); + for (size_t x = 0; x < qf.xsize(); ++x) { + float quant = row_qf[x]; + data[qf.xsize() * y + x] = quant; + } + } + const float quant_median = Median(&data); + const float quant_median_absd = MedianAbsoluteDeviation(data, quant_median); + ComputeGlobalScaleAndQuant(quant_dc, quant_median, quant_median_absd); + SetQuantFieldRect(qf, Rect(qf), raw_quant_field); +} + +void Quantizer::SetQuant(float quant_dc, float quant_ac, + ImageI* JXL_RESTRICT raw_quant_field) { + ComputeGlobalScaleAndQuant(quant_dc, quant_ac, 0); + int val = ClampVal(quant_ac * inv_global_scale_ + 0.5f); + FillImage(val, raw_quant_field); +} + +Status QuantizerParams::VisitFields(Visitor* JXL_RESTRICT visitor) { + JXL_QUIET_RETURN_IF_ERROR(visitor->U32( + BitsOffset(11, 1), BitsOffset(11, 2049), BitsOffset(12, 4097), + BitsOffset(16, 8193), 1, &global_scale)); + JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), BitsOffset(5, 1), + BitsOffset(8, 1), BitsOffset(16, 1), 1, + &quant_dc)); + return true; +} + +Status 
Quantizer::Encode(BitWriter* writer, size_t layer, + AuxOut* aux_out) const { + QuantizerParams params; + params.global_scale = global_scale_; + params.quant_dc = quant_dc_; + return Bundle::Write(params, writer, layer, aux_out); +} + +Status Quantizer::Decode(BitReader* reader) { + QuantizerParams params; + JXL_RETURN_IF_ERROR(Bundle::Read(reader, ¶ms)); + global_scale_ = static_cast(params.global_scale); + quant_dc_ = static_cast(params.quant_dc); + RecomputeFromGlobalScale(); + return true; +} + +void Quantizer::DumpQuantizationMap(const ImageI& raw_quant_field) const { + printf("Global scale: %d (%.7f)\nDC quant: %d\n", global_scale_, + global_scale_ * 1.0 / kGlobalScaleDenom, quant_dc_); + printf("AC quantization Map:\n"); + for (size_t y = 0; y < raw_quant_field.ysize(); ++y) { + for (size_t x = 0; x < raw_quant_field.xsize(); ++x) { + printf(" %3d", raw_quant_field.Row(y)[x]); + } + printf("\n"); + } +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.h new file mode 100644 index 0000000000..f2da45f1c4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer.h @@ -0,0 +1,178 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_QUANTIZER_H_ +#define LIB_JXL_QUANTIZER_H_ + +#include +#include +#include + +#include +#include +#include +#include + +#include "lib/jxl/ac_strategy.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/bits.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/profiler.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_util.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/enc_bit_writer.h" +#include "lib/jxl/fields.h" +#include "lib/jxl/image.h" +#include "lib/jxl/linalg.h" +#include "lib/jxl/quant_weights.h" + +// Quantizes DC and AC coefficients, with separate quantization tables according +// to the quant_kind (which is currently computed from the AC strategy and the +// block index inside that strategy). + +namespace jxl { + +static constexpr int kGlobalScaleDenom = 1 << 16; +static constexpr int kGlobalScaleNumerator = 4096; + +// zero-biases for quantizing channels X, Y, B +static constexpr float kZeroBiasDefault[3] = {0.5f, 0.5f, 0.5f}; + +// Returns adjusted version of a quantized integer, such that its value is +// closer to the expected value of the original. +// The residuals of AC coefficients that we quantize are not uniformly +// distributed. Numerical experiments show that they have a distribution with +// the "shape" of 1/(1+x^2) [up to some coefficients]. This means that the +// expected value of a coefficient that gets quantized to x will not be x +// itself, but (at least with reasonable approximation): +// - 0 if x is 0 +// - x * biases[c] if x is 1 or -1 +// - x - biases[3]/x otherwise +// This follows from computing the distribution of the quantization bias, which +// can be approximated fairly well by biases[3]/x when |x| is at least two. 
+static constexpr float kBiasNumerator = 0.145f; + +static constexpr float kDefaultQuantBias[4] = { + 1.0f - 0.05465007330715401f, + 1.0f - 0.07005449891748593f, + 1.0f - 0.049935103337343655f, + 0.145f, +}; + +class Quantizer { + public: + explicit Quantizer(const DequantMatrices* dequant); + Quantizer(const DequantMatrices* dequant, int quant_dc, int global_scale); + + static constexpr int kQuantMax = 256; + + static JXL_INLINE int ClampVal(float val) { + return static_cast(std::max(1.0f, std::min(val, kQuantMax))); + } + + // Recomputes other derived fields after global_scale_ has changed. + void RecomputeFromGlobalScale() { + global_scale_float_ = global_scale_ * (1.0 / kGlobalScaleDenom); + inv_global_scale_ = 1.0 * kGlobalScaleDenom / global_scale_; + inv_quant_dc_ = inv_global_scale_ / quant_dc_; + for (size_t c = 0; c < 3; c++) { + mul_dc_[c] = GetDcStep(c); + inv_mul_dc_[c] = GetInvDcStep(c); + } + } + + // Returns scaling factor such that Scale() * (RawDC() or RawQuantField()) + // pixels yields the same float values returned by GetQuantField. + JXL_INLINE float Scale() const { return global_scale_float_; } + + // Reciprocal of Scale(). + JXL_INLINE float InvGlobalScale() const { return inv_global_scale_; } + + void SetQuantFieldRect(const ImageF& qf, const Rect& rect, + ImageI* JXL_RESTRICT raw_quant_field); + + void SetQuantField(float quant_dc, const ImageF& qf, + ImageI* JXL_RESTRICT raw_quant_field); + + void SetQuant(float quant_dc, float quant_ac, + ImageI* JXL_RESTRICT raw_quant_field); + + // Returns the DC quantization base value, which is currently global (not + // adaptive). The actual scale factor used to dequantize pixels in channel c + // is: inv_quant_dc() * dequant_->DCQuant(c). + float inv_quant_dc() const { return inv_quant_dc_; } + + // Dequantize by multiplying with this times dequant_matrix. 
+ float inv_quant_ac(int32_t quant) const { return inv_global_scale_ / quant; } + + Status Encode(BitWriter* writer, size_t layer, AuxOut* aux_out) const; + + Status Decode(BitReader* reader); + + void DumpQuantizationMap(const ImageI& raw_quant_field) const; + + JXL_INLINE const float* DequantMatrix(size_t quant_kind, size_t c) const { + return dequant_->Matrix(quant_kind, c); + } + + JXL_INLINE const float* InvDequantMatrix(size_t quant_kind, size_t c) const { + return dequant_->InvMatrix(quant_kind, c); + } + + JXL_INLINE size_t DequantMatrixOffset(size_t quant_kind, size_t c) const { + return dequant_->MatrixOffset(quant_kind, c); + } + + // Calculates DC quantization step. + JXL_INLINE float GetDcStep(size_t c) const { + return inv_quant_dc_ * dequant_->DCQuant(c); + } + JXL_INLINE float GetInvDcStep(size_t c) const { + return dequant_->InvDCQuant(c) * (global_scale_float_ * quant_dc_); + } + + JXL_INLINE const float* MulDC() const { return mul_dc_; } + JXL_INLINE const float* InvMulDC() const { return inv_mul_dc_; } + + JXL_INLINE void ClearDCMul() { + std::fill(mul_dc_, mul_dc_ + 4, 1); + std::fill(inv_mul_dc_, inv_mul_dc_ + 4, 1); + } + + void ComputeGlobalScaleAndQuant(float quant_dc, float quant_median, + float quant_median_absd); + + private: + float mul_dc_[4]; + float inv_mul_dc_[4]; + + // These are serialized: + int global_scale_; + int quant_dc_; + + // These are derived from global_scale_: + float inv_global_scale_; + float global_scale_float_; // reciprocal of inv_global_scale_ + float inv_quant_dc_; + + float zero_bias_[3]; + const DequantMatrices* dequant_; +}; + +struct QuantizerParams : public Fields { + QuantizerParams() { Bundle::Init(this); } + const char* Name() const override { return "QuantizerParams"; } + + Status VisitFields(Visitor* JXL_RESTRICT visitor) override; + + uint32_t global_scale; + uint32_t quant_dc; +}; + +} // namespace jxl + +#endif // LIB_JXL_QUANTIZER_H_ diff --git 
a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer_test.cc new file mode 100644 index 0000000000..052e138fe3 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/quantizer_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/quantizer.h" + +#include + +#include "gtest/gtest.h" +#include "lib/jxl/base/span.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/image_ops.h" +#include "lib/jxl/image_test_utils.h" + +namespace jxl { +namespace { + +void TestEquivalence(int qxsize, int qysize, const Quantizer& quantizer1, + const Quantizer& quantizer2) { + ASSERT_NEAR(quantizer1.inv_quant_dc(), quantizer2.inv_quant_dc(), 1e-7); +} + +TEST(QuantizerTest, QuantizerParams) { + for (uint32_t i = 1; i < 10000; ++i) { + QuantizerParams p; + p.global_scale = i; + size_t extension_bits = 0, total_bits = 0; + EXPECT_TRUE(Bundle::CanEncode(p, &extension_bits, &total_bits)); + EXPECT_EQ(0, extension_bits); + EXPECT_GE(total_bits, 4); + } +} + +TEST(QuantizerTest, BitStreamRoundtripSameQuant) { + const int qxsize = 8; + const int qysize = 8; + DequantMatrices dequant; + Quantizer quantizer1(&dequant); + ImageI raw_quant_field(qxsize, qysize); + quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field); + BitWriter writer; + EXPECT_TRUE(quantizer1.Encode(&writer, 0, nullptr)); + writer.ZeroPadToByte(); + const size_t bits_written = writer.BitsWritten(); + Quantizer quantizer2(&dequant, qxsize, qysize); + BitReader reader(writer.GetSpan()); + EXPECT_TRUE(quantizer2.Decode(&reader)); + EXPECT_TRUE(reader.JumpToByteBoundary()); + EXPECT_EQ(reader.TotalBitsConsumed(), bits_written); + EXPECT_TRUE(reader.Close()); + TestEquivalence(qxsize, qysize, quantizer1, quantizer2); +} + +TEST(QuantizerTest, 
BitStreamRoundtripRandomQuant) { + const int qxsize = 8; + const int qysize = 8; + DequantMatrices dequant; + Quantizer quantizer1(&dequant); + ImageI raw_quant_field(qxsize, qysize); + quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field); + std::mt19937_64 rng; + std::uniform_int_distribution<> uniform(1, 256); + float quant_dc = 0.17f; + ImageF qf(qxsize, qysize); + RandomFillImage(&qf, 1.0f); + quantizer1.SetQuantField(quant_dc, qf, &raw_quant_field); + BitWriter writer; + EXPECT_TRUE(quantizer1.Encode(&writer, 0, nullptr)); + writer.ZeroPadToByte(); + const size_t bits_written = writer.BitsWritten(); + Quantizer quantizer2(&dequant, qxsize, qysize); + BitReader reader(writer.GetSpan()); + EXPECT_TRUE(quantizer2.Decode(&reader)); + EXPECT_TRUE(reader.JumpToByteBoundary()); + EXPECT_EQ(reader.TotalBitsConsumed(), bits_written); + EXPECT_TRUE(reader.Close()); + TestEquivalence(qxsize, qysize, quantizer1, quantizer2); +} +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/rational_polynomial-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/rational_polynomial-inl.h new file mode 100644 index 0000000000..87bddd1bb2 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/rational_polynomial-inl.h @@ -0,0 +1,94 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fast SIMD evaluation of rational polynomials for approximating functions. + +#if defined(LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_ +#undef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_ +#else +#define LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_ +#endif + +#include + +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// Primary template: default to actual division. 
+template +struct FastDivision { + HWY_INLINE V operator()(const V n, const V d) const { return n / d; } +}; +// Partial specialization for float vectors. +template +struct FastDivision { + // One Newton-Raphson iteration. + static HWY_INLINE V ReciprocalNR(const V x) { + const auto rcp = ApproximateReciprocal(x); + const auto sum = rcp + rcp; + const auto x_rcp = x * rcp; + return NegMulAdd(x_rcp, rcp, sum); + } + + V operator()(const V n, const V d) const { +#if 1 // Faster on SKX + return n / d; +#else + return n * ReciprocalNR(d); +#endif + } +}; + +// Approximates smooth functions via rational polynomials (i.e. dividing two +// polynomials). Evaluates polynomials via Horner's scheme, which is faster than +// Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to +// specify constants (replicated 4x) independently of the lane count. +template +HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x, + const T (&p)[NP], + const T (&q)[NQ]) { + constexpr size_t kDegP = NP / 4 - 1; + constexpr size_t kDegQ = NQ / 4 - 1; + auto yp = LoadDup128(d, &p[kDegP * 4]); + auto yq = LoadDup128(d, &q[kDegQ * 4]); + // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a + // compiler warning that the index is out of bounds since we are already + // checking that it is not out of bounds with (kDegP >= n) and the access + // will be optimized away. Similarly with q and kDegQ. 
+ HWY_FENCE; + if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4))); + if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4))); + HWY_FENCE; + if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4))); + if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4))); + HWY_FENCE; + if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4))); + if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4))); + HWY_FENCE; + if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4))); + if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4))); + HWY_FENCE; + if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4))); + if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4))); + HWY_FENCE; + if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4))); + if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4))); + HWY_FENCE; + if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4))); + if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4))); + + return FastDivision()(yp, yq); +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); +#endif // LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/rational_polynomial_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/rational_polynomial_test.cc new file mode 100644 index 0000000000..699afd076e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/rational_polynomial_test.cc @@ -0,0 +1,239 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/rational_polynomial_test.cc" +#include +#include +#include + +#include "lib/jxl/base/descriptive_statistics.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/common.h" +#include "lib/jxl/rational_polynomial-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +using T = float; // required by EvalLog2 +using D = HWY_FULL(T); + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; + +// Generic: only computes polynomial +struct EvalPoly { + template + T operator()(T x, const T (&p)[NP], const T (&q)[NQ]) const { + const HWY_FULL(T) d; + const auto vx = Set(d, x); + const auto approx = EvalRationalPolynomial(d, vx, p, q); + return GetLane(approx); + } +}; + +// Range reduction for log2 +struct EvalLog2 { + template + T operator()(T x, const T (&p)[NP], const T (&q)[NQ]) const { + const HWY_FULL(T) d; + auto vx = Set(d, x); + + const HWY_FULL(int32_t) di; + const auto x_bits = BitCast(di, vx); + // Cannot handle negative numbers / NaN. + JXL_DASSERT(AllTrue(Abs(x_bits) == x_bits)); + + // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops + const auto exp_bits = x_bits - Set(di, 0x3f2aaaab); // = 2/3 + // Shifted exponent = log2; also used to clear mantissa. 
+ const auto exp_shifted = ShiftRight<23>(exp_bits); + const auto mantissa = BitCast(d, x_bits - ShiftLeft<23>(exp_shifted)); + const auto exp_val = ConvertTo(d, exp_shifted); + vx = mantissa - Set(d, 1.0f); + + const auto approx = EvalRationalPolynomial(d, vx, p, q) + exp_val; + return GetLane(approx); + } +}; + +// Functions to approximate: + +T LinearToSrgb8Direct(T val) { + if (val < 0.0) return 0.0; + if (val >= 255.0) return 255.0; + if (val <= 10.0 / 12.92) return val * 12.92; + return 255.0 * (std::pow(val / 255.0, 1.0 / 2.4) * 1.055 - 0.055); +} + +T SimpleGamma(T v) { + static const T kGamma = 0.387494322593; + static const T limit = 43.01745241042018; + T bright = v - limit; + if (bright >= 0) { + static const T mul = 0.0383723643799; + v -= bright * mul; + } + static const T limit2 = 94.68634353321337; + T bright2 = v - limit2; + if (bright2 >= 0) { + static const T mul = 0.22885405968; + v -= bright2 * mul; + } + static const T offset = 0.156775786057; + static const T scale = 8.898059160493739; + T retval = scale * (offset + pow(v, kGamma)); + return retval; +} + +// Runs CaratheodoryFejer and verifies the polynomial using a lot of samples to +// return the biggest error. 
+template +T RunApproximation(T x0, T x1, const T (&p)[NP], const T (&q)[NQ], + const Eval& eval, T func_to_approx(T)) { + Stats err; + + T lastPrint = 0; + // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) + for (T x = x0; x <= x1; x += (x1 - x0) / 10000.0) { + const T f = func_to_approx(x); + const T g = eval(x, p, q); + err.Notify(fabs(g - f)); + if (x == x0 || x - lastPrint > (x1 - x0) / 20.0) { + printf("x: %11.6f, f: %11.6f, g: %11.6f, e: %11.6f\n", x, f, g, + fabs(g - f)); + lastPrint = x; + } + } + printf("%s\n", err.ToString().c_str()); + + return err.Max(); +} + +void TestSimpleGamma() { + const T p[4 * (6 + 1)] = { + HWY_REP4(-5.0646949363741811E-05), HWY_REP4(6.7369380528439771E-05), + HWY_REP4(8.9376652530412794E-05), HWY_REP4(2.1153513301520462E-06), + HWY_REP4(-6.9130322970386449E-08), HWY_REP4(3.9424752749293728E-10), + HWY_REP4(1.2360288207619576E-13)}; + + const T q[4 * (6 + 1)] = { + HWY_REP4(-6.6389733798591366E-06), HWY_REP4(1.3299859726565908E-05), + HWY_REP4(3.8538748358398873E-06), HWY_REP4(-2.8707687262928236E-08), + HWY_REP4(-6.6897385800005434E-10), HWY_REP4(6.1428748869186003E-12), + HWY_REP4(-2.5475738169252870E-15)}; + + const T err = RunApproximation(0.77, 274.579999999999984, p, q, EvalPoly(), + SimpleGamma); + EXPECT_LT(err, 0.05); +} + +void TestLinearToSrgb8Direct() { + const T p[4 * (5 + 1)] = { + HWY_REP4(-9.5357499040105154E-05), HWY_REP4(4.6761186249798248E-04), + HWY_REP4(2.5708174333943594E-04), HWY_REP4(1.5250087770436082E-05), + HWY_REP4(1.1946768008931187E-07), HWY_REP4(5.9916446295972850E-11)}; + + const T q[4 * (4 + 1)] = { + HWY_REP4(1.8932479758079768E-05), HWY_REP4(2.7312342474687321E-05), + HWY_REP4(4.3901204783327006E-06), HWY_REP4(1.0417787306920273E-07), + HWY_REP4(3.0084206762140419E-10)}; + + const T err = + RunApproximation(0.77, 255, p, q, EvalPoly(), LinearToSrgb8Direct); + EXPECT_LT(err, 0.05); +} + +void TestExp() { + const T p[4 * (2 + 1)] = {HWY_REP4(9.6266879665530902E-01), + 
HWY_REP4(4.8961265681586763E-01), + HWY_REP4(8.2619259189548433E-02)}; + const T q[4 * (2 + 1)] = {HWY_REP4(9.6259895571622622E-01), + HWY_REP4(-4.7272457588933831E-01), + HWY_REP4(7.4802088567547664E-02)}; + const T err = + RunApproximation(-1, 1, p, q, EvalPoly(), [](T x) { return T(exp(x)); }); + EXPECT_LT(err, 1E-4); +} + +void TestNegExp() { + // 4,3 is the min required for monotonicity; max error in 0,10: 751 ppm + // no benefit for k>50. + const T p[4 * (4 + 1)] = { + HWY_REP4(5.9580258551150123E-02), HWY_REP4(-2.5073728806886408E-02), + HWY_REP4(4.1561830213689248E-03), HWY_REP4(-3.1815408488900372E-04), + HWY_REP4(9.3866690094906802E-06)}; + const T q[4 * (3 + 1)] = { + HWY_REP4(5.9579108238812878E-02), HWY_REP4(3.4542074345478582E-02), + HWY_REP4(8.7263562483501714E-03), HWY_REP4(1.4095109143061216E-03)}; + + const T err = + RunApproximation(0, 10, p, q, EvalPoly(), [](T x) { return T(exp(-x)); }); + EXPECT_LT(err, sizeof(T) == 8 ? 2E-5 : 3E-5); +} + +void TestSin() { + const T p[4 * (6 + 1)] = { + HWY_REP4(1.5518122109203780E-05), HWY_REP4(2.3388958643675966E+00), + HWY_REP4(-8.6705520940849157E-01), HWY_REP4(-1.9702294764873535E-01), + HWY_REP4(1.2193404314472320E-01), HWY_REP4(-1.7373966109788839E-02), + HWY_REP4(7.8829435883034796E-04)}; + const T q[4 * (5 + 1)] = { + HWY_REP4(2.3394371422557279E+00), HWY_REP4(-8.7028221081288615E-01), + HWY_REP4(2.0052872219658430E-01), HWY_REP4(-3.2460335995264836E-02), + HWY_REP4(3.1546157932479282E-03), HWY_REP4(-1.6692542019380155E-04)}; + + const T err = RunApproximation(0, Pi(1) * 2, p, q, EvalPoly(), + [](T x) { return T(sin(x)); }); + EXPECT_LT(err, sizeof(T) == 8 ? 
5E-4 : 7E-4); +} + +void TestLog() { + HWY_ALIGN const T p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06), + HWY_REP4(1.4287160470083755E+00), + HWY_REP4(7.4245873327820566E-01)}; + HWY_ALIGN const T q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01), + HWY_REP4(1.0096718572241148E+00), + HWY_REP4(1.7409343003366853E-01)}; + const T err = RunApproximation(1E-6, 1000, p, q, EvalLog2(), std::log2); + printf("%E\n", err); +} + +HWY_NOINLINE void TestRationalPolynomial() { + TestSimpleGamma(); + TestLinearToSrgb8Direct(); + TestExp(); + TestNegExp(); + TestSin(); + TestLog(); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class RationalPolynomialTest : public hwy::TestWithParamTarget {}; +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(RationalPolynomialTest); + +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestSimpleGamma); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestLinearToSrgb8Direct); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestExp); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestNegExp); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestSin); +HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestLog); + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/robust_statistics_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/robust_statistics_test.cc new file mode 100644 index 0000000000..22ee56abdb --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/robust_statistics_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/base/robust_statistics.h" + +#include + +#include // partial_sum +#include + +#include "gtest/gtest.h" +#include "lib/jxl/noise_distributions.h" + +namespace jxl { +namespace { + +TEST(RobustStatisticsTest, TestMode) { + // Enough to populate bins. We have to sort this many values. + constexpr size_t kReps = 15000; + constexpr size_t kBins = 101; + + std::mt19937 rng(65537); + + // Place Poisson mean at 1/10, 2/10 .. 9/10 of the bin range. + for (int frac = 1; frac < 10; ++frac) { + printf("===========================frac %d\n", frac); + + NoisePoisson noise(frac * kBins / 10); + std::vector values; + values.reserve(kReps); + + uint32_t bins[kBins] = {0}; + + std::uniform_real_distribution jitter(-1E-3f, 1E-3f); + for (size_t rep = 0; rep < kReps; ++rep) { + // Scale back to integer, add jitter to avoid too many repeated values. + const float poisson = noise(0.0f, &rng) * 1E3f + jitter(rng); + + values.push_back(poisson); + + const int idx_bin = static_cast(poisson); + if (idx_bin < static_cast(kBins)) { + bins[idx_bin] += 1; + } // else skip instead of clamping to avoid bias + } + + // // Print histogram + // for (const uint32_t b : bins) { + // printf("%u\n", b); + // } + + // (Smoothed) argmax and median for verification + float smoothed[kBins]; + smoothed[0] = bins[0]; + smoothed[kBins - 1] = bins[kBins - 1]; + for (size_t i = 1; i < kBins - 1; ++i) { + smoothed[i] = (2 * bins[i] + bins[i - 1] + bins[i + 1]) * 0.25f; + } + const float argmax = + std::max_element(smoothed, smoothed + kBins) - smoothed; + const float median = Median(&values); + + std::sort(values.begin(), values.end()); + const float hsm = HalfSampleMode()(values.data(), values.size()); + + uint32_t cdf[kBins]; + std::partial_sum(bins, bins + kBins, cdf); + const int hrm = HalfRangeMode()(cdf, kBins); + + const auto is_near = [](const float expected, const float actual) { + return std::abs(expected - actual) <= 1.0f + 1E-5f; + }; + EXPECT_TRUE(is_near(hsm, argmax) || 
is_near(hsm, median)); + EXPECT_TRUE(is_near(hrm, argmax) || is_near(hrm, median)); + + printf("hsm %.1f hrm %d argmax %.1f median %f\n", hsm, hrm, argmax, median); + const int center = static_cast(argmax); + printf("%d %d %d %d %d\n", bins[center - 2], bins[center - 1], bins[center], + bins[center + 1], bins[center + 2]); + } +} + +// Ensures Median3/5 return the same results as Median. +TEST(RobustStatisticsTest, TestMedian) { + std::vector v3(3), v5(5); + + std::uniform_real_distribution dist(-100.0f, 100.0f); + std::mt19937 rng(129); + +#ifdef NDEBUG + constexpr size_t kReps = 100000; +#else + constexpr size_t kReps = 100; +#endif + for (size_t i = 0; i < kReps; ++i) { + v3[0] = dist(rng); + v3[1] = dist(rng); + v3[2] = dist(rng); + for (size_t j = 0; j < 5; ++j) { + v5[j] = dist(rng); + } + + JXL_ASSERT(Median(&v3) == Median3(v3[0], v3[1], v3[2])); + JXL_ASSERT(Median(&v5) == Median5(v5[0], v5[1], v5[2], v5[3], v5[4])); + } +} + +template +void TestLine(const Noise& noise, float max_l1_limit, float mad_limit) { + std::vector points; + Line perfect(0.6f, 2.0f); + + // Random spacing of X (must be unique) + float x = -100.0f; + std::mt19937_64 rng(129); + std::uniform_real_distribution x_dist(1E-6f, 10.0f); + for (size_t ix = 0; ix < 500; ++ix) { + x += x_dist(rng); + const float y = noise(perfect(x), &rng); + points.emplace_back(x, y); + // printf("%f,%f\n", x, y); + } + + Line est(points); + float max_l1, mad; + EvaluateQuality(est, points, &max_l1, &mad); + printf("x %f slope=%.2f b=%.2f max_l1 %f mad %f\n", x, est.slope(), + est.intercept(), max_l1, mad); + + EXPECT_LE(max_l1, max_l1_limit); + EXPECT_LE(mad, mad_limit); +} + +TEST(RobustStatisticsTest, CleanLine) { + const NoiseNone noise; + TestLine(noise, 1E-6, 1E-7); +} +TEST(RobustStatisticsTest, Uniform) { + const NoiseUniform noise(-100.0f, 100.0f); + TestLine(noise, 107, 53); +} +TEST(RobustStatisticsTest, Gauss) { + const NoiseGaussian noise(10.0f); + TestLine(noise, 37, 7); +} + +} // namespace +} 
// namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/roundtrip_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/roundtrip_test.cc
new file mode 100644
index 0000000000..219619fd7f
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/roundtrip_test.cc
@@ -0,0 +1,573 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "gtest/gtest.h"
+#include "jxl/decode.h"
+#include "jxl/decode_cxx.h"
+#include "jxl/encode.h"
+#include "jxl/encode_cxx.h"
+#include "lib/extras/codec.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testdata.h"
+
+namespace {
+
+// Converts a test image to a CodecInOut.
+// icc_profile can be empty to automatically deduce profile from the pixel
+// format, or filled in to force this ICC profile. 1 or 2 channels are treated as gray, 2 or 4 channels as carrying alpha.
+jxl::CodecInOut ConvertTestImage(const std::vector& buf,
+                                 const size_t xsize, const size_t ysize,
+                                 const JxlPixelFormat& pixel_format,
+                                 const jxl::PaddedBytes& icc_profile) {
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+
+  bool is_gray =
+      pixel_format.num_channels == 1 || pixel_format.num_channels == 2;
+  bool has_alpha =
+      pixel_format.num_channels == 2 || pixel_format.num_channels == 4;
+
+  io.metadata.m.color_encoding.SetColorSpace(is_gray ? jxl::ColorSpace::kGray
+                                                     : jxl::ColorSpace::kRGB);
+  if (has_alpha) {
+    // Note: alpha > 16 not yet supported by the C++ codec
+    switch (pixel_format.data_type) {
+      case JXL_TYPE_UINT8:
+        io.metadata.m.SetAlphaBits(8);
+        break;
+      case JXL_TYPE_UINT16:
+      case JXL_TYPE_UINT32:
+      case JXL_TYPE_FLOAT:
+      case JXL_TYPE_FLOAT16:
+        io.metadata.m.SetAlphaBits(16);  // every non-8-bit type is declared with 16 alpha bits
+        break;
+      default:
+        EXPECT_TRUE(false) << "Roundtrip tests for data type "
+                           << pixel_format.data_type << " not yet implemented.";
+    }
+  }
+  size_t bitdepth = 0;
+  switch (pixel_format.data_type) {
+    case JXL_TYPE_FLOAT:
+      bitdepth = 32;
+      io.metadata.m.SetFloat32Samples();
+      break;
+    case JXL_TYPE_FLOAT16:
+      bitdepth = 16;
+      io.metadata.m.SetFloat16Samples();
+      break;
+    case JXL_TYPE_UINT8:
+      bitdepth = 8;
+      io.metadata.m.SetUintSamples(8);
+      break;
+    case JXL_TYPE_UINT16:
+      bitdepth = 16;
+      io.metadata.m.SetUintSamples(16);
+      break;
+    default:  // also reached for JXL_TYPE_UINT32, which has no case in this switch; bitdepth then stays 0
+      EXPECT_TRUE(false) << "Roundtrip tests for data type "
+                         << pixel_format.data_type << " not yet implemented.";
+  }
+  jxl::ColorEncoding color_encoding;
+  if (!icc_profile.empty()) {
+    jxl::PaddedBytes icc_profile_copy(icc_profile);  // copy because SetICC is handed an rvalue
+    EXPECT_TRUE(color_encoding.SetICC(std::move(icc_profile_copy)));
+  } else if (pixel_format.data_type == JXL_TYPE_FLOAT) {
+    color_encoding = jxl::ColorEncoding::LinearSRGB(is_gray);  // float input without a profile is taken as linear sRGB
+  } else {
+    color_encoding = jxl::ColorEncoding::SRGB(is_gray);
+  }
+  EXPECT_TRUE(
+      ConvertFromExternal(jxl::Span(buf.data(), buf.size()),
+                          xsize, ysize, color_encoding, has_alpha,
+                          /*alpha_is_premultiplied=*/false,
+                          /*bits_per_sample=*/bitdepth, pixel_format.endianness,
+                          /*flipped_y=*/false, /*pool=*/nullptr, &io.Main()));
+  return io;
+}
+
+template
+T ConvertTestPixel(const float val);  // maps a float sample to sample type T; specializations follow
+
+template <>
+float ConvertTestPixel(const float val) {
+  return val;
+}
+
+template <>
+uint16_t ConvertTestPixel(const float val) {
+  return (uint16_t)(val * UINT16_MAX);  // scale to the full 16-bit range; the cast truncates
+}
+
+template <>
+uint8_t ConvertTestPixel(const float val) {
+  return (uint8_t)(val * UINT8_MAX);  // scale to the full 8-bit range; the cast truncates
+}
+
+// Returns a test image: channel c % 4 holds y/ysize, x/xsize, (x+y)/(xsize+ysize) or x*y/(xsize*ysize), converted to T, stored row-major with interleaved channels and returned as raw bytes.
+template
+std::vector GetTestImage(const size_t xsize, const size_t ysize,
+                         const JxlPixelFormat& pixel_format) {
+  std::vector pixels(xsize * ysize * pixel_format.num_channels);
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      for (size_t chan = 0; chan < pixel_format.num_channels; chan++) {
+        float val;  // always assigned: the switch covers every value of chan % 4
+        switch (chan % 4) {
+          case 0:
+            val = static_cast(y) / static_cast(ysize);
+            break;
+          case 1:
+            val = static_cast(x) / static_cast(xsize);
+            break;
+          case 2:
+            val = static_cast(x + y) / static_cast(xsize + ysize);
+            break;
+          case 3:
+            val = static_cast(x * y) / static_cast(xsize * ysize);
+            break;
+        }
+        pixels[(y * xsize + x) * pixel_format.num_channels + chan] =
+            ConvertTestPixel(val);
+      }
+    }
+  }
+  std::vector bytes(pixels.size() * sizeof(T));
+  memcpy(bytes.data(), pixels.data(), sizeof(T) * pixels.size());  // reinterpret the typed samples as a byte buffer
+  return bytes;
+}
+
+void EncodeWithEncoder(JxlEncoder* enc, std::vector* compressed) {  // drains the encoder output into *compressed, growing the buffer as needed
+  compressed->resize(64);  // small initial buffer; doubled below whenever the encoder asks for more room
+  uint8_t* next_out = compressed->data();
+  size_t avail_out = compressed->size() - (next_out - compressed->data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed->data();  // position reached so far, kept as an offset
+      compressed->resize(compressed->size() * 2);
+      next_out = compressed->data() + offset;  // resize may move the storage, so the pointer is re-derived
+      avail_out = compressed->size() - offset;
+    }
+  }
+  compressed->resize(next_out - compressed->data());  // trim to the bytes actually produced
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+}
+
+// Generates some pixels using some dimensions and pixel_format,
+// compresses them, and verifies that the decoded version is similar to the
+// original pixels.
+template
+void VerifyRoundtripCompression(const size_t xsize, const size_t ysize,
+                                const JxlPixelFormat& input_pixel_format,
+                                const JxlPixelFormat& output_pixel_format,
+                                const bool lossless, const bool use_container) {
+  const std::vector original_bytes =
+      GetTestImage(xsize, ysize, input_pixel_format);
+  jxl::CodecInOut original_io =
+      ConvertTestImage(original_bytes, xsize, ysize, input_pixel_format, {});  // {}: empty ICC profile, so the color encoding is derived from the pixel format
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, use_container));
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &input_pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = lossless;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  JxlColorEncoding color_encoding;
+  if (input_pixel_format.data_type == JXL_TYPE_FLOAT) {
+    JxlColorEncodingSetToLinearSRGB(
+        &color_encoding,
+        /*is_gray=*/input_pixel_format.num_channels < 3);
+  } else {
+    JxlColorEncodingSetToSRGB(&color_encoding,
+                              /*is_gray=*/input_pixel_format.num_channels < 3);
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  JxlEncoderOptions* opts = JxlEncoderOptionsCreate(enc, nullptr);
+  JxlEncoderOptionsSetLossless(opts, lossless);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(opts, &input_pixel_format,
+                                    (void*)original_bytes.data(),
+                                    original_bytes.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));  // the decoder is expected to report events in this fixed order: basic info, color encoding, need-output-buffer, full image
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderImageOutBufferSize(
+                                 dec, &output_pixel_format, &buffer_size));
+  if (&input_pixel_format == &output_pixel_format) {  // address comparison: true only when the caller passed the same object for both formats
+    EXPECT_EQ(buffer_size, original_bytes.size());
+  }
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t icc_profile_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, &output_pixel_format,
+                                        JXL_COLOR_PROFILE_TARGET_DATA,
+                                        &icc_profile_size));
+  jxl::PaddedBytes icc_profile(icc_profile_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(
+                dec, &output_pixel_format, JXL_COLOR_PROFILE_TARGET_DATA,
+                icc_profile.data(), icc_profile.size()));
+
+  std::vector decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &output_pixel_format,
+                                 decoded_bytes.data(), decoded_bytes.size()));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+
+  jxl::CodecInOut decoded_io = ConvertTestImage(
+      decoded_bytes, xsize, ysize, output_pixel_format, icc_profile);  // decoded pixels are interpreted with the ICC profile the decoder reported
+
+  jxl::ButteraugliParams ba;
+  float butteraugli_score = ButteraugliDistance(original_io, decoded_io, ba,
+                                                /*distmap=*/nullptr, nullptr);
+  if (lossless) {
+    EXPECT_LE(butteraugli_score, 0.0f);  // lossless: no measurable difference allowed
+  } else {
+    EXPECT_LE(butteraugli_score, 2.0f);  // lossy: distance may be at most 2.0
+  }
+}
+
+}  // namespace
+
+TEST(RoundtripTest, FloatFrameRoundtripTest) {
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (int lossless = 0; lossless < 2; lossless++) {
+      for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+        // There's no support (yet) for lossless extra float channels, so we
+        // don't test it. In effect: lossless runs only with 1 or 3 channels.
+        if (num_channels % 2 != 0 || !lossless) {
+          JxlPixelFormat pixel_format = JxlPixelFormat{
+              num_channels, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+          VerifyRoundtripCompression(63, 129, pixel_format, pixel_format,
+                                     (bool)lossless,
+                                     (bool)use_container);
+        }
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, Uint16FrameRoundtripTest) {
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (int lossless = 0; lossless < 2; lossless++) {
+      for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+        JxlPixelFormat pixel_format =
+            JxlPixelFormat{num_channels, JXL_TYPE_UINT16, JXL_NATIVE_ENDIAN, 0};
+        VerifyRoundtripCompression(63, 129, pixel_format,
+                                   pixel_format, (bool)lossless,
+                                   (bool)use_container);
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, Uint8FrameRoundtripTest) {
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (int lossless = 0; lossless < 2; lossless++) {
+      for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+        JxlPixelFormat pixel_format =
+            JxlPixelFormat{num_channels, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+        VerifyRoundtripCompression(63, 129, pixel_format, pixel_format,
+                                   (bool)lossless,
+                                   (bool)use_container);
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, TestNonlinearSrgbAsXybEncoded) {
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+      JxlPixelFormat pixel_format_in =
+          JxlPixelFormat{num_channels, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+      JxlPixelFormat pixel_format_out =
+          JxlPixelFormat{num_channels, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};  // encode from 8-bit input, decode to float output
+      VerifyRoundtripCompression(
+          63, 129, pixel_format_in, pixel_format_out,
+          /*lossless=*/false, (bool)use_container);
+    }
+  }
+}
+
+TEST(RoundtripTest, ExtraBoxesTest) {
+  JxlPixelFormat pixel_format =
+      JxlPixelFormat{4, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+  const size_t xsize = 61;
+  const size_t ysize = 71;
+
+  const std::vector original_bytes =
+      GetTestImage(xsize, ysize, pixel_format);
+  jxl::CodecInOut original_io =
+      ConvertTestImage(original_bytes, xsize, ysize, pixel_format, {});
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, true));
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  JxlColorEncoding color_encoding;
+  if (pixel_format.data_type == JXL_TYPE_FLOAT) {
+    JxlColorEncodingSetToLinearSRGB(&color_encoding,
+                                    /*is_gray=*/pixel_format.num_channels < 3);
+  } else {
+    JxlColorEncodingSetToSRGB(&color_encoding,
+                              /*is_gray=*/pixel_format.num_channels < 3);
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  JxlEncoderOptions* opts = JxlEncoderOptionsCreate(enc, nullptr);
+  JxlEncoderOptionsSetLossless(opts, false);
+  EXPECT_EQ(
+      JXL_ENC_SUCCESS,
+      JxlEncoderAddImageFrame(opts, &pixel_format, (void*)original_bytes.data(),
+                              original_bytes.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  std::vector extra_data(1023);  // payload of an extra "crud" box appended after the encoder output; the decode below must still succeed
+  jxl::AppendBoxHeader(jxl::MakeBoxType("crud"), extra_data.size(), false,
+                       &compressed);
+  compressed.insert(compressed.end(), extra_data.begin(), extra_data.end());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &pixel_format, &buffer_size));
+  EXPECT_EQ(buffer_size, original_bytes.size());
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t icc_profile_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, &pixel_format,
+                                        JXL_COLOR_PROFILE_TARGET_DATA,
+                                        &icc_profile_size));
+  jxl::PaddedBytes icc_profile(icc_profile_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(
+                dec, &pixel_format, JXL_COLOR_PROFILE_TARGET_DATA,
+                icc_profile.data(), icc_profile.size()));
+
+  std::vector decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(dec, &pixel_format,
+                                                         decoded_bytes.data(),
+                                                         decoded_bytes.size()));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+
+  jxl::CodecInOut decoded_io =
+      ConvertTestImage(decoded_bytes, xsize, ysize, pixel_format, icc_profile);
+
+  jxl::ButteraugliParams ba;
+  float butteraugli_score = ButteraugliDistance(original_io, decoded_io, ba,
+                                                /*distmap=*/nullptr, nullptr);
+  EXPECT_LE(butteraugli_score, 2.0f);
+}
+
+TEST(RoundtripTest, TestICCProfile) {
+  // This ICC profile is not a valid ICC profile, however neither the encoder
+  // nor the decoder parse this profile, and the bytes should be passed on
+  // correctly through the roundtrip.
+  jxl::PaddedBytes icc;
+  for (size_t i = 0; i < 200; i++) {
+    icc.push_back(i ^ 55);  // 200 deterministic filler bytes
+  }
+
+  JxlPixelFormat format =
+      JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+
+  size_t xsize = 25;
+  size_t ysize = 37;
+  const std::vector original_bytes =
+      GetTestImage(xsize, ysize, format);
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = JXL_FALSE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetICCProfile(enc, icc.data(), icc.size()));
+  JxlEncoderOptions* opts = JxlEncoderOptionsCreate(enc, nullptr);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(opts, &format, (void*)original_bytes.data(),
+                                    original_bytes.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(buffer_size, original_bytes.size());
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t dec_icc_size;
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetICCProfileSize(
+          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_icc_size));
+  EXPECT_EQ(icc.size(), dec_icc_size);
+  jxl::PaddedBytes dec_icc(dec_icc_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(dec, &format,
+                                           JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                           dec_icc.data(), dec_icc.size()));
+
+  std::vector decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetImageOutBuffer(dec, &format, decoded_bytes.data(),
+                                        decoded_bytes.size()));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(icc, dec_icc);  // the point of the test: the profile bytes come back unchanged (the pixels are not compared here)
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(RoundtripTest, JXL_TRANSCODE_JPEG_TEST(TestJPEGReconstruction)) {
+  const std::string jpeg_path =
+      "imagecompression.info/flower_foveon.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path);
+  jxl::CodecInOut orig_io;
+  ASSERT_TRUE(
+      SetFromBytes(jxl::Span(orig), &orig_io, /*pool=*/nullptr));
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  JxlEncoderOptions* options = JxlEncoderOptionsCreate(enc.get(), NULL);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddJPEGFrame(options, orig.data(), orig.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector compressed;
+  EncodeWithEncoder(enc.get(), &compressed);
+
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec.get(), JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_FULL_IMAGE));
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec.get()));
+  std::vector reconstructed_buffer(128);  // deliberately small; doubled in the loop below
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data(),
+                                    reconstructed_buffer.size()));
+  size_t used = 0;
+  JxlDecoderStatus dec_process_result = JXL_DEC_JPEG_NEED_MORE_OUTPUT;
+  while (dec_process_result == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+    used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get());  // presumably ReleaseJPEGBuffer returns the bytes still unused, making `used` the count already written - confirm
+    reconstructed_buffer.resize(reconstructed_buffer.size() * 2);
+    EXPECT_EQ(
+        JXL_DEC_SUCCESS,
+        JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data() + used,
+                                reconstructed_buffer.size() - used));  // hand back only the tail after the bytes already written
+    dec_process_result = JxlDecoderProcessInput(dec.get());
+  }
+  ASSERT_EQ(JXL_DEC_FULL_IMAGE, dec_process_result);
+  used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+  ASSERT_EQ(used, orig.size());
+  EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), orig.data(), used));  // the reconstructed JPEG must be byte-identical to the input file
+}
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/sanitizers.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/sanitizers.h
new file mode 100644
index 0000000000..69cec8afac
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/sanitizers.h
@@ -0,0 +1,222 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_SANITIZERS_H_
+#define LIB_JXL_SANITIZERS_H_
+
+#include  // NOTE(review): header name missing in this copy (also on the bare #include lines below); looks stripped in transit - confirm against the original file
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/image.h"
+
+#ifdef MEMORY_SANITIZER
+#define JXL_MEMORY_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define JXL_MEMORY_SANITIZER 1
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+
+#if JXL_MEMORY_SANITIZER
+#include
+
+#include
+#include
+#include
+
+#include "lib/jxl/base/status.h"
+#include "sanitizer/msan_interface.h"
+#endif
+
+namespace jxl {
+namespace msan {
+
+#if JXL_MEMORY_SANITIZER
+
+// Chosen so that kSanitizerSentinel is four copies of kSanitizerSentinelByte.
+constexpr uint8_t kSanitizerSentinelByte = 0x48;
+constexpr float kSanitizerSentinel = 205089.125f;  // bit pattern 0x48484848
+
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonMemory(const volatile void* m,
+                                                     size_t size) {
+  __msan_poison(m, size);
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonMemory(const volatile void* m,
+                                                       size_t size) {
+  __msan_unpoison(m, size);
+}
+
+// Mark all the bytes of an image (including padding) as poisoned bytes.
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const PlaneBase& im) {
+  PoisonMemory(im.bytes(), im.bytes_per_row() * im.ysize());
+}
+
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const Image3<T>& im) {
+  PoisonImage(im.Plane(0));
+  PoisonImage(im.Plane(1));
+  PoisonImage(im.Plane(2));
+}
+
+// Print the uninitialized regions of an image to stderr, one line per run of identical rows.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void PrintImageUninitialized(
+    const Plane<T>& im) {
+  fprintf(stderr, "Uninitialized regions for image of size %zux%zu:\n",
+          im.xsize(), im.ysize());
+
+  // A segment of uninitialized pixels in a row, in the format [first, second).
+  typedef std::pair<size_t, size_t> PixelSegment;
+
+  // Helper class to merge and print a list of rows of PixelSegment that may be
+  // the same over big ranges of rows. This compacts the output to ranges of
+  // rows like "[y0, y1): [x0, x1) [x2, x3)".
+  class RowsMerger {
+   public:
+    // Add a new row to the list of rows. If the row is the same as the previous
+    // one it will be merged showing a range of rows [y0, y1), but if the new
+    // row is different the current range of rows (if any) will be printed and a
+    // new one will be started.
+    void AddRow(size_t y, std::vector<PixelSegment>&& new_row) {
+      if (start_y_ != -1 && new_row != segments_) {
+        PrintRow(y);  // flush the open range; this also resets start_y_ to -1
+      }
+      if (new_row.empty()) {
+        // Skip ranges with no uninitialized pixels.
+        start_y_ = -1;
+        segments_.clear();
+        return;
+      }
+      if (start_y_ == -1) {
+        start_y_ = y;
+        segments_ = std::move(new_row);
+      }
+    }
+
+    // Print the contents of the range of rows [start_y_, end_y) if any.
+    void PrintRow(size_t end_y) {
+      if (start_y_ == -1) return;
+      if (segments_.empty()) {
+        start_y_ = -1;
+        return;
+      }
+      if (end_y - start_y_ > 1) {
+        fprintf(stderr, " y=[%zd, %zu):", start_y_, end_y);  // start_y_ is ssize_t, hence %zd
+      } else {
+        fprintf(stderr, " y=[%zd]:", start_y_);
+      }
+      for (const auto& seg : segments_) {
+        if (seg.first + 1 == seg.second) {
+          fprintf(stderr, " [%zu]", seg.first);  // segment bounds are size_t, so %zu (was %zd)
+        } else {
+          fprintf(stderr, " [%zu, %zu)", seg.first, seg.second);
+        }
+      }
+      fprintf(stderr, "\n");
+      start_y_ = -1;
+    }
+
+   private:
+    std::vector<PixelSegment> segments_;
+    // Row number of the first row in the range of rows that have |segments| as
+    // the undefined segments. -1 means no range is currently open.
+    ssize_t start_y_ = -1;
+  } rows_merger;
+
+  class SegmentsMerger {  // collects the poisoned x positions of one row, merging adjacent ones into [first, second) segments
+   public:
+    void AddValue(size_t x) {
+      if (row.empty() || row.back().second != x) {
+        row.emplace_back(x, x + 1);  // not adjacent to the last segment: start a new one
+      } else {
+        row.back().second = x + 1;  // adjacent: extend the last segment by one pixel
+      }
+    }
+
+    std::vector<PixelSegment> row;
+  };
+
+  for (size_t y = 0; y < im.ysize(); y++) {
+    auto* row = im.Row(y);
+    SegmentsMerger seg_merger;
+    size_t x = 0;
+    while (x < im.xsize()) {
+      intptr_t ret =
+          __msan_test_shadow(row + x, (im.xsize() - x) * sizeof(row[0]));  // byte offset of the first poisoned byte in the rest of the row, negative if none
+      if (ret < 0) break;
+      size_t next_x = x + ret / sizeof(row[0]);  // convert the byte offset back to a pixel index
+      seg_merger.AddValue(next_x);
+      x = next_x + 1;  // continue scanning after the poisoned pixel
+    }
+    rows_merger.AddRow(y, std::move(seg_merger.row));
+  }
+  rows_merger.PrintRow(im.ysize());  // flush the last open range, if any
+}
+
+// Check that all the pixels in the provided rect of the image are initialized
+// (not poisoned). If any of the values is poisoned it will abort.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void CheckImageInitialized(
+    const Plane<T>& im, const Rect& r, const char* message) {
+  JXL_ASSERT(r.x0() <= im.xsize());
+  JXL_ASSERT(r.x0() + r.xsize() <= im.xsize());
+  JXL_ASSERT(r.y0() <= im.ysize());
+  JXL_ASSERT(r.y0() + r.ysize() <= im.ysize());
+  for (size_t y = r.y0(); y < r.y0() + r.ysize(); y++) {
+    const auto* row = im.Row(y);
+    intptr_t ret = __msan_test_shadow(row + r.x0(), sizeof(*row) * r.xsize());  // byte offset of the first poisoned byte within the rect's span of this row, or -1 if none
+    if (ret != -1) {
+      JXL_DEBUG(1,
+                "Checking an image of %zu x %zu, rect x0=%zu, y0=%zu, "
+                "xsize=%zu, ysize=%zu",
+                im.xsize(), im.ysize(), r.x0(), r.y0(), r.xsize(), r.ysize());
+      size_t x = r.x0() + ret / sizeof(*row);  // ret is relative to row + x0, so add x0 to report an image coordinate like y
+      JXL_DEBUG(1, "CheckImageInitialized failed at x=%zu, y=%zu: %s", x, y,
+                message ? message : "");
+      PrintImageUninitialized(im);
+    }
+    // This will report an error if memory is not initialized.
+    __msan_check_mem_is_initialized(row + r.x0(), sizeof(*row) * r.xsize());
+  }
+}
+
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void CheckImageInitialized(
+    const Image3<T>& im, const Rect& r, const char* message) {
+  for (size_t c = 0; c < 3; c++) {
+    std::string str_message(message ? message : "");  // tolerate a null message, as the single-plane overload does
+    str_message += " c=" + std::to_string(c);  // tag the message with the channel being checked
+    CheckImageInitialized(im.Plane(c), r, str_message.c_str());
+  }
+}
+
+#define JXL_CHECK_IMAGE_INITIALIZED(im, r) \
+  ::jxl::msan::CheckImageInitialized(im, r, "im=" #im ", r=" #r);
+
+#else  // JXL_MEMORY_SANITIZER
+
+// In non-msan mode these functions don't use volatile since it is not needed
+// for the empty functions.
+ +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonMemory(const void*, size_t) {} +static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonMemory(const void*, size_t) {} + +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const PlaneBase& im) {} +template +static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const Plane& im) {} + +#define JXL_CHECK_IMAGE_INITIALIZED(im, r) + +#endif + +} // namespace msan +} // namespace jxl + +#endif // LIB_JXL_SANITIZERS_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/speed_tier_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/speed_tier_test.cc new file mode 100644 index 0000000000..4e7c9f9fc4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/speed_tier_test.cc @@ -0,0 +1,112 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include + +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/padded_bytes.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/dec_file.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_cache.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" +#include "lib/jxl/image.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testdata.h" + +namespace jxl { +namespace { + +struct SpeedTierTestParams { + explicit SpeedTierTestParams(const SpeedTier speed_tier, + const bool shrink8 = false) + : speed_tier(speed_tier), shrink8(shrink8) {} + SpeedTier speed_tier; + bool shrink8; +}; + +std::ostream& operator<<(std::ostream& os, SpeedTierTestParams params) { + auto previous_flags = os.flags(); + os << std::boolalpha; + os << "SpeedTierTestParams{" << SpeedTierName(params.speed_tier) + << ", /*shrink8=*/" 
<< params.shrink8 << "}"; + os.flags(previous_flags); + return os; +} + +class SpeedTierTest : public testing::TestWithParam {}; + +JXL_GTEST_INSTANTIATE_TEST_SUITE_P( + SpeedTierTestInstantiation, SpeedTierTest, + testing::Values(SpeedTierTestParams{SpeedTier::kCheetah, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kCheetah, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kThunder, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kThunder, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kLightning, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kLightning, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kFalcon, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kFalcon, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kHare, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kHare, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kWombat, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kWombat, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kSquirrel, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kSquirrel, + /*shrink8=*/false}, + SpeedTierTestParams{SpeedTier::kKitten, + /*shrink8=*/true}, + SpeedTierTestParams{SpeedTier::kKitten, + /*shrink8=*/false}, + // Only downscaled image for Tortoise mode. + SpeedTierTestParams{SpeedTier::kTortoise, + /*shrink8=*/true})); + +TEST_P(SpeedTierTest, Roundtrip) { + const PaddedBytes orig = + ReadTestData("wesaturate/500px/u76c0g_bliznaca_srgb8.png"); + CodecInOut io; + ThreadPoolInternal pool(8); + ASSERT_TRUE(SetFromBytes(Span(orig), &io, &pool)); + + const SpeedTierTestParams& params = GetParam(); + + if (params.shrink8) { + io.ShrinkTo(io.xsize() / 8, io.ysize() / 8); + } + + CompressParams cparams; + cparams.speed_tier = params.speed_tier; + DecompressParams dparams; + + CodecInOut io2; + test::Roundtrip(&io, cparams, dparams, nullptr, &io2); + + // Can be 2.2 in non-hare mode. 
+ EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, + /*distmap=*/nullptr, /*pool=*/nullptr), + 2.8); +} +} // namespace +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc new file mode 100644 index 0000000000..802fc5b029 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines.cc @@ -0,0 +1,514 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/splines.h" + +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/common.h" +#include "lib/jxl/dct_scales.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/opsin_params.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/splines.cc" +#include +#include + +#include "lib/jxl/fast_math-inl.h" +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// Given a set of DCT coefficients, this returns the result of performing cosine +// interpolation on the original samples. +float ContinuousIDCT(const float dct[32], float t) { + // We compute here the DCT-3 of the `dct` vector, rescaled by a factor of + // sqrt(32). This is such that an input vector vector {x, 0, ..., 0} produces + // a constant result of x. dct[0] was scaled in Dequantize() to allow uniform + // treatment of all the coefficients. 
+ constexpr float kMultipliers[32] = { + kPi / 32 * 0, kPi / 32 * 1, kPi / 32 * 2, kPi / 32 * 3, kPi / 32 * 4, + kPi / 32 * 5, kPi / 32 * 6, kPi / 32 * 7, kPi / 32 * 8, kPi / 32 * 9, + kPi / 32 * 10, kPi / 32 * 11, kPi / 32 * 12, kPi / 32 * 13, kPi / 32 * 14, + kPi / 32 * 15, kPi / 32 * 16, kPi / 32 * 17, kPi / 32 * 18, kPi / 32 * 19, + kPi / 32 * 20, kPi / 32 * 21, kPi / 32 * 22, kPi / 32 * 23, kPi / 32 * 24, + kPi / 32 * 25, kPi / 32 * 26, kPi / 32 * 27, kPi / 32 * 28, kPi / 32 * 29, + kPi / 32 * 30, kPi / 32 * 31, + }; + HWY_CAPPED(float, 32) df; + auto result = Zero(df); + const auto tandhalf = Set(df, t + 0.5f); + for (int i = 0; i < 32; i += Lanes(df)) { + auto cos_arg = LoadU(df, kMultipliers + i) * tandhalf; + auto cos = FastCosf(df, cos_arg); + auto local_res = LoadU(df, dct + i) * cos; + result = MulAdd(Set(df, square_root<2>::value), local_res, result); + } + return GetLane(SumOfLanes(result)); +} + +// Splats a single Gaussian on the image. +void DrawGaussian(Image3F* const opsin, const Rect& opsin_rect, + const Rect& image_rect, const Spline::Point& center, + const float intensity, const float color[3], + const float sigma, std::vector& xs, + std::vector& ys, + std::vector& local_intensity_storage) { + constexpr float kDistanceMultiplier = 4.605170185988091f; // -2 * log(0.1) + // Distance beyond which exp(-d^2 / (2 * sigma^2)) drops below 0.1. 
+ const float maximum_distance = sigma * sigma * kDistanceMultiplier; + const auto xbegin_s = + std::max(image_rect.x0(), center.x - maximum_distance + .5f); + const auto xend_s = + std::min(center.x + maximum_distance + .5f, + image_rect.x0() + image_rect.xsize() - 1); + const auto ybegin_s = + std::max(image_rect.y0(), center.y - maximum_distance + .5f); + const auto yend_s = + std::min(center.y + maximum_distance + .5f, + image_rect.y0() + image_rect.ysize() - 1); + if ((xend_s) <= 0 || (xend_s < xbegin_s)) return; + const size_t xbegin = xbegin_s; + const size_t xend = xend_s; + if ((yend_s <= 0) || (yend_s < ybegin_s)) return; + const size_t ybegin = ybegin_s; + const size_t yend = yend_s; + const size_t opsin_stride = opsin->PixelsPerRow(); + float* JXL_RESTRICT rows[3] = { + opsin_rect.PlaneRow(opsin, 0, ybegin - image_rect.y0()), + opsin_rect.PlaneRow(opsin, 1, ybegin - image_rect.y0()), + opsin_rect.PlaneRow(opsin, 2, ybegin - image_rect.y0()), + }; + const size_t nx = xend + 1 - xbegin; + const size_t ny = yend + 1 - ybegin; + HWY_FULL(float) df; + if (xs.size() < nx * ny) { + size_t sz = DivCeil(nx * ny, Lanes(df)) * Lanes(df); + xs.resize(sz); + ys.resize(sz); + local_intensity_storage.resize(sz); + } + for (size_t y = ybegin; y <= yend; ++y) { + for (size_t x = xbegin; x <= xend; ++x) { + xs[(y - ybegin) * nx + (x - xbegin)] = x; + ys[(y - ybegin) * nx + (x - xbegin)] = y; + } + } + Rebind di; + const auto inv_sigma = Set(df, 1.0f / sigma); + const auto half = Set(df, 0.5f); + const auto one_over_2s2 = Set(df, 0.353553391f); + const auto sigma_over_4_times_intensity = Set(df, .25f * sigma * intensity); + for (size_t i = 0; i < nx * ny; i += Lanes(df)) { + const auto x = ConvertTo(df, LoadU(di, &xs[i])); + const auto y = ConvertTo(df, LoadU(di, &ys[i])); + const auto dx = x - Set(df, center.x); + const auto dy = y - Set(df, center.y); + const auto sqd = MulAdd(dx, dx, dy * dy); + const auto distance = Sqrt(sqd); + const auto one_dimensional_factor = + 
FastErff(df, MulAdd(distance, half, one_over_2s2) * inv_sigma) - + FastErff(df, MulSub(distance, half, one_over_2s2) * inv_sigma); + const auto local_intensity = sigma_over_4_times_intensity * + one_dimensional_factor * + one_dimensional_factor; + StoreU(local_intensity, df, &local_intensity_storage[i]); + } + ssize_t off = -static_cast(image_rect.x0()); + for (size_t y = ybegin; y <= yend; ++y) { + HWY_CAPPED(float, 1) df; + for (size_t x = xbegin; x <= xend; ++x) { + const auto local_intensity = Load( + df, local_intensity_storage.data() + (y - ybegin) * nx + x - xbegin); + for (size_t c = 0; c < 3; ++c) { + const auto cm = Set(df, color[c]); + const auto in = LoadU(df, rows[c] + x + off); + StoreU(MulAdd(cm, local_intensity, in), df, rows[c] + x + off); + } + } + off += opsin_stride; + } +} + +void DrawFromPoints( + Image3F* const opsin, const Rect& opsin_rect, const Rect& image_rect, + const Spline& spline, bool add, + const std::vector>& points_to_draw, + float arc_length) { + float inv_arc_length = 1.0f / arc_length; + int k = 0; + std::vector xs, ys; + std::vector local_intensity_storage; + for (const auto& point_to_draw : points_to_draw) { + const Spline::Point& point = point_to_draw.first; + const float multiplier = add ? 
point_to_draw.second : -point_to_draw.second; + const float progress_along_arc = + std::min(1.f, (k * kDesiredRenderingDistance) * inv_arc_length); + ++k; + float color[3]; + for (size_t c = 0; c < 3; ++c) { + color[c] = + ContinuousIDCT(spline.color_dct[c], (32 - 1) * progress_along_arc); + } + const float sigma = + ContinuousIDCT(spline.sigma_dct, (32 - 1) * progress_along_arc); + DrawGaussian(opsin, opsin_rect, image_rect, point, multiplier, color, sigma, + xs, ys, local_intensity_storage); + } +} +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +HWY_EXPORT(DrawFromPoints); + +namespace { + +// Maximum number of spline control points per frame is +// std::min(kMaxNumControlPoints, xsize * ysize / 2) +constexpr size_t kMaxNumControlPoints = 1u << 20u; +constexpr size_t kMaxNumControlPointsPerPixelRatio = 2; + +// X, Y, B, sigma. +float ColorQuantizationWeight(const int32_t adjustment, const int channel, + const int i) { + const float multiplier = adjustment >= 0 ? 
1.f + .125f * adjustment + : 1.f / (1.f + .125f * -adjustment); + + static constexpr float kChannelWeight[] = {0.0042f, 0.075f, 0.07f, .3333f}; + + return multiplier / kChannelWeight[channel]; +} + +Status DecodeAllStartingPoints(std::vector* const points, + BitReader* const br, ANSSymbolReader* reader, + const std::vector& context_map, + size_t num_splines) { + points->clear(); + points->reserve(num_splines); + int64_t last_x = 0; + int64_t last_y = 0; + for (size_t i = 0; i < num_splines; i++) { + int64_t x = + reader->ReadHybridUint(kStartingPositionContext, br, context_map); + int64_t y = + reader->ReadHybridUint(kStartingPositionContext, br, context_map); + if (i != 0) { + x = UnpackSigned(x) + last_x; + y = UnpackSigned(y) + last_y; + } + points->emplace_back(static_cast(x), static_cast(y)); + last_x = x; + last_y = y; + } + return true; +} + +struct Vector { + float x, y; + Vector operator-() const { return {-x, -y}; } + Vector operator+(const Vector& other) const { + return {x + other.x, y + other.y}; + } + float SquaredNorm() const { return x * x + y * y; } +}; +Vector operator*(const float k, const Vector& vec) { + return {k * vec.x, k * vec.y}; +} + +Spline::Point operator+(const Spline::Point& p, const Vector& vec) { + return {p.x + vec.x, p.y + vec.y}; +} +Spline::Point operator-(const Spline::Point& p, const Vector& vec) { + return p + -vec; +} +Vector operator-(const Spline::Point& a, const Spline::Point& b) { + return {a.x - b.x, a.y - b.y}; +} + +std::vector DrawCentripetalCatmullRomSpline( + std::vector points) { + if (points.size() <= 1) return points; + // Number of points to compute between each control point. 
+ static constexpr int kNumPoints = 16; + std::vector result; + result.reserve((points.size() - 1) * kNumPoints + 1); + points.insert(points.begin(), points[0] + (points[0] - points[1])); + points.push_back(points[points.size() - 1] + + (points[points.size() - 1] - points[points.size() - 2])); + // points has at least 4 elements at this point. + for (size_t start = 0; start < points.size() - 3; ++start) { + // 4 of them are used, and we draw from p[1] to p[2]. + const Spline::Point* const p = &points[start]; + result.push_back(p[1]); + float t[4] = {0}; + for (int k = 1; k < 4; ++k) { + t[k] = std::sqrt(hypotf(p[k].x - p[k - 1].x, p[k].y - p[k - 1].y)) + + t[k - 1]; + } + for (int i = 1; i < kNumPoints; ++i) { + const float tt = + t[1] + (static_cast(i) / kNumPoints) * (t[2] - t[1]); + Spline::Point a[3]; + for (int k = 0; k < 3; ++k) { + a[k] = p[k] + ((tt - t[k]) / (t[k + 1] - t[k])) * (p[k + 1] - p[k]); + } + Spline::Point b[2]; + for (int k = 0; k < 2; ++k) { + b[k] = a[k] + ((tt - t[k]) / (t[k + 2] - t[k])) * (a[k + 1] - a[k]); + } + result.push_back(b[0] + ((tt - t[1]) / (t[2] - t[1])) * (b[1] - b[0])); + } + } + result.push_back(points[points.size() - 2]); + return result; +} + +// Move along the line segments defined by `points`, `kDesiredRenderingDistance` +// pixels at a time, and call `functor` with each point and the actual distance +// to the previous point (which will always be kDesiredRenderingDistance except +// possibly for the very last point). 
+template +void ForEachEquallySpacedPoint(const Points& points, const Functor& functor) { + JXL_ASSERT(!points.empty()); + Spline::Point current = points.front(); + functor(current, kDesiredRenderingDistance); + auto next = points.begin(); + while (next != points.end()) { + const Spline::Point* previous = ¤t; + float arclength_from_previous = 0.f; + for (;;) { + if (next == points.end()) { + functor(*previous, arclength_from_previous); + return; + } + const float arclength_to_next = + std::sqrt((*next - *previous).SquaredNorm()); + if (arclength_from_previous + arclength_to_next >= + kDesiredRenderingDistance) { + current = + *previous + ((kDesiredRenderingDistance - arclength_from_previous) / + arclength_to_next) * + (*next - *previous); + functor(current, kDesiredRenderingDistance); + break; + } + arclength_from_previous += arclength_to_next; + previous = &*next; + ++next; + } + } +} + +} // namespace + +QuantizedSpline::QuantizedSpline(const Spline& original, + const int32_t quantization_adjustment, + float ytox, float ytob) { + JXL_ASSERT(!original.control_points.empty()); + control_points_.reserve(original.control_points.size() - 1); + const Spline::Point& starting_point = original.control_points.front(); + int previous_x = static_cast(roundf(starting_point.x)), + previous_y = static_cast(roundf(starting_point.y)); + int previous_delta_x = 0, previous_delta_y = 0; + for (auto it = original.control_points.begin() + 1; + it != original.control_points.end(); ++it) { + const int new_x = static_cast(roundf(it->x)); + const int new_y = static_cast(roundf(it->y)); + const int new_delta_x = new_x - previous_x; + const int new_delta_y = new_y - previous_y; + control_points_.emplace_back(new_delta_x - previous_delta_x, + new_delta_y - previous_delta_y); + previous_delta_x = new_delta_x; + previous_delta_y = new_delta_y; + previous_x = new_x; + previous_y = new_y; + } + + for (int c = 0; c < 3; ++c) { + float factor = c == 0 ? ytox : c == 1 ? 
0 : ytob; + for (int i = 0; i < 32; ++i) { + const float coefficient = + original.color_dct[c][i] - + factor * color_dct_[1][i] / + ColorQuantizationWeight(quantization_adjustment, 1, i); + color_dct_[c][i] = static_cast( + roundf(coefficient * + ColorQuantizationWeight(quantization_adjustment, c, i))); + } + } + for (int i = 0; i < 32; ++i) { + sigma_dct_[i] = static_cast( + roundf(original.sigma_dct[i] * + ColorQuantizationWeight(quantization_adjustment, 3, i))); + } +} + +Spline QuantizedSpline::Dequantize(const Spline::Point& starting_point, + const int32_t quantization_adjustment, + float ytox, float ytob) const { + Spline result; + + result.control_points.reserve(control_points_.size() + 1); + int current_x = static_cast(roundf(starting_point.x)), + current_y = static_cast(roundf(starting_point.y)); + result.control_points.push_back(Spline::Point{static_cast(current_x), + static_cast(current_y)}); + int current_delta_x = 0, current_delta_y = 0; + for (const auto& point : control_points_) { + current_delta_x += point.first; + current_delta_y += point.second; + current_x += current_delta_x; + current_y += current_delta_y; + result.control_points.push_back(Spline::Point{ + static_cast(current_x), static_cast(current_y)}); + } + + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < 32; ++i) { + result.color_dct[c][i] = + color_dct_[c][i] * (i == 0 ? 1.0f / square_root<2>::value : 1.0f) / + ColorQuantizationWeight(quantization_adjustment, c, i); + } + } + for (int i = 0; i < 32; ++i) { + result.color_dct[0][i] += ytox * result.color_dct[1][i]; + result.color_dct[2][i] += ytob * result.color_dct[1][i]; + } + for (int i = 0; i < 32; ++i) { + result.sigma_dct[i] = + sigma_dct_[i] * (i == 0 ? 
1.0f / square_root<2>::value : 1.0f) / + ColorQuantizationWeight(quantization_adjustment, 3, i); + } + + return result; +} + +Status QuantizedSpline::Decode(const std::vector& context_map, + ANSSymbolReader* const decoder, + BitReader* const br, size_t max_control_points, + size_t* total_num_control_points) { + const size_t num_control_points = + decoder->ReadHybridUint(kNumControlPointsContext, br, context_map); + *total_num_control_points += num_control_points; + if (*total_num_control_points > max_control_points) { + return JXL_FAILURE("Too many control points: %zu", + *total_num_control_points); + } + control_points_.resize(num_control_points); + for (std::pair& control_point : control_points_) { + control_point.first = UnpackSigned( + decoder->ReadHybridUint(kControlPointsContext, br, context_map)); + control_point.second = UnpackSigned( + decoder->ReadHybridUint(kControlPointsContext, br, context_map)); + } + + const auto decode_dct = [decoder, br, &context_map](int dct[32]) -> Status { + for (int i = 0; i < 32; ++i) { + dct[i] = + UnpackSigned(decoder->ReadHybridUint(kDCTContext, br, context_map)); + } + return true; + }; + for (int c = 0; c < 3; ++c) { + JXL_RETURN_IF_ERROR(decode_dct(color_dct_[c])); + } + JXL_RETURN_IF_ERROR(decode_dct(sigma_dct_)); + return true; +} + +Status Splines::Decode(jxl::BitReader* br, size_t num_pixels) { + std::vector context_map; + ANSCode code; + JXL_RETURN_IF_ERROR( + DecodeHistograms(br, kNumSplineContexts, &code, &context_map)); + ANSSymbolReader decoder(&code, br); + const size_t num_splines = + 1 + decoder.ReadHybridUint(kNumSplinesContext, br, context_map); + size_t max_control_points = std::min( + kMaxNumControlPoints, num_pixels / kMaxNumControlPointsPerPixelRatio); + if (num_splines > max_control_points) { + return JXL_FAILURE("Too many splines: %zu", num_splines); + } + JXL_RETURN_IF_ERROR(DecodeAllStartingPoints(&starting_points_, br, &decoder, + context_map, num_splines)); + + quantization_adjustment_ = 
UnpackSigned( + decoder.ReadHybridUint(kQuantizationAdjustmentContext, br, context_map)); + + splines_.clear(); + splines_.reserve(num_splines); + size_t num_control_points = num_splines; + for (size_t i = 0; i < num_splines; ++i) { + QuantizedSpline spline; + JXL_RETURN_IF_ERROR(spline.Decode(context_map, &decoder, br, + max_control_points, &num_control_points)); + splines_.push_back(std::move(spline)); + } + + JXL_RETURN_IF_ERROR(decoder.CheckANSFinalState()); + + if (!HasAny()) { + return JXL_FAILURE("Decoded splines but got none"); + } + + return true; +} + +Status Splines::AddTo(Image3F* const opsin, const Rect& opsin_rect, + const Rect& image_rect, + const ColorCorrelationMap& cmap) const { + return Apply(opsin, opsin_rect, image_rect, cmap); +} + +Status Splines::SubtractFrom(Image3F* const opsin, + const ColorCorrelationMap& cmap) const { + return Apply(opsin, Rect(*opsin), Rect(*opsin), cmap); +} + +template +Status Splines::Apply(Image3F* const opsin, const Rect& opsin_rect, + const Rect& image_rect, + const ColorCorrelationMap& cmap) const { + for (size_t i = 0; i < splines_.size(); ++i) { + const Spline spline = + splines_[i].Dequantize(starting_points_[i], quantization_adjustment_, + cmap.YtoXRatio(0), cmap.YtoBRatio(0)); + if (std::adjacent_find(spline.control_points.begin(), + spline.control_points.end()) != + spline.control_points.end()) { + return JXL_FAILURE("identical successive control points in spline %zu", + i); + } + std::vector> points_to_draw; + ForEachEquallySpacedPoint( + DrawCentripetalCatmullRomSpline(spline.control_points), + [&](const Spline::Point& point, const float multiplier) { + points_to_draw.emplace_back(point, multiplier); + }); + const float arc_length = + (points_to_draw.size() - 2) * kDesiredRenderingDistance + + points_to_draw.back().second; + if (arc_length <= 0.f) { + // This spline wouldn't have any effect. 
+ continue; + } + HWY_DYNAMIC_DISPATCH(DrawFromPoints) + (opsin, opsin_rect, image_rect, spline, add, points_to_draw, arc_length); + } + return true; +} + +} // namespace jxl +#endif // HWY_ONCE diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines.h new file mode 100644 index 0000000000..8ec10e928d --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines.h @@ -0,0 +1,124 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_SPLINES_H_ +#define LIB_JXL_SPLINES_H_ + +#include +#include + +#include +#include + +#include "lib/jxl/ans_params.h" +#include "lib/jxl/aux_out.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/chroma_from_luma.h" +#include "lib/jxl/dec_ans.h" +#include "lib/jxl/dec_bit_reader.h" +#include "lib/jxl/entropy_coder.h" +#include "lib/jxl/image.h" + +namespace jxl { + +static constexpr float kDesiredRenderingDistance = 1.f; + +enum SplineEntropyContexts : size_t { + kQuantizationAdjustmentContext = 0, + kStartingPositionContext, + kNumSplinesContext, + kNumControlPointsContext, + kControlPointsContext, + kDCTContext, + kNumSplineContexts +}; + +struct Spline { + struct Point { + Point() : x(0.0f), y(0.0f) {} + Point(float x, float y) : x(x), y(y) {} + float x, y; + bool operator==(const Point& other) const { + return std::fabs(x - other.x) < 1e-3f && std::fabs(y - other.y) < 1e-3f; + } + }; + std::vector control_points; + // X, Y, B. + float color_dct[3][32]; + // Splines are draws by normalized Gaussian splatting. This controls the + // Gaussian's parameter along the spline. 
+ float sigma_dct[32]; +}; + +class QuantizedSplineEncoder; + +class QuantizedSpline { + public: + QuantizedSpline() = default; + explicit QuantizedSpline(const Spline& original, + int32_t quantization_adjustment, float ytox, + float ytob); + + Spline Dequantize(const Spline::Point& starting_point, + int32_t quantization_adjustment, float ytox, + float ytob) const; + + Status Decode(const std::vector& context_map, + ANSSymbolReader* decoder, BitReader* br, + size_t max_control_points, size_t* total_num_control_points); + + private: + friend class QuantizedSplineEncoder; + + std::vector> + control_points_; // Double delta-encoded. + int color_dct_[3][32] = {}; + int sigma_dct_[32] = {}; +}; + +class Splines { + public: + Splines() = default; + explicit Splines(const int32_t quantization_adjustment, + std::vector splines, + std::vector starting_points) + : quantization_adjustment_(quantization_adjustment), + splines_(std::move(splines)), + starting_points_(std::move(starting_points)) {} + + bool HasAny() const { return !splines_.empty(); } + + Status Decode(BitReader* br, size_t num_pixels); + + Status AddTo(Image3F* opsin, const Rect& opsin_rect, const Rect& image_rect, + const ColorCorrelationMap& cmap) const; + Status SubtractFrom(Image3F* opsin, const ColorCorrelationMap& cmap) const; + + const std::vector& QuantizedSplines() const { + return splines_; + } + const std::vector& StartingPoints() const { + return starting_points_; + } + + int32_t GetQuantizationAdjustment() const { return quantization_adjustment_; } + + private: + template + Status Apply(Image3F* opsin, const Rect& opsin_rect, const Rect& image_rect, + const ColorCorrelationMap& cmap) const; + + // If positive, quantization weights are multiplied by 1 + this/8, which + // increases precision. If negative, they are divided by 1 - this/8. If 0, + // they are unchanged. 
+ int32_t quantization_adjustment_ = 0; + std::vector splines_; + std::vector starting_points_; +}; + +} // namespace jxl + +#endif // LIB_JXL_SPLINES_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines_gbench.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines_gbench.cc new file mode 100644 index 0000000000..490bdb00e1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines_gbench.cc @@ -0,0 +1,51 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark/benchmark.h" +#include "lib/jxl/splines.h" + +namespace jxl { +namespace { + +constexpr int kQuantizationAdjustment = 0; +const ColorCorrelationMap* const cmap = new ColorCorrelationMap; +const float kYToX = cmap->YtoXRatio(0); +const float kYToB = cmap->YtoBRatio(0); + +void BM_Splines(benchmark::State& state) { + const size_t n = state.range(); + + std::vector spline_data = { + {/*control_points=*/{ + {9, 54}, {118, 159}, {97, 3}, {10, 40}, {150, 25}, {120, 300}}, + /*color_dct=*/ + {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}}, + /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}}}; + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + + Image3F drawing_area(320, 320); + ZeroFillImage(&drawing_area); + for (auto _ : state) { + for (size_t i = 0; i < n; ++i) { + JXL_CHECK(splines.AddTo(&drawing_area, Rect(drawing_area), + Rect(drawing_area), *cmap)); + } + } + + state.SetItemsProcessed(n * state.iterations()); +} + +BENCHMARK(BM_Splines)->Range(1, 1 << 10); + +} // namespace +} // namespace jxl diff --git 
a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines_test.cc new file mode 100644 index 0000000000..8e6dfc5dd4 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/splines_test.cc @@ -0,0 +1,312 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/splines.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "lib/extras/codec.h" +#include "lib/jxl/enc_butteraugli_comparator.h" +#include "lib/jxl/enc_splines.h" +#include "lib/jxl/image_test_utils.h" +#include "lib/jxl/testdata.h" + +namespace jxl { + +std::ostream& operator<<(std::ostream& os, const Spline::Point& p) { + return os << "(" << p.x << ", " << p.y << ")"; +} + +std::ostream& operator<<(std::ostream& os, const Spline& spline) { + return os << "(spline with " << spline.control_points.size() + << " control points)"; +} + +namespace { + +using ::testing::AllOf; +using ::testing::Field; +using ::testing::FloatNear; +using ::testing::Pointwise; + +constexpr int kQuantizationAdjustment = 0; +const ColorCorrelationMap* const cmap = new ColorCorrelationMap; +const float kYToX = cmap->YtoXRatio(0); +const float kYToB = cmap->YtoBRatio(0); + +constexpr float kTolerance = 0.003125; + +std::vector DequantizeSplines(const Splines& splines) { + const auto& quantized_splines = splines.QuantizedSplines(); + const auto& starting_points = splines.StartingPoints(); + JXL_ASSERT(quantized_splines.size() == starting_points.size()); + + std::vector dequantized; + for (size_t i = 0; i < quantized_splines.size(); ++i) { + dequantized.push_back(quantized_splines[i].Dequantize( + starting_points[i], kQuantizationAdjustment, kYToX, kYToB)); + } + return dequantized; +} + +MATCHER(ControlPointIs, "") { + const Spline::Point& actual = std::get<0>(arg); + const Spline::Point& expected = std::get<1>(arg); + 
return testing::ExplainMatchResult( + AllOf(Field(&Spline::Point::x, FloatNear(expected.x, kTolerance)), + Field(&Spline::Point::y, FloatNear(expected.y, kTolerance))), + actual, result_listener); +} + +MATCHER(ControlPointsMatch, "") { + const Spline& actual = std::get<0>(arg); + const Spline& expected = std::get<1>(arg); + return testing::ExplainMatchResult( + Field(&Spline::control_points, + Pointwise(ControlPointIs(), expected.control_points)), + actual, result_listener); +} + +MATCHER(SplinesMatch, "") { + const Spline& actual = std::get<0>(arg); + const Spline& expected = std::get<1>(arg); + if (!testing::ExplainMatchResult(ControlPointsMatch(), arg, + result_listener)) { + return false; + } + for (int i = 0; i < 3; ++i) { + size_t color_dct_size = + sizeof(expected.color_dct[i]) / sizeof(expected.color_dct[i][0]); + for (size_t j = 0; j < color_dct_size; j++) { + testing::StringMatchResultListener color_dct_listener; + if (!testing::ExplainMatchResult( + FloatNear(expected.color_dct[i][j], kTolerance), + actual.color_dct[i][j], &color_dct_listener)) { + *result_listener << ", where color_dct[" << i << "][" << j + << "] don't match, " << color_dct_listener.str(); + return false; + } + } + } + size_t sigma_dct_size = + sizeof(expected.sigma_dct) / sizeof(expected.sigma_dct[0]); + for (size_t i = 0; i < sigma_dct_size; i++) { + testing::StringMatchResultListener sigma_listener; + if (!testing::ExplainMatchResult( + FloatNear(expected.sigma_dct[i], kTolerance), actual.sigma_dct[i], + &sigma_listener)) { + *result_listener << ", where sigma_dct[" << i << "] don't match, " + << sigma_listener.str(); + return false; + } + } + return true; +} + +} // namespace + +TEST(SplinesTest, Serialization) { + std::vector spline_data = { + {/*control_points=*/{ + {109, 54}, {218, 159}, {80, 3}, {110, 274}, {94, 185}, {17, 277}}, + /*color_dct=*/ + {{36.3, 39.7, 23.2, 67.5, 4.4, 71.5, 62.3, 32.3, 92.2, 10.1, 10.8, + 9.2, 6.1, 10.5, 79.1, 7, 24.6, 90.8, 5.5, 84, 43.8, 49, + 
33.5, 78.9, 54.5, 77.9, 62.1, 51.4, 36.4, 14.3, 83.7, 35.4}, + {9.4, 53.4, 9.5, 74.9, 72.7, 26.7, 7.9, 0.9, 84.9, 23.2, 26.5, + 31.1, 91, 11.7, 74.1, 39.3, 23.7, 82.5, 4.8, 2.7, 61.2, 96.4, + 13.7, 66.7, 62.9, 82.4, 5.9, 98.7, 21.5, 7.9, 51.7, 63.1}, + {48, 39.3, 6.9, 26.3, 33.3, 6.2, 1.7, 98.9, 59.9, 59.6, 95, + 61.3, 82.7, 53, 6.1, 30.4, 34.7, 96.9, 93.4, 17, 38.8, 80.8, + 63, 18.6, 43.6, 32.3, 61, 20.2, 24.3, 28.3, 69.1, 62.4}}, + /*sigma_dct=*/{32.7, 21.5, 44.4, 1.8, 45.8, 90.6, 29.3, 59.2, + 23.7, 85.2, 84.8, 27.2, 42.1, 84.1, 50.6, 17.6, + 93.7, 4.9, 2.6, 69.8, 94.9, 52, 24.3, 18.8, + 12.1, 95.7, 28.5, 81.4, 89.9, 31.4, 74.8, 52}}, + {/*control_points=*/{{172, 309}, + {196, 277}, + {42, 238}, + {114, 350}, + {307, 290}, + {316, 269}, + {124, 66}, + {233, 267}}, + /*color_dct=*/ + {{15, 28.9, 22, 6.6, 41.8, 83, 8.6, 56.8, 68.9, 9.7, 5.4, + 19.8, 70.8, 90, 52.5, 65.2, 7.8, 23.5, 26.4, 72.2, 64.7, 87.1, + 1.3, 67.5, 46, 68.4, 65.4, 35.5, 29.1, 13, 41.6, 23.9}, + {47.7, 79.4, 62.7, 29.1, 96.8, 18.5, 17.6, 15.2, 80.5, 56, 96.2, + 59.9, 26.7, 96.1, 92.3, 42.1, 35.8, 54, 23.2, 55, 76, 35.8, + 58.4, 88.7, 2.4, 78.1, 95.6, 27.5, 6.6, 78.5, 24.1, 69.8}, + {43.8, 96.5, 0.9, 95.1, 49.1, 71.2, 25.1, 33.6, 75.2, 95, 82.1, + 19.7, 10.5, 44.9, 50, 93.3, 83.5, 99.5, 64.6, 54, 3.5, 99.7, + 45.3, 82.1, 22.4, 37.9, 60, 32.2, 12.6, 4.6, 65.5, 96.4}}, + /*sigma_dct=*/{72.5, 2.6, 41.7, 2.2, 39.7, 79.1, 69.6, 19.9, + 92.3, 71.5, 41.9, 62.1, 30, 49.4, 70.3, 45.3, + 62.5, 47.2, 46.7, 41.2, 90.8, 46.8, 91.2, 55, + 8.1, 69.6, 25.4, 84.7, 61.7, 27.6, 3.7, 46.9}}, + {/*control_points=*/{{100, 186}, + {257, 97}, + {170, 49}, + {25, 169}, + {309, 104}, + {232, 237}, + {385, 101}, + {122, 168}, + {26, 300}, + {390, 88}}, + /*color_dct=*/ + {{16.9, 64.8, 4.2, 10.6, 23.5, 17, 79.3, 5.7, 60.4, 16.6, 94.9, + 63.7, 87.6, 10.5, 3.8, 61.1, 22.9, 81.9, 80.4, 40.5, 45.9, 25.4, + 39.8, 30, 50.2, 90.4, 27.9, 93.7, 65.1, 48.2, 22.3, 43.9}, + {24.9, 66, 3.5, 90.2, 97.1, 15.8, 35.6, 0.6, 68, 39.6, 24.4, + 
85.9, 57.7, 77.6, 47.5, 67.9, 4.3, 5.4, 91.2, 58.5, 0.1, 52.2, + 3.5, 47.8, 63.2, 43.5, 85.8, 35.8, 50.2, 35.9, 19.2, 48.2}, + {82.8, 44.9, 76.4, 39.5, 94.1, 14.3, 89.8, 10, 10.5, 74.5, 56.3, + 65.8, 7.8, 23.3, 52.8, 99.3, 56.8, 46, 76.7, 13.5, 67, 22.4, + 29.9, 43.3, 70.3, 26, 74.3, 53.9, 62, 19.1, 49.3, 46.7}}, + /*sigma_dct=*/{83.5, 1.7, 25.1, 18.7, 46.5, 75.3, 28, 62.3, + 50.3, 23.3, 85.6, 96, 45.8, 33.1, 33.4, 52.9, + 26.3, 58.5, 19.6, 70, 92.6, 22.5, 57, 21.6, + 76.8, 87.5, 22.9, 66.3, 35.7, 35.6, 56.8, 67.2}}, + }; + + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + const std::vector quantized_spline_data = DequantizeSplines(splines); + EXPECT_THAT(quantized_spline_data, + Pointwise(ControlPointsMatch(), spline_data)); + + BitWriter writer; + EncodeSplines(splines, &writer, kLayerSplines, HistogramParams(), nullptr); + writer.ZeroPadToByte(); + const size_t bits_written = writer.BitsWritten(); + + printf("Wrote %zu bits of splines.\n", bits_written); + + BitReader reader(writer.GetSpan()); + Splines decoded_splines; + ASSERT_TRUE(decoded_splines.Decode(&reader, /*num_pixels=*/1000)); + ASSERT_TRUE(reader.JumpToByteBoundary()); + EXPECT_EQ(reader.TotalBitsConsumed(), bits_written); + ASSERT_TRUE(reader.Close()); + + const std::vector decoded_spline_data = + DequantizeSplines(decoded_splines); + EXPECT_THAT(decoded_spline_data, + Pointwise(SplinesMatch(), quantized_spline_data)); +} + +#ifdef JXL_CRASH_ON_ERROR +TEST(SplinesTest, DISABLED_TooManySplinesTest) { +#else +TEST(SplinesTest, TooManySplinesTest) { +#endif + // This is more than the limit for 1000 pixels. 
+ const size_t kNumSplines = 300; + + std::vector quantized_splines; + std::vector starting_points; + for (size_t i = 0; i < kNumSplines; i++) { + Spline spline = { + /*control_points=*/{{1.f + i, 2}, {10.f + i, 25}, {30.f + i, 300}}, + /*color_dct=*/ + {{1.f, 0.2f, 0.1f}, {35.7f, 10.3f}, {35.7f, 7.8f}}, + /*sigma_dct=*/{10.f, 0.f, 0.f, 2.f}}; + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + BitWriter writer; + EncodeSplines(splines, &writer, kLayerSplines, + HistogramParams(SpeedTier::kFalcon, 1), nullptr); + writer.ZeroPadToByte(); + // Re-read splines. + BitReader reader(writer.GetSpan()); + Splines decoded_splines; + EXPECT_FALSE(decoded_splines.Decode(&reader, /*num_pixels=*/1000)); + EXPECT_TRUE(reader.Close()); +} + +#ifdef JXL_CRASH_ON_ERROR +TEST(SplinesTest, DISABLED_DuplicatePoints) { +#else +TEST(SplinesTest, DuplicatePoints) { +#endif + std::vector control_points{ + {9, 54}, {118, 159}, {97, 3}, // Repeated. 
+ {97, 3}, {10, 40}, {150, 25}, {120, 300}}; + Spline spline{control_points, + /*color_dct=*/ + {{1.f, 0.2f, 0.1f}, {35.7f, 10.3f}, {35.7f, 7.8f}}, + /*sigma_dct=*/{10.f, 0.f, 0.f, 2.f}}; + std::vector spline_data{spline}; + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + + Image3F image(320, 320); + ZeroFillImage(&image); + EXPECT_FALSE(splines.AddTo(&image, Rect(image), Rect(image), *cmap)); +} + +TEST(SplinesTest, Drawing) { + CodecInOut io_expected; + const PaddedBytes orig = ReadTestData("jxl/splines.png"); + ASSERT_TRUE(SetFromBytes(Span(orig), &io_expected, + /*pool=*/nullptr)); + + std::vector control_points{{9, 54}, {118, 159}, {97, 3}, + {10, 40}, {150, 25}, {120, 300}}; + const Spline spline{ + control_points, + /*color_dct=*/ + {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}}, + /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}}; + std::vector spline_data = {spline}; + std::vector quantized_splines; + std::vector starting_points; + for (const Spline& spline : spline_data) { + quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX, + kYToB); + starting_points.push_back(spline.control_points.front()); + } + Splines splines(kQuantizationAdjustment, std::move(quantized_splines), + std::move(starting_points)); + + Image3F image(320, 320); + ZeroFillImage(&image); + ASSERT_TRUE(splines.AddTo(&image, Rect(image), Rect(image), *cmap)); + + OpsinParams opsin_params{}; + opsin_params.Init(kDefaultIntensityTarget); + (void)OpsinToLinearInplace(&image, /*pool=*/nullptr, opsin_params); + + CodecInOut io_actual; + io_actual.SetFromImage(CopyImage(image), ColorEncoding::LinearSRGB()); + 
ASSERT_TRUE(io_actual.TransformTo(io_expected.Main().c_current())); + + VerifyRelativeError(*io_expected.Main().color(), *io_actual.Main().color(), + 1e-2f, 1e-1f); +} + +} // namespace jxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/test_utils.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/test_utils.h new file mode 100644 index 0000000000..31abf4a96a --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/test_utils.h @@ -0,0 +1,388 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_TEST_UTILS_H_ +#define LIB_JXL_TEST_UTILS_H_ + +// Macros and functions useful for tests. + +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "jxl/codestream_header.h" +#include "lib/jxl/aux_out_fwd.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/codec_in_out.h" +#include "lib/jxl/color_encoding_internal.h" +#include "lib/jxl/common.h" // JPEGXL_ENABLE_TRANSCODE_JPEG +#include "lib/jxl/dec_file.h" +#include "lib/jxl/dec_params.h" +#include "lib/jxl/enc_external_image.h" +#include "lib/jxl/enc_file.h" +#include "lib/jxl/enc_params.h" + +#ifdef JXL_DISABLE_SLOW_TESTS +#define JXL_SLOW_TEST(X) DISABLED_##X +#else +#define JXL_SLOW_TEST(X) X +#endif // JXL_DISABLE_SLOW_TESTS + +#if JPEGXL_ENABLE_TRANSCODE_JPEG +#define JXL_TRANSCODE_JPEG_TEST(X) X +#else +#define JXL_TRANSCODE_JPEG_TEST(X) DISABLED_##X +#endif // JPEGXL_ENABLE_TRANSCODE_JPEG + +#ifdef THREAD_SANITIZER +#define JXL_TSAN_SLOW_TEST(X) DISABLED_##X +#else +#define JXL_TSAN_SLOW_TEST(X) X +#endif // THREAD_SANITIZER + +// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead +// used INSTANTIATE_TEST_CASE_P which is now deprecated. 
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+namespace jxl {
+namespace test {
+
+inline void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info,
+                                           const JxlPixelFormat* pixel_format) {
+  switch (pixel_format->data_type) {
+    case JXL_TYPE_FLOAT:
+      basic_info->bits_per_sample = 32;
+      basic_info->exponent_bits_per_sample = 8;
+      break;
+    case JXL_TYPE_FLOAT16:
+      basic_info->bits_per_sample = 16;
+      basic_info->exponent_bits_per_sample = 5;
+      break;
+    case JXL_TYPE_UINT8:
+      basic_info->bits_per_sample = 8;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_UINT16:
+      basic_info->bits_per_sample = 16;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_UINT32:
+      basic_info->bits_per_sample = 32;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_BOOLEAN:
+      basic_info->bits_per_sample = 1;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+  }
+  if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) {
+    basic_info->alpha_exponent_bits = 0;
+    if (basic_info->bits_per_sample == 32) {
+      basic_info->alpha_bits = 16;
+    } else {
+      basic_info->alpha_bits = basic_info->bits_per_sample;
+    }
+  } else {
+    basic_info->alpha_exponent_bits = 0;
+    basic_info->alpha_bits = 0;
+  }
+}
+
+MATCHER_P(MatchesPrimariesAndTransferFunction, color_encoding, "") {
+  return arg.primaries == color_encoding.primaries &&
+         arg.tf.IsSame(color_encoding.tf);
+}
+
+MATCHER(MatchesPrimariesAndTransferFunction, "") {
+  return testing::ExplainMatchResult(
+      MatchesPrimariesAndTransferFunction(std::get<1>(arg)), std::get<0>(arg),
+      result_listener);
+}
+
+// Returns compressed size [bytes].
+inline size_t Roundtrip(const CodecInOut* io, const CompressParams& cparams,
+                        const DecompressParams& dparams, ThreadPool* pool,
+                        CodecInOut* JXL_RESTRICT io2, AuxOut* aux_out = nullptr) {
+  PaddedBytes compressed;
+
+  std::vector<ColorEncoding> original_metadata_encodings;
+  std::vector<ColorEncoding> original_current_encodings;
+  for (const ImageBundle& ib : io->frames) {
+    // Remember original encoding, will be returned by decoder.
+    original_metadata_encodings.push_back(ib.metadata()->color_encoding);
+    // c_current should not change during encoding.
+    original_current_encodings.push_back(ib.c_current());
+  }
+
+  std::unique_ptr<PassesEncoderState> enc_state =
+      jxl::make_unique<PassesEncoderState>();
+  EXPECT_TRUE(
+      EncodeFile(cparams, io, enc_state.get(), &compressed, aux_out, pool));
+
+  std::vector<ColorEncoding> metadata_encodings_1;
+  for (const ImageBundle& ib1 : io->frames) {
+    metadata_encodings_1.push_back(ib1.metadata()->color_encoding);
+  }
+
+  // Should still be in the same color space after encoding.
+  EXPECT_THAT(metadata_encodings_1,
+              testing::Pointwise(MatchesPrimariesAndTransferFunction(),
+                                 original_metadata_encodings));
+
+  EXPECT_TRUE(DecodeFile(dparams, compressed, io2, pool));
+
+  std::vector<ColorEncoding> metadata_encodings_2;
+  std::vector<ColorEncoding> current_encodings_2;
+  for (const ImageBundle& ib2 : io2->frames) {
+    metadata_encodings_2.push_back(ib2.metadata()->color_encoding);
+    current_encodings_2.push_back(ib2.c_current());
+  }
+
+  EXPECT_THAT(io2->frames, testing::SizeIs(io->frames.size()));
+  // We always produce the original color encoding if a color transform hook is
+  // set.
+  EXPECT_THAT(current_encodings_2,
+              testing::Pointwise(MatchesPrimariesAndTransferFunction(),
+                                 original_current_encodings));
+
+  // Decoder returns the originals passed to the encoder.
+  EXPECT_THAT(metadata_encodings_2,
+              testing::Pointwise(MatchesPrimariesAndTransferFunction(),
+                                 original_metadata_encodings));
+
+  return compressed.size();
+}
+
+inline void CoalesceGIFAnimationWithAlpha(CodecInOut* io) {
+  ImageBundle canvas = io->frames[0].Copy();
+  for (size_t i = 1; i < io->frames.size(); i++) {
+    const ImageBundle& frame = io->frames[i];
+    ImageBundle rendered = canvas.Copy();
+    for (size_t y = 0; y < frame.ysize(); y++) {
+      float* row0 =
+          rendered.color()->PlaneRow(0, frame.origin.y0 + y) + frame.origin.x0;
+      float* row1 =
+          rendered.color()->PlaneRow(1, frame.origin.y0 + y) + frame.origin.x0;
+      float* row2 =
+          rendered.color()->PlaneRow(2, frame.origin.y0 + y) + frame.origin.x0;
+      float* rowa =
+          rendered.alpha()->Row(frame.origin.y0 + y) + frame.origin.x0;
+      const float* row0f = frame.color().PlaneRow(0, y);
+      const float* row1f = frame.color().PlaneRow(1, y);
+      const float* row2f = frame.color().PlaneRow(2, y);
+      const float* rowaf = frame.alpha().Row(y);
+      for (size_t x = 0; x < frame.xsize(); x++) {
+        if (rowaf[x] != 0) {
+          row0[x] = row0f[x];
+          row1[x] = row1f[x];
+          row2[x] = row2f[x];
+          rowa[x] = rowaf[x];
+        }
+      }
+    }
+    if (frame.use_for_next_frame) {
+      canvas = rendered.Copy();
+    }
+    io->frames[i] = std::move(rendered);
+  }
+}
+
+// A POD descriptor of a ColorEncoding. Only used in tests as the return value
+// of AllEncodings().
+struct ColorEncodingDescriptor {
+  ColorSpace color_space;
+  WhitePoint white_point;
+  Primaries primaries;
+  TransferFunction tf;
+  RenderingIntent rendering_intent;
+};
+
+static inline ColorEncoding ColorEncodingFromDescriptor(
+    const ColorEncodingDescriptor& desc) {
+  ColorEncoding c;
+  c.SetColorSpace(desc.color_space);
+  c.white_point = desc.white_point;
+  c.primaries = desc.primaries;
+  c.tf.SetTransferFunction(desc.tf);
+  c.rendering_intent = desc.rendering_intent;
+  return c;
+}
+
+// Define the operator<< for tests.
+static inline ::std::ostream& operator<<(::std::ostream& os,
+                                         const ColorEncodingDescriptor& c) {
+  return os << "ColorEncoding/" << Description(ColorEncodingFromDescriptor(c));
+}
+
+// Returns ColorEncodingDescriptors, which are only used in tests. To obtain a
+// ColorEncoding object call ColorEncodingFromDescriptor and then call
+// ColorEncoding::CreateProfile() on that object to generate a profile.
+inline std::vector<ColorEncodingDescriptor> AllEncodings() {
+  std::vector<ColorEncodingDescriptor> all_encodings;
+  all_encodings.reserve(300);
+  ColorEncoding c;
+
+  for (ColorSpace cs : Values<ColorSpace>()) {
+    if (cs == ColorSpace::kUnknown || cs == ColorSpace::kXYB) continue;
+    c.SetColorSpace(cs);
+
+    for (WhitePoint wp : Values<WhitePoint>()) {
+      if (wp == WhitePoint::kCustom) continue;
+      if (c.ImplicitWhitePoint() && c.white_point != wp) continue;
+      c.white_point = wp;
+
+      for (Primaries primaries : Values<Primaries>()) {
+        if (primaries == Primaries::kCustom) continue;
+        if (!c.HasPrimaries()) continue;
+        c.primaries = primaries;
+
+        for (TransferFunction tf : Values<TransferFunction>()) {
+          if (tf == TransferFunction::kUnknown) continue;
+          if (c.tf.SetImplicit() &&
+              (c.tf.IsGamma() || c.tf.GetTransferFunction() != tf)) {
+            continue;
+          }
+          c.tf.SetTransferFunction(tf);
+
+          for (RenderingIntent ri : Values<RenderingIntent>()) {
+            ColorEncodingDescriptor cdesc;
+            cdesc.color_space = cs;
+            cdesc.white_point = wp;
+            cdesc.primaries = primaries;
+            cdesc.tf = tf;
+            cdesc.rendering_intent = ri;
+            all_encodings.push_back(cdesc);
+          }
+        }
+      }
+    }
+  }
+
+  return all_encodings;
+}
+
+// Returns a test image with some autogenerated pixel content, using 16 bits per
+// channel, big endian order, 1 to 4 channels
+// The seed parameter allows to create images with different pixel content.
+inline std::vector<uint8_t> GetSomeTestImage(size_t xsize, size_t ysize,
+                                             size_t num_channels, uint16_t seed) {
+  // Cause more significant image difference for successive seeds.
+  std::mt19937 std_rng(seed);
+  std::uniform_int_distribution<uint16_t> std_distr(0, 65535);
+
+  // Returns random integer in the closed interval [0, max_value - 1]
+  auto rng = [&std_rng, &std_distr](size_t max_value) -> size_t {
+    return static_cast<size_t>(std_distr(std_rng) / 65536.0f * max_value);
+  };
+
+  // Dark background gradient color
+  uint16_t r0 = rng(32768);
+  uint16_t g0 = rng(32768);
+  uint16_t b0 = rng(32768);
+  uint16_t a0 = rng(32768);
+  uint16_t r1 = rng(32768);
+  uint16_t g1 = rng(32768);
+  uint16_t b1 = rng(32768);
+  uint16_t a1 = rng(32768);
+
+  // Circle with different color
+  size_t circle_x = rng(xsize);
+  size_t circle_y = rng(ysize);
+  size_t circle_r = rng(std::min(xsize, ysize));
+
+  // Rectangle with random noise
+  size_t rect_x0 = rng(xsize);
+  size_t rect_y0 = rng(ysize);
+  size_t rect_x1 = rng(xsize);
+  size_t rect_y1 = rng(ysize);
+  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_x1);
+  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
+
+  size_t num_pixels = xsize * ysize;
+  // 16 bits per channel, big endian, num_channels channels
+  std::vector<uint8_t> pixels(num_pixels * num_channels * 2);
+  // Create pixel content to test, actual content does not matter as long as it
+  // can be compared after roundtrip.
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      uint16_t r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
+      uint16_t g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
+      uint16_t b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
+      uint16_t a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize;
+      // put some shape in there for visual debugging
+      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
+          circle_r * circle_r) {
+        r = (65535 - x * y) ^ seed;
+        g = (x << 8) + y + seed;
+        b = (y << 8) + x * seed;
+        a = 32768 + x * 256 - y;
+      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
+        r = rng(65536);
+        g = rng(65536);
+        b = rng(65536);
+        a = rng(65536);
+      }
+      size_t i = (y * xsize + x) * 2 * num_channels;
+      pixels[i + 0] = (r >> 8);
+      pixels[i + 1] = (r & 255);
+      if (num_channels >= 2) {
+        // This may store what is called 'g' in the alpha channel of a 2-channel
+        // image, but that's ok since the content is arbitrary
+        pixels[i + 2] = (g >> 8);
+        pixels[i + 3] = (g & 255);
+      }
+      if (num_channels >= 3) {
+        pixels[i + 4] = (b >> 8);
+        pixels[i + 5] = (b & 255);
+      }
+      if (num_channels >= 4) {
+        pixels[i + 6] = (a >> 8);
+        pixels[i + 7] = (a & 255);
+      }
+    }
+  }
+  return pixels;
+}
+
+// Returns a CodecInOut based on the buf, xsize, ysize, and the assumption
+// that the buffer was created using `GetSomeTestImage`.
+inline jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector<uint8_t>& buf,
+                                                 size_t num_channels, size_t xsize,
+                                                 size_t ysize) {
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetAlphaBits(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(
+      /*is_gray=*/num_channels == 1 || num_channels == 2);
+  EXPECT_TRUE(ConvertFromExternal(
+      jxl::Span<const uint8_t>(buf.data(), buf.size()), xsize, ysize,
+      jxl::ColorEncoding::SRGB(/*is_gray=*/num_channels == 1 ||
+                               num_channels == 2),
+      /*has_alpha=*/num_channels == 2 || num_channels == 4,
+      /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
+      /*flipped_y=*/false, /*pool=*/nullptr,
+      /*ib=*/&io.Main()));
+  return io;
+}
+
+}  // namespace test
+
+inline bool operator==(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) {
+  if (a.size() != b.size()) return false;
+  if (memcmp(a.data(), b.data(), a.size()) != 0) return false;
+  return true;
+}
+
+// Allow using EXPECT_EQ on jxl::PaddedBytes
+inline bool operator!=(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) {
+  return !(a == b);
+}
+}  // namespace jxl
+
+#endif  // LIB_JXL_TEST_UTILS_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/testdata.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/testdata.h
new file mode 100644
index 0000000000..28d1015d0b
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/testdata.h
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TESTDATA_H_
+#define LIB_JXL_TESTDATA_H_
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#endif
+
+#include <string>
+
+#include "lib/jxl/base/file_io.h"
+
+namespace jxl {
+
+static inline PaddedBytes ReadTestData(const std::string& filename) {
+  std::string full_path = std::string(TEST_DATA_PATH "/") + filename;
+  PaddedBytes data;
+  bool ok = ReadFile(full_path, &data);
+#ifdef __EMSCRIPTEN__
+  // Fallback in case FS is not supported in current JS engine.
+  if (!ok) {
+    // {size_t size, uint8_t* bytes} pair.
+    uint32_t size_bytes[2] = {0, 0};
+    EM_ASM(
+        {
+          let buffer = null;
+          try {
+            buffer = readbuffer(UTF8ToString($0));
+          } catch {
+          }
+          if (!buffer) return;
+          let bytes = new Uint8Array(buffer);
+          let size = bytes.length;
+          let out = _malloc(size);
+          if (!out) return;
+          HEAP8.set(bytes, out);
+          HEAP32[$1 >> 2] = size;
+          HEAP32[($1 + 4) >> 2] = out;
+        },
+        full_path.c_str(), size_bytes);
+    size_t size = size_bytes[0];
+    uint8_t* bytes = reinterpret_cast<uint8_t*>(size_bytes[1]);
+    if (size) {
+      data.append(bytes, bytes + size);
+      free(reinterpret_cast<void*>(bytes));
+      ok = true;
+    }
+  }
+#endif
+  JXL_CHECK(ok);
+  return data;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_TESTDATA_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/tf_gbench.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/tf_gbench.cc
new file mode 100644
index 0000000000..9c010d460a
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/tf_gbench.cc
@@ -0,0 +1,143 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/image_ops.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/tf_gbench.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+#define RUN_BENCHMARK(F) \
+  constexpr size_t kNum = 1 << 12; \
+  HWY_FULL(float) d; \
+  /* Three parallel runs, as this will run on R, G and B. */ \
+  auto sum1 = Zero(d); \
+  auto sum2 = Zero(d); \
+  auto sum3 = Zero(d); \
+  for (auto _ : state) { \
+    auto x = Set(d, 1e-5); \
+    auto v1 = Set(d, 1e-5); \
+    auto v2 = Set(d, 1.1e-5); \
+    auto v3 = Set(d, 1.2e-5); \
+    for (size_t i = 0; i < kNum; i++) { \
+      sum1 += F(d, v1); \
+      sum2 += F(d, v2); \
+      sum3 += F(d, v3); \
+      v1 += x; \
+      v2 += x; \
+      v3 += x; \
+    } \
+  } \
+  /* floats per second */ \
+  state.SetItemsProcessed(kNum* state.iterations() * Lanes(d) * 3); \
+  benchmark::DoNotOptimize(sum1 + sum2 + sum3);
+
+#define RUN_BENCHMARK_SCALAR(F) \
+  constexpr size_t kNum = 1 << 12; \
+  /* Three parallel runs, as this will run on R, G and B. */ \
+  float sum1 = 0, sum2 = 0, sum3 = 0; \
+  for (auto _ : state) { \
+    float x = 1e-5; \
+    float v1 = 1e-5; \
+    float v2 = 1.1e-5; \
+    float v3 = 1.2e-5; \
+    for (size_t i = 0; i < kNum; i++) { \
+      sum1 += F(v1); \
+      sum2 += F(v2); \
+      sum3 += F(v3); \
+      v1 += x; \
+      v2 += x; \
+      v3 += x; \
+    } \
+  } \
+  /* floats per second */ \
+  state.SetItemsProcessed(kNum* state.iterations() * 3); \
+  benchmark::DoNotOptimize(sum1 + sum2 + sum3);
+
+HWY_NOINLINE void BM_FastSRGB(benchmark::State& state) {
+  RUN_BENCHMARK(FastLinearToSRGB);
+}
+
+HWY_NOINLINE void BM_TFSRGB(benchmark::State& state) {
+  RUN_BENCHMARK(TF_SRGB().EncodedFromDisplay);
+}
+
+HWY_NOINLINE void BM_PQDFE(benchmark::State& state) {
+  RUN_BENCHMARK(TF_PQ().DisplayFromEncoded);
+}
+
+HWY_NOINLINE void BM_PQEFD(benchmark::State& state) {
+  RUN_BENCHMARK(TF_PQ().EncodedFromDisplay);
+}
+
+HWY_NOINLINE void BM_PQSlowDFE(benchmark::State& state) {
+  RUN_BENCHMARK_SCALAR(TF_PQ().DisplayFromEncoded);
+}
+
+HWY_NOINLINE void BM_PQSlowEFD(benchmark::State& state) {
+  RUN_BENCHMARK_SCALAR(TF_PQ().EncodedFromDisplay);
+}
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+
+HWY_EXPORT(BM_FastSRGB);
+HWY_EXPORT(BM_TFSRGB);
+HWY_EXPORT(BM_PQDFE);
+HWY_EXPORT(BM_PQEFD);
+HWY_EXPORT(BM_PQSlowDFE);
+HWY_EXPORT(BM_PQSlowEFD);
+
+float SRGB_pow(float x) {
+  return x < 0.0031308f ? 12.92f * x : 1.055f * powf(x, 1.0f / 2.4f) - 0.055f;
+}
+
+void BM_FastSRGB(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_FastSRGB)(state);
+}
+void BM_TFSRGB(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_TFSRGB)(state);
+}
+void BM_PQDFE(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQDFE)(state);
+}
+void BM_PQEFD(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQEFD)(state);
+}
+void BM_PQSlowDFE(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQSlowDFE)(state);
+}
+void BM_PQSlowEFD(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQSlowEFD)(state);
+}
+
+void BM_SRGB_pow(benchmark::State& state) { RUN_BENCHMARK_SCALAR(SRGB_pow); }
+
+BENCHMARK(BM_FastSRGB);
+BENCHMARK(BM_TFSRGB);
+BENCHMARK(BM_SRGB_pow);
+BENCHMARK(BM_PQDFE);
+BENCHMARK(BM_PQEFD);
+BENCHMARK(BM_PQSlowDFE);
+BENCHMARK(BM_PQSlowEFD);
+
+}  // namespace
+}  // namespace jxl
+#endif
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc
new file mode 100644
index 0000000000..3a2193e42d
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc.cc
@@ -0,0 +1,97 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/toc.h"
+
+#include <stdint.h>
+
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+size_t MaxBits(const size_t num_sizes) {
+  const size_t entry_bits = U32Coder::MaxEncodedBits(kTocDist) * num_sizes;
+  // permutation bit (not its tokens!), padding, entries, padding.
+  return 1 + kBitsPerByte + entry_bits + kBitsPerByte;
+}
+
+Status ReadGroupOffsets(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+                        std::vector<uint64_t>* JXL_RESTRICT offsets,
+                        std::vector<uint32_t>* JXL_RESTRICT sizes,
+                        uint64_t* total_size) {
+  if (toc_entries > 65536) {
+    // Prevent out of memory if invalid JXL codestream causes a bogus amount
+    // of toc_entries such as 2720436919446 to be computed.
+    // TODO(lode): verify whether 65536 is a reasonable upper bound
+    return JXL_FAILURE("too many toc entries");
+  }
+
+  const auto check_bit_budget = [&](size_t num_entries) -> Status {
+    // U32Coder reads 2 bits to recognize variant and kTocDist cheapest variant
+    // is Bits(10), this way at least 12 bits are required per toc-entry.
+    size_t minimal_bit_cost = num_entries * (2 + 10);
+    size_t bit_budget = reader->TotalBytes() * 8;
+    size_t expenses = reader->TotalBitsConsumed();
+    if ((expenses <= bit_budget) &&
+        (minimal_bit_cost <= bit_budget - expenses)) {
+      return true;
+    }
+    return JXL_STATUS(StatusCode::kNotEnoughBytes, "Not enough bytes for TOC");
+  };
+
+  JXL_DASSERT(offsets != nullptr && sizes != nullptr);
+  std::vector<coeff_order_t> permutation;
+  if (reader->ReadFixedBits<1>() == 1 && toc_entries > 0) {
+    // Skip permutation description if the toc_entries is 0.
+    JXL_RETURN_IF_ERROR(check_bit_budget(toc_entries));
+    permutation.resize(toc_entries);
+    JXL_RETURN_IF_ERROR(
+        DecodePermutation(/*skip=*/0, toc_entries, permutation.data(), reader));
+  }
+
+  JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary());
+  JXL_RETURN_IF_ERROR(check_bit_budget(toc_entries));
+  sizes->clear();
+  sizes->reserve(toc_entries);
+  for (size_t i = 0; i < toc_entries; ++i) {
+    sizes->push_back(U32Coder::Read(kTocDist, reader));
+  }
+  JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary());
+  JXL_RETURN_IF_ERROR(check_bit_budget(0));
+
+  // Prefix sum starting with 0 and ending with the offset of the last group
+  offsets->clear();
+  offsets->reserve(toc_entries);
+  uint64_t offset = 0;
+  for (size_t i = 0; i < toc_entries; ++i) {
+    if (offset + (*sizes)[i] < offset) {
+      return JXL_FAILURE("group offset overflow");
+    }
+    offsets->push_back(offset);
+    offset += (*sizes)[i];
+  }
+  if (total_size) {
+    *total_size = offset;
+  }
+
+  if (!permutation.empty()) {
+    std::vector<uint64_t> permuted_offsets;
+    std::vector<uint32_t> permuted_sizes;
+    permuted_offsets.reserve(toc_entries);
+    permuted_sizes.reserve(toc_entries);
+    for (coeff_order_t index : permutation) {
+      permuted_offsets.push_back((*offsets)[index]);
+      permuted_sizes.push_back((*sizes)[index]);
+    }
+    std::swap(*offsets, permuted_offsets);
+    std::swap(*sizes, permuted_sizes);
+  }
+
+  return true;
+}
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc.h
new file mode 100644
index 0000000000..ffebdf9115
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc.h
@@ -0,0 +1,50 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TOC_H_
+#define LIB_JXL_TOC_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+// (2+bits) = 2,3,4 bytes so encoders can patch TOC after encoding.
+// 30 is sufficient for 4K channels of uncompressed 16-bit samples.
+constexpr U32Enc kTocDist(Bits(10), BitsOffset(14, 1024), BitsOffset(22, 17408),
+                          BitsOffset(30, 4211712));
+
+size_t MaxBits(const size_t num_sizes);
+
+// TODO(veluca): move these to FrameDimensions.
+static JXL_INLINE size_t AcGroupIndex(size_t pass, size_t group,
+                                      size_t num_groups, size_t num_dc_groups,
+                                      bool has_ac_global) {
+  return 1 + num_dc_groups + static_cast<size_t>(has_ac_global) +
+         pass * num_groups + group;
+}
+
+static JXL_INLINE size_t NumTocEntries(size_t num_groups, size_t num_dc_groups,
+                                       size_t num_passes, bool has_ac_global) {
+  if (num_groups == 1 && num_passes == 1) return 1;
+  return AcGroupIndex(0, 0, num_groups, num_dc_groups, has_ac_global) +
+         num_groups * num_passes;
+}
+
+Status ReadGroupOffsets(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+                        std::vector<uint64_t>* JXL_RESTRICT offsets,
+                        std::vector<uint32_t>* JXL_RESTRICT sizes,
+                        uint64_t* total_size);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_TOC_H_
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc_test.cc
new file mode 100644
index 0000000000..ef27320926
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/toc_test.cc
@@ -0,0 +1,93 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/toc.h"
+
+#include <random>
+
+#include "gtest/gtest.h"
+#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_toc.h"
+
+namespace jxl {
+namespace {
+
+void Roundtrip(size_t num_entries, bool permute, std::mt19937* rng) {
+  // Generate a random permutation.
+  std::vector<coeff_order_t> permutation(num_entries);
+  std::vector<coeff_order_t> inv_permutation(num_entries);
+  for (size_t i = 0; i < num_entries; i++) {
+    permutation[i] = i;
+    inv_permutation[i] = i;
+  }
+  if (permute) {
+    std::shuffle(permutation.begin(), permutation.end(), *rng);
+    for (size_t i = 0; i < num_entries; i++) {
+      inv_permutation[permutation[i]] = i;
+    }
+  }
+
+  // Generate num_entries groups of random (byte-aligned) length
+  std::vector<BitWriter> group_codes(num_entries);
+  for (BitWriter& writer : group_codes) {
+    const size_t max_bits = (*rng)() & 0xFFF;
+    BitWriter::Allotment allotment(&writer, max_bits + kBitsPerByte);
+    size_t i = 0;
+    for (; i + BitWriter::kMaxBitsPerCall < max_bits;
+         i += BitWriter::kMaxBitsPerCall) {
+      writer.Write(BitWriter::kMaxBitsPerCall, 0);
+    }
+    for (; i < max_bits; i += 1) {
+      writer.Write(/*n_bits=*/1, 0);
+    }
+    writer.ZeroPadToByte();
+    AuxOut aux_out;
+    ReclaimAndCharge(&writer, &allotment, 0, &aux_out);
+  }
+
+  BitWriter writer;
+  AuxOut aux_out;
+  ASSERT_TRUE(WriteGroupOffsets(group_codes, permute ? &permutation : nullptr,
+                                &writer, &aux_out));
+
+  BitReader reader(writer.GetSpan());
+  std::vector<uint64_t> group_offsets;
+  std::vector<uint32_t> group_sizes;
+  uint64_t total_size;
+  ASSERT_TRUE(ReadGroupOffsets(num_entries, &reader, &group_offsets,
+                               &group_sizes, &total_size));
+  ASSERT_EQ(num_entries, group_offsets.size());
+  ASSERT_EQ(num_entries, group_sizes.size());
+  EXPECT_TRUE(reader.Close());
+
+  uint64_t prefix_sum = 0;
+  for (size_t i = 0; i < num_entries; ++i) {
+    EXPECT_EQ(prefix_sum, group_offsets[inv_permutation[i]]);
+
+    EXPECT_EQ(0, group_codes[i].BitsWritten() % kBitsPerByte);
+    prefix_sum += group_codes[i].BitsWritten() / kBitsPerByte;
+
+    if (i + 1 < num_entries) {
+      EXPECT_EQ(
+          group_offsets[inv_permutation[i]] + group_sizes[inv_permutation[i]],
+          group_offsets[inv_permutation[i + 1]]);
+    }
+  }
+  EXPECT_EQ(prefix_sum, total_size);
+}
+
+TEST(TocTest, Test) {
+  std::mt19937 rng(12345);
+  for (size_t num_entries = 0; num_entries < 10; ++num_entries) {
+    for (bool permute : std::vector<bool>{false, true}) {
+      Roundtrip(num_entries, permute, &rng);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/transfer_functions-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/transfer_functions-inl.h
new file mode 100644
index 0000000000..43069ac0be
--- /dev/null
+++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/transfer_functions-inl.h
@@ -0,0 +1,397 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Transfer functions for color encodings.
+ +#if defined(LIB_JXL_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_ +#undef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_ +#else +#define LIB_JXL_TRANSFER_FUNCTIONS_INL_H_ +#endif + +#include +#include +#include + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" +#include "lib/jxl/rational_polynomial-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// Definitions for BT.2100-2 transfer functions (used inside/outside SIMD): +// "display" is linear light (nits) normalized to [0, 1]. +// "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1]. +// "scene" is a linear function of photon counts, normalized to [0, 1]. + +// Despite the stated ranges, we need unbounded transfer functions: see +// http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or +// above 1 due to chromatic adaptation. To avoid severe round-trip errors caused +// by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see +// https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb) +// and extend the function domains above 1. + +// Hybrid Log-Gamma. +class TF_HLG { + public: + // EOTF. e = encoded. + JXL_INLINE double DisplayFromEncoded(const double e) const { + const double lifted = e * (1.0 - kBeta) + kBeta; + return OOTF(InvOETF(lifted)); + } + + // Inverse EOTF. d = display. + JXL_INLINE double EncodedFromDisplay(const double d) const { + const double lifted = OETF(InvOOTF(d)); + const double e = (lifted - kBeta) * (1.0 / (1.0 - kBeta)); + return e; + } + + // Maximum error 5e-7. 
+ template + JXL_INLINE V EncodedFromDisplay(D d, V x) const { + const hwy::HWY_NAMESPACE::Rebind du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + const V below_div12 = Sqrt(Set(d, 3.0f) * x); + const V e = + MulAdd(Set(d, kA * 0.693147181f), + FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC)); + const V magnitude = IfThenElse(x <= Set(d, kDiv12), below_div12, e); + const V lifted = Or(AndNot(kSign, magnitude), original_sign); + const V kMul = Set(d, 1.0f / (1.0f - kBeta)); + const V kAdd = Set(d, -kBeta / (1.0f - kBeta)); + return MulAdd(kMul, lifted, kAdd); + } + + private: + // OETF (defines the HLG approach). s = scene, returns encoded. + JXL_INLINE double OETF(double s) const { + if (s == 0.0) return 0.0; + const double original_sign = s; + s = std::abs(s); + + if (s <= kDiv12) return copysignf(std::sqrt(3.0 * s), original_sign); + + const double e = kA * std::log(12 * s - kB) + kC; + JXL_ASSERT(e > 0.0); + return copysignf(e, original_sign); + } + + // e = encoded, returns scene. + JXL_INLINE double InvOETF(double e) const { + if (e == 0.0) return 0.0; + const double original_sign = e; + e = std::abs(e); + + if (e <= 0.5) return copysignf(e * e * (1.0 / 3), original_sign); + + const double s = (std::exp((e - kC) * kRA) + kB) * kDiv12; + JXL_ASSERT(s >= 0); + return copysignf(s, original_sign); + } + + // s = scene, returns display. + JXL_INLINE double OOTF(const double s) const { + // The actual (red channel) OOTF is RD = alpha * YS^(gamma-1) * RS, where + // YS = 0.2627 * RS + 0.6780 * GS + 0.0593 * BS. Let alpha = 1 so we return + // "display" (normalized [0, 1]) instead of nits. Our transfer function + // interface does not allow a dependency on YS. Fortunately, the system + // gamma at 334 nits is 1.0, so this reduces to RD = RS. + return s; + } + + // d = display, returns scene. + JXL_INLINE double InvOOTF(const double d) const { + return d; // see OOTF(). 
+ } + + // Assume 1000:1 contrast @ 200 nits => gamma 0.9 + static constexpr double kBeta = 0.04; // = sqrt(3 * contrast^(1/gamma)) + + static constexpr double kA = 0.17883277; + static constexpr double kRA = 1.0 / kA; + static constexpr double kB = 1 - 4 * kA; + static constexpr double kC = 0.5599107295; + static constexpr double kDiv12 = 1.0 / 12; +}; + +class TF_709 { + public: + JXL_INLINE double EncodedFromDisplay(const double d) const { + if (d < kThresh) return kMulLow * d; + return kMulHi * std::pow(d, kPowHi) + kSub; + } + + // Maximum error 1e-6. + template + JXL_INLINE V EncodedFromDisplay(D d, V x) const { + auto low = Set(d, kMulLow) * x; + auto hi = + MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub)); + return IfThenElse(x <= Set(d, kThresh), low, hi); + } + + private: + static constexpr double kThresh = 0.018; + static constexpr double kMulLow = 4.5; + static constexpr double kMulHi = 1.099; + static constexpr double kPowHi = 0.45; + static constexpr double kSub = -0.099; +}; + +// Perceptual Quantization +class TF_PQ { + public: + // EOTF (defines the PQ approach). e = encoded. + JXL_INLINE double DisplayFromEncoded(double e) const { + if (e == 0.0) return 0.0; + const double original_sign = e; + e = std::abs(e); + + const double xp = std::pow(e, 1.0 / kM2); + const double num = std::max(xp - kC1, 0.0); + const double den = kC2 - kC3 * xp; + JXL_DASSERT(den != 0.0); + const double d = std::pow(num / den, 1.0 / kM1); + JXL_DASSERT(d >= 0.0); // Equal for e ~= 1E-9 + return copysignf(d, original_sign); + } + + // Maximum error 3e-6 + template + JXL_INLINE V DisplayFromEncoded(D d, V x) const { + const hwy::HWY_NAMESPACE::Rebind du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + // 4-over-4-degree rational polynomial approximation on x+x*x. This improves + // the maximum error by about 5x over a rational polynomial for x. 
+ auto xpxx = MulAdd(x, x, x); + HWY_ALIGN constexpr float p[(4 + 1) * 4] = { + HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f), + HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f), + HWY_REP4(5.50034862e-01f), + }; + HWY_ALIGN constexpr float q[(4 + 1) * 4] = { + HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f), + HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f), + HWY_REP4(2.67718770e+00f), + }; + auto magnitude = EvalRationalPolynomial(d, xpxx, p, q); + return Or(AndNot(kSign, magnitude), original_sign); + } + + // Inverse EOTF. d = display. + JXL_INLINE double EncodedFromDisplay(double d) const { + if (d == 0.0) return 0.0; + const double original_sign = d; + d = std::abs(d); + + const double xp = std::pow(d, kM1); + const double num = kC1 + xp * kC2; + const double den = 1.0 + xp * kC3; + const double e = std::pow(num / den, kM2); + JXL_DASSERT(e > 0.0); + return copysignf(e, original_sign); + } + + // Maximum error 7e-7. + template + JXL_INLINE V EncodedFromDisplay(D d, V x) const { + const hwy::HWY_NAMESPACE::Rebind du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + // 4-over-4-degree rational polynomial approximation on x**0.25, with two + // different polynomials above and below 1e-4. 
+ auto xto025 = Sqrt(Sqrt(x)); + HWY_ALIGN constexpr float p[(4 + 1) * 4] = { + HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f), + HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f), + HWY_REP4(4.838434e+01f), + }; + HWY_ALIGN constexpr float q[(4 + 1) * 4] = { + HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f), + HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f), + HWY_REP4(2.590418e+01f), + }; + + HWY_ALIGN constexpr float plo[(4 + 1) * 4] = { + HWY_REP4(9.863406e-06f), HWY_REP4(3.881234e-01f), + HWY_REP4(1.352821e+02f), HWY_REP4(6.889862e+04f), + HWY_REP4(-2.864824e+05f), + }; + HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = { + HWY_REP4(3.371868e+01f), HWY_REP4(1.477719e+03f), + HWY_REP4(1.608477e+04f), HWY_REP4(-4.389884e+04f), + HWY_REP4(-2.072546e+05f), + }; + + auto magnitude = IfThenElse(x < Set(d, 1e-4f), + EvalRationalPolynomial(d, xto025, plo, qlo), + EvalRationalPolynomial(d, xto025, p, q)); + return Or(AndNot(kSign, magnitude), original_sign); + } + + private: + static constexpr double kM1 = 2610.0 / 16384; + static constexpr double kM2 = (2523.0 / 4096) * 128; + static constexpr double kC1 = 3424.0 / 4096; + static constexpr double kC2 = (2413.0 / 4096) * 32; + static constexpr double kC3 = (2392.0 / 4096) * 32; +}; + +// sRGB +class TF_SRGB { + public: + template + JXL_INLINE V DisplayFromEncoded(V x) const { + const HWY_FULL(float) d; + const HWY_FULL(uint32_t) du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + + // TODO(janwas): range reduction + // Computed via af_cheb_rational (k=100); replicated 4x. 
+ HWY_ALIGN constexpr float p[(4 + 1) * 4] = { + 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, + 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, + 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, + 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, + 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, + }; + HWY_ALIGN constexpr float q[(4 + 1) * 4] = { + 2.631846970e-01f, 2.631846970e-01f, 2.631846970e-01f, + 2.631846970e-01f, 1.076976492e+00f, 1.076976492e+00f, + 1.076976492e+00f, 1.076976492e+00f, 4.987528350e-01f, + 4.987528350e-01f, 4.987528350e-01f, 4.987528350e-01f, + -5.512498495e-02f, -5.512498495e-02f, -5.512498495e-02f, + -5.512498495e-02f, 6.521209011e-03f, 6.521209011e-03f, + 6.521209011e-03f, 6.521209011e-03f, + }; + const V linear = x * Set(d, kLowDivInv); + const V poly = EvalRationalPolynomial(d, x, p, q); + const V magnitude = + IfThenElse(x > Set(d, kThreshSRGBToLinear), poly, linear); + return Or(AndNot(kSign, magnitude), original_sign); + } + + // Error ~5e-07 + template + JXL_INLINE V EncodedFromDisplay(D d, V x) const { + const hwy::HWY_NAMESPACE::Rebind du; + const V kSign = BitCast(d, Set(du, 0x80000000u)); + const V original_sign = And(x, kSign); + x = AndNot(kSign, x); // abs + + // Computed via af_cheb_rational (k=100); replicated 4x. 
+ HWY_ALIGN constexpr float p[(4 + 1) * 4] = { + -5.135152395e-04f, -5.135152395e-04f, -5.135152395e-04f, + -5.135152395e-04f, 5.287254571e-03f, 5.287254571e-03f, + 5.287254571e-03f, 5.287254571e-03f, 3.903842876e-01f, + 3.903842876e-01f, 3.903842876e-01f, 3.903842876e-01f, + 1.474205315e+00f, 1.474205315e+00f, 1.474205315e+00f, + 1.474205315e+00f, 7.352629620e-01f, 7.352629620e-01f, + 7.352629620e-01f, 7.352629620e-01f, + }; + HWY_ALIGN constexpr float q[(4 + 1) * 4] = { + 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, + 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, + 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, + 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, + 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, + }; + const V linear = x * Set(d, kLowDiv); + const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q); + const V magnitude = + IfThenElse(x > Set(d, kThreshLinearToSRGB), poly, linear); + return Or(AndNot(kSign, magnitude), original_sign); + } + + private: + static constexpr float kThreshSRGBToLinear = 0.04045f; + static constexpr float kThreshLinearToSRGB = 0.0031308f; + static constexpr float kLowDiv = 12.92f; + static constexpr float kLowDivInv = 1.0f / kLowDiv; +}; + +// Linear to sRGB conversion with error of at most 1.2e-4. +template +V FastLinearToSRGB(D d, V v) { + const hwy::HWY_NAMESPACE::Rebind du; + const hwy::HWY_NAMESPACE::Rebind di; + // Convert to 0.25 - 0.5 range. + auto v025_05 = + BitCast(d, (BitCast(du, v) | Set(du, 0x3e800000)) & Set(du, 0x3effffff)); + // third degree polynomial approximation between 0.25 and 0.5 + // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves + // accuracy by about 3x. 
+ auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f)); + auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f)); + auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f)); + // Compute extra multiplier depending on exponent. Valid exponent range for + // [0.0031308f, 1.0) is 0...8 after subtracting 118. + // The next three constants contain a representation of the powers of + // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are + // always the same and in k2to512powers_basebits, and the two arrays contain + // the next groups of 8 bits. This ends up being a 22-bit representation (with + // a mantissa of 13 bits). The choice of polynomial to approximate is such + // that the multiplication factor has the highest 5 bits constant, and that + // the factor for the lowest possible exponent is a power of two (thus making + // the additional bits 0, which is used to correctly merge back together the + // floats). + constexpr uint32_t k2to512powers_basebits = 0x40000000; + HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = { + 0x0, 0xa, 0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c, + 0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6, + }; + HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = { + 0x0, 0xb7, 0x4, 0xd, 0xcb, 0xe7, 0x41, 0x68, + 0x51, 0xd1, 0xeb, 0xf2, 0x0, 0xb7, 0x4, 0xd, + }; + // Note that vld1q_s8_x2 on ARM seems to actually be slower. +#if HWY_TARGET != HWY_SCALAR + using hwy::HWY_NAMESPACE::ShiftLeft; + using hwy::HWY_NAMESPACE::ShiftRight; + // Every lane of exp is now (if cast to byte) {0, 0, 0, }. + auto exp = ShiftRight<23>(BitCast(di, v)) - Set(di, 118); + auto pow25to18bits = TableLookupBytes( + LoadDup128(di, + reinterpret_cast(k2to512powers_25to18bits)), + exp); + auto pow17to10bits = TableLookupBytes( + LoadDup128(di, + reinterpret_cast(k2to512powers_17to10bits)), + exp); + // Now, pow* contain {0, 0, 0, }. Here + // we take advantage of the fact that each table has its position 0 equal to + // 0. 
+ // We can now just reassemble the float. + auto mul = + BitCast(d, ShiftLeft<18>(pow25to18bits) | ShiftLeft<10>(pow17to10bits) | + Set(di, k2to512powers_basebits)); +#else + // Fallback for scalar. + uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf; + auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) | + (k2to512powers_17to10bits[exp] << 10) | + k2to512powers_basebits)); +#endif + return IfThenElse(v < Set(d, 0.0031308f), v * Set(d, 12.92f), + MulAdd(pow, mul, Set(d, -0.055))); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_TRANSFER_FUNCTIONS_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/transpose-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/transpose-inl.h new file mode 100644 index 0000000000..d12b1295e8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/transpose-inl.h @@ -0,0 +1,201 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Block transpose for DCT/IDCT + +#if defined(LIB_JXL_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_TRANSPOSE_INL_H_ +#undef LIB_JXL_TRANSPOSE_INL_H_ +#else +#define LIB_JXL_TRANSPOSE_INL_H_ +#endif + +#include + +#include +#include + +#include "lib/jxl/base/status.h" +#include "lib/jxl/dct_block-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +#ifndef JXL_INLINE_TRANSPOSE +// Workaround for issue #42 - (excessive?) inlining causes invalid codegen. +#if defined(__arm__) +#define JXL_INLINE_TRANSPOSE HWY_NOINLINE +#else +#define JXL_INLINE_TRANSPOSE HWY_INLINE +#endif +#endif // JXL_INLINE_TRANSPOSE + +// Simple wrapper that ensures that a function will not be inlined. +template +JXL_NOINLINE void NoInlineWrapper(const T& f, const Args&... 
args) { + return f(args...); +} + +template +struct TransposeSimdTag {}; + +// TODO(veluca): it's not super useful to have this in the SIMD namespace. +template +JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag, + const From& from, const To& to, + size_t ROWSp, size_t COLSp) { + size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0; + size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0; + for (size_t n = 0; n < ROWS; ++n) { + for (size_t m = 0; m < COLS; ++m) { + to.Write(from.Read(n, m), m, n); + } + } +} + +// TODO(veluca): AVX3? +#if HWY_CAP_GE256 +constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) { + return ROWS % 8 == 0 && COLS % 8 == 0; +} + +template +JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag, + const From& from, const To& to, + size_t ROWSp, size_t COLSp) { + size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0; + size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0; + static_assert(MaxLanes(BlockDesc<8>()) == 8, "Invalid descriptor size"); + static_assert(ROWS_or_0 % 8 == 0, "Invalid number of rows"); + static_assert(COLS_or_0 % 8 == 0, "Invalid number of columns"); + for (size_t n = 0; n < ROWS; n += 8) { + for (size_t m = 0; m < COLS; m += 8) { + auto i0 = from.LoadPart(BlockDesc<8>(), n + 0, m + 0); + auto i1 = from.LoadPart(BlockDesc<8>(), n + 1, m + 0); + auto i2 = from.LoadPart(BlockDesc<8>(), n + 2, m + 0); + auto i3 = from.LoadPart(BlockDesc<8>(), n + 3, m + 0); + auto i4 = from.LoadPart(BlockDesc<8>(), n + 4, m + 0); + auto i5 = from.LoadPart(BlockDesc<8>(), n + 5, m + 0); + auto i6 = from.LoadPart(BlockDesc<8>(), n + 6, m + 0); + auto i7 = from.LoadPart(BlockDesc<8>(), n + 7, m + 0); + // Surprisingly, this straightforward implementation (24 cycles on port5) + // is faster than load128+insert and LoadDup128+ConcatUpperLower+blend. 
+ const auto q0 = InterleaveLower(i0, i2); + const auto q1 = InterleaveLower(i1, i3); + const auto q2 = InterleaveUpper(i0, i2); + const auto q3 = InterleaveUpper(i1, i3); + const auto q4 = InterleaveLower(i4, i6); + const auto q5 = InterleaveLower(i5, i7); + const auto q6 = InterleaveUpper(i4, i6); + const auto q7 = InterleaveUpper(i5, i7); + + const auto r0 = InterleaveLower(q0, q1); + const auto r1 = InterleaveUpper(q0, q1); + const auto r2 = InterleaveLower(q2, q3); + const auto r3 = InterleaveUpper(q2, q3); + const auto r4 = InterleaveLower(q4, q5); + const auto r5 = InterleaveUpper(q4, q5); + const auto r6 = InterleaveLower(q6, q7); + const auto r7 = InterleaveUpper(q6, q7); + + i0 = ConcatLowerLower(r4, r0); + i1 = ConcatLowerLower(r5, r1); + i2 = ConcatLowerLower(r6, r2); + i3 = ConcatLowerLower(r7, r3); + i4 = ConcatUpperUpper(r4, r0); + i5 = ConcatUpperUpper(r5, r1); + i6 = ConcatUpperUpper(r6, r2); + i7 = ConcatUpperUpper(r7, r3); + to.StorePart(BlockDesc<8>(), i0, m + 0, n + 0); + to.StorePart(BlockDesc<8>(), i1, m + 1, n + 0); + to.StorePart(BlockDesc<8>(), i2, m + 2, n + 0); + to.StorePart(BlockDesc<8>(), i3, m + 3, n + 0); + to.StorePart(BlockDesc<8>(), i4, m + 4, n + 0); + to.StorePart(BlockDesc<8>(), i5, m + 5, n + 0); + to.StorePart(BlockDesc<8>(), i6, m + 6, n + 0); + to.StorePart(BlockDesc<8>(), i7, m + 7, n + 0); + } + } +} +#elif HWY_TARGET != HWY_SCALAR +constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) { + return ROWS % 4 == 0 && COLS % 4 == 0; +} + +template +JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag, + const From& from, const To& to, + size_t ROWSp, size_t COLSp) { + size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0; + size_t COLS = COLS_or_0 == 0 ? 
COLSp : COLS_or_0; + static_assert(MaxLanes(BlockDesc<4>()) == 4, "Invalid descriptor size"); + static_assert(ROWS_or_0 % 4 == 0, "Invalid number of rows"); + static_assert(COLS_or_0 % 4 == 0, "Invalid number of columns"); + for (size_t n = 0; n < ROWS; n += 4) { + for (size_t m = 0; m < COLS; m += 4) { + const auto p0 = from.LoadPart(BlockDesc<4>(), n + 0, m + 0); + const auto p1 = from.LoadPart(BlockDesc<4>(), n + 1, m + 0); + const auto p2 = from.LoadPart(BlockDesc<4>(), n + 2, m + 0); + const auto p3 = from.LoadPart(BlockDesc<4>(), n + 3, m + 0); + + const auto q0 = InterleaveLower(p0, p2); + const auto q1 = InterleaveLower(p1, p3); + const auto q2 = InterleaveUpper(p0, p2); + const auto q3 = InterleaveUpper(p1, p3); + + const auto r0 = InterleaveLower(q0, q1); + const auto r1 = InterleaveUpper(q0, q1); + const auto r2 = InterleaveLower(q2, q3); + const auto r3 = InterleaveUpper(q2, q3); + + to.StorePart(BlockDesc<4>(), r0, m + 0, n + 0); + to.StorePart(BlockDesc<4>(), r1, m + 1, n + 0); + to.StorePart(BlockDesc<4>(), r2, m + 2, n + 0); + to.StorePart(BlockDesc<4>(), r3, m + 3, n + 0); + } + } +} +#else +constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) { return false; } +#endif + +template +struct Transpose { + template + static void Run(const From& from, const To& to) { + // This does not guarantee anything, just saves from the most stupid + // mistakes. + JXL_DASSERT(from.Address(0, 0) != to.Address(0, 0)); + TransposeSimdTag tag; + GenericTransposeBlock(tag, from, to, N, M); + } +}; + +// Avoid inlining and unrolling transposes for large blocks. +template +struct Transpose< + N, M, typename std::enable_if<(N >= 8 && M >= 8 && N * M >= 512)>::type> { + template + static void Run(const From& from, const To& to) { + // This does not guarantee anything, just saves from the most stupid + // mistakes. 
+ JXL_DASSERT(from.Address(0, 0) != to.Address(0, 0)); + TransposeSimdTag tag; + constexpr void (*transpose)(TransposeSimdTag, + const From&, const To&, size_t, size_t) = + GenericTransposeBlock<0, 0, From, To>; + NoInlineWrapper(transpose, tag, from, to, N, M); + } +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_TRANSPOSE_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/xorshift128plus-inl.h b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/xorshift128plus-inl.h new file mode 100644 index 0000000000..6c1865181c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/xorshift128plus-inl.h @@ -0,0 +1,88 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fast but weak random generator. + +#if defined(LIB_JXL_XORSHIFT128PLUS_INL_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef LIB_JXL_XORSHIFT128PLUS_INL_H_ +#undef LIB_JXL_XORSHIFT128PLUS_INL_H_ +#else +#define LIB_JXL_XORSHIFT128PLUS_INL_H_ +#endif + +#include + +#include +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { +namespace { + +// These templates are not found via ADL. 
+using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; + +// Adapted from https://github.com/vpxyz/xorshift/blob/master/xorshift128plus/ +// (MIT-license) +class Xorshift128Plus { + public: + // 8 independent generators (= single iteration for AVX-512) + enum { N = 8 }; + + explicit HWY_MAYBE_UNUSED Xorshift128Plus(const uint64_t seed) { + // Init state using SplitMix64 generator + s0_[0] = SplitMix64(seed + 0x9E3779B97F4A7C15ull); + s1_[0] = SplitMix64(s0_[0]); + for (size_t i = 1; i < N; ++i) { + s0_[i] = SplitMix64(s1_[i - 1]); + s1_[i] = SplitMix64(s0_[i]); + } + } + + HWY_INLINE HWY_MAYBE_UNUSED void Fill(uint64_t* HWY_RESTRICT random_bits) { +#if HWY_CAP_INTEGER64 + const HWY_FULL(uint64_t) d; + for (size_t i = 0; i < N; i += Lanes(d)) { + auto s1 = Load(d, s0_ + i); + const auto s0 = Load(d, s1_ + i); + const auto bits = s1 + s0; // b, c + Store(s0, d, s0_ + i); + s1 ^= ShiftLeft<23>(s1); + Store(bits, d, random_bits + i); + s1 ^= s0 ^ ShiftRight<18>(s1) ^ ShiftRight<5>(s0); + Store(s1, d, s1_ + i); + } +#else + for (size_t i = 0; i < N; ++i) { + auto s1 = s0_[i]; + const auto s0 = s1_[i]; + const auto bits = s1 + s0; // b, c + s0_[i] = s0; + s1 ^= s1 << 23; + random_bits[i] = bits; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s1_[i] = s1; + } +#endif + } + + private: + static uint64_t SplitMix64(uint64_t z) { + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; + return z ^ (z >> 31); + } + + HWY_ALIGN uint64_t s0_[N]; + HWY_ALIGN uint64_t s1_[N]; +}; + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#endif // LIB_JXL_XORSHIFT128PLUS_INL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/jxl/xorshift128plus_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/xorshift128plus_test.cc new file mode 100644 index 0000000000..f86f921906 --- /dev/null +++ 
b/codec/L2/demos/jxlEnc/third_partys/lib/jxl/xorshift128plus_test.cc @@ -0,0 +1,372 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/xorshift128plus_test.cc" +#include +#include +#include + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/thread_pool_internal.h" +#include "lib/jxl/xorshift128plus-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace HWY_NAMESPACE { + +// These templates are not found via ADL. +using hwy::HWY_NAMESPACE::ShiftRight; + +// Define to nonzero in order to print the (new) golden outputs. +#define PRINT_RESULTS 0 + +const size_t kVectors = 64; + +#if PRINT_RESULTS + +template +void Print(const uint64_t (&result)[kNumLanes]) { + printf("{ "); + for (int i = 0; i < kNumLanes; ++i) { + if (i != 0) { + printf(", "); + } + printf("0x%016llXull", result[i]); + } + printf("},\n"); +} + +#else // PRINT_RESULTS + +const uint64_t kExpected[kVectors][Xorshift128Plus::N] = { + {0x6E901576D477CBB1ull, 0xE9E53789195DA2A2ull, 0xB681F6DDA5E0AE99ull, + 0x8EFD18CE21FD6896ull, 0xA898A80DF75CF532ull, 0x50CEB2C9E2DE7E32ull, + 0x3CA7C2FEB25C0DD0ull, 0xA4D0866B80B4D836ull}, + {0x8CD6A1E6233D3A26ull, 0x3D4603ADE98B112Dull, 0xDC427AF674019E36ull, + 0xE28B4D230705AC53ull, 0x7297E9BBA88783DDull, 0x34D3D23CFCD9B41Aull, + 0x5A223615ADBE96B8ull, 0xE5EB529027CFBD01ull}, + {0xC1894CF00DFAC6A2ull, 0x18EDF8AE9085E404ull, 0x8E936625296B4CCDull, + 0x31971EF3A14A899Bull, 0xBE87535FCE0BF26Aull, 0x576F7A752BC6649Full, + 0xA44CBADCE0C6B937ull, 0x3DBA819BB17A353Aull}, + {0x27CE38DFCC1C5EB6ull, 0x920BEB5606340256ull, 0x3986CBC40C9AFC2Cull, + 0xE22BCB3EEB1E191Eull, 0x6E1FCDD3602A8FBAull, 0x052CB044E5415A29ull, + 0x46266646EFB9ECD7ull, 0x8F44914618D29335ull}, + {0xDD30AEDF72A362C5ull, 0xBC1D824E16BB98F4ull, 
0x9EA6009C2AA3D2F1ull, + 0xF65C0FBBE17AF081ull, 0x22424D06A8738991ull, 0x8A62763F2B7611D2ull, + 0x2F3E89F722637939ull, 0x84D338BEF50AFD50ull}, + {0x00F46494898E2B0Bull, 0x81239DC4FB8E8003ull, 0x414AD93EC5773FE7ull, + 0x791473C450E4110Full, 0x87F127BF68C959ACull, 0x6429282D695EF67Bull, + 0x661082E11546CBA8ull, 0x5815D53FA5436BFDull}, + {0xB3DEADAB9BE6E0F9ull, 0xAA1B7B8F7CED0202ull, 0x4C5ED437699D279Eull, + 0xA4471727F1CB39D3ull, 0xE439DA193F802F70ull, 0xF89401BB04FA6493ull, + 0x3B08045A4FE898BAull, 0x32137BFE98227950ull}, + {0xFBAE4A092897FEF3ull, 0x0639F6CE56E71C8Eull, 0xF0AD6465C07F0C1Eull, + 0xFF8E28563361DCE5ull, 0xC2013DB7F86BC6B9ull, 0x8EFCC0503330102Full, + 0x3F6B767EA5C4DA40ull, 0xB9864B950B2232E1ull}, + {0x76EB58DE8E5EC22Aull, 0x9BBBF49A18B32F4Full, 0xC8405F02B2B2FAB9ull, + 0xC3E122A5F146BC34ull, 0xC90BB046660F5765ull, 0xB933981310DBECCFull, + 0x5A2A7BFC9126FD1Cull, 0x8BB388C94DF87901ull}, + {0x753EB89AD63EF3C3ull, 0xF24AAF40C89D65ADull, 0x23F68931C1A6AA6Dull, + 0xF47E79BF702C6DD0ull, 0xA3AD113244EE7EAEull, 0xD42CBEA28F793DC3ull, + 0xD896FCF1820F497Cull, 0x042B86D2818948C1ull}, + {0x8F2A4FC5A4265763ull, 0xEC499E6F95EAA10Cull, 0xE3786D4ECCD0DEB5ull, + 0xC725C53D3AC4CC43ull, 0x065A4ACBBF83610Eull, 0x35C61C9FEF167129ull, + 0x7B720AEAA7D70048ull, 0x14206B841377D039ull}, + {0xAD27D78BF96055F6ull, 0x5F43B20FF47ADCD4ull, 0xE184C2401E2BF71Eull, + 0x30B263D78990045Dull, 0xC22F00EBFF9BA201ull, 0xAE7F86522B53A562ull, + 0x2853312BC039F0A4ull, 0x868D619E6549C3C8ull}, + {0xFD5493D8AE9A8371ull, 0x773D5E224DF61B3Bull, 0x5377C54FBB1A8280ull, + 0xCAD4DE3B8265CAFAull, 0xCDF3F19C91EBD5F6ull, 0xC8EA0F182D73BD78ull, + 0x220502D593433FF1ull, 0xB81205E612DC31B1ull}, + {0x8F32A39EAEDA4C70ull, 0x1D4B0914AA4DAC7Full, 0x56EF1570F3A8B405ull, + 0x29812CB17404A592ull, 0x97A2AAF69CAE90F2ull, 0x12BF5E02778BBFE5ull, + 0x9D4B55AD42A05FD2ull, 0x06C2BAB5E6086620ull}, + {0x8DB4B9648302B253ull, 0xD756AD9E3AEA12C7ull, 0x68709B7F11D4B188ull, + 0x7CC299DDCD707A4Bull, 0x97B860C370A7661Dull, 
0xCECD314FC20E64F5ull, + 0x55F412CDFB4C7EC3ull, 0x55EE97591193B525ull}, + {0xCF70F3ACA96E6254ull, 0x022FEDECA2E09F46ull, 0x686823DB60AE1ECFull, + 0xFD36190D3739830Eull, 0x74E1C09027F68120ull, 0xB5883A835C093842ull, + 0x93E1EFB927E9E4E3ull, 0xB2721E249D7E5EBEull}, + {0x69B6E21C44188CB8ull, 0x5D6CFB853655A7AAull, 0x3E001A0B425A66DCull, + 0x8C57451103A5138Full, 0x7BF8B4BE18EAB402ull, 0x494102EB8761A365ull, + 0xB33796A9F6A81F0Eull, 0x10005AB3BCCFD960ull}, + {0xB2CF25740AE965DCull, 0x6F7C1DF7EF53D670ull, 0x648DD6087AC2251Eull, + 0x040955D9851D487Dull, 0xBD550FC7E21A7F66ull, 0x57408F484DEB3AB5ull, + 0x481E24C150B506C1ull, 0x72C0C3EAF91A40D6ull}, + {0x1997A481858A5D39ull, 0x539718F4BEF50DC1ull, 0x2EC4DC4787E7E368ull, + 0xFF1CE78879419845ull, 0xE219A93DD6F6DD30ull, 0x85328618D02FEC1Aull, + 0xC86E02D969181B20ull, 0xEBEC8CD8BBA34E6Eull}, + {0x28B55088A16CE947ull, 0xDD25AC11E6350195ull, 0xBD1F176694257B1Cull, + 0x09459CCF9FCC9402ull, 0xF8047341E386C4E4ull, 0x7E8E9A9AD984C6C0ull, + 0xA4661E95062AA092ull, 0x70A9947005ED1152ull}, + {0x4C01CF75DBE98CCDull, 0x0BA076CDFC7373B9ull, 0x6C5E7A004B57FB59ull, + 0x336B82297FD3BC56ull, 0x7990C0BE74E8D60Full, 0xF0275CC00EC5C8C8ull, + 0x6CF29E682DFAD2E9ull, 0xFA4361524BD95D72ull}, + {0x631D2A19FF62F018ull, 0x41C43863B985B3FAull, 0xE052B2267038EFD9ull, + 0xE2A535FAC575F430ull, 0xE004EEA90B1FF5B8ull, 0x42DFE2CA692A1F26ull, + 0x90FB0BFC9A189ECCull, 0x4484102BD3536BD0ull}, + {0xD027134E9ACCA5A5ull, 0xBBAB4F966D476A9Bull, 0x713794A96E03D693ull, + 0x9F6335E6B94CD44Aull, 0xC5090C80E7471617ull, 0x6D9C1B0C87B58E33ull, + 0x1969CE82E31185A5ull, 0x2099B97E87754EBEull}, + {0x60EBAF4ED934350Full, 0xC26FBF0BA5E6ECFFull, 0x9E54150F0312EC57ull, + 0x0973B48364ED0041ull, 0x800A523241426CFCull, 0x03AB5EC055F75989ull, + 0x8CF315935DEEB40Aull, 0x83D3FC0190BD1409ull}, + {0x26D35394CF720A51ull, 0xCE9EAA15243CBAFEull, 0xE2B45FBAF21B29E0ull, + 0xDB92E98EDE73F9E0ull, 0x79B16F5101C26387ull, 0x1AC15959DE88C86Full, + 0x387633AEC6D6A580ull, 0xA6FC05807BFC5EB8ull}, + 
{0x2D26C8E47C6BADA9ull, 0x820E6EC832D52D73ull, 0xB8432C3E0ED0EE5Bull, + 0x0F84B3C4063AAA87ull, 0xF393E4366854F651ull, 0x749E1B4D2366A567ull, + 0x805EACA43480D004ull, 0x244EBF3AA54400A5ull}, + {0xBFDC3763AA79F75Aull, 0x9E3A74CC751F41DBull, 0xF401302A149DBC55ull, + 0x6B25F7973D7BF7BCull, 0x13371D34FDBC3DAEull, 0xC5E1998C8F484DCDull, + 0x7031B8AE5C364464ull, 0x3847F0C4F3DA2C25ull}, + {0x24C6387D2C0F1225ull, 0x77CCE960255C67A4ull, 0x21A0947E497B10EBull, + 0xBB5DB73A825A9D7Eull, 0x26294A41999E553Dull, 0x3953E0089F87D925ull, + 0x3DAE6E5D4E5EAAFEull, 0x74B545460341A7AAull}, + {0x710E5EB08A7DB820ull, 0x7E43C4E77CAEA025ull, 0xD4C91529C8B060C1ull, + 0x09AE26D8A7B0CA29ull, 0xAB9F356BB360A772ull, 0xB68834A25F19F6E9ull, + 0x79B8D9894C5734E2ull, 0xC6847E7C8FFD265Full}, + {0x10C4BCB06A5111E6ull, 0x57CB50955B6A2516ull, 0xEF53C87798B6995Full, + 0xAB38E15BBD8D0197ull, 0xA51C6106EFF73C93ull, 0x83D7F0E2270A7134ull, + 0x0923FD330397FCE5ull, 0xF9DE54EDFE58FB45ull}, + {0x07D44833ACCD1A94ull, 0xAAD3C9E945E2F9F3ull, 0xABF4C879B876AA37ull, + 0xF29C69A21B301619ull, 0x2DDCE959111C788Bull, 0x7CEDB48F8AC1729Bull, + 0x93F3BA9A02B659BEull, 0xF20A87FF17933CBEull}, + {0x8E96EBE93180CFE6ull, 0x94CAA12873937079ull, 0x05F613D9380D4189ull, + 0xBCAB40C1DC79F38Aull, 0x0AD8907B7C61D19Eull, 0x88534E189D103910ull, + 0x2DB2FAABA160AB8Full, 0xA070E7506B06F15Cull}, + {0x6FB1FCDAFFEF87A9ull, 0xE735CF25337A090Dull, 0x172C6EDCEFEF1825ull, + 0x76957EA49EF0542Dull, 0x819BF4CD250F7C49ull, 0xD6FF23E4AD00C4D4ull, + 0xE79673C1EC358FF0ull, 0xAC9C048144337938ull}, + {0x4C5387FF258B3AF4ull, 0xEDB68FAEC2CB1AA3ull, 0x02A624E67B4E1DA4ull, + 0x5C44797A38E08AF2ull, 0x36546A70E9411B4Bull, 0x47C17B24D2FD9675ull, + 0x101957AAA020CA26ull, 0x47A1619D4779F122ull}, + {0xF84B8BCDC92D9A3Cull, 0x951D7D2C74B3066Bull, 0x7AC287C06EDDD9B2ull, + 0x4C38FC476608D38Full, 0x224D793B19CB4BCDull, 0x835A255899BF1A41ull, + 0x4AD250E9F62DB4ABull, 0xD9B44F4B58781096ull}, + {0xABBAF99A8EB5C6B8ull, 0xFB568E900D3A9F56ull, 0x11EDF63D23C5DF11ull, + 
0xA9C3011D3FA7C5A8ull, 0xAEDD3CF11AFFF725ull, 0xABCA472B5F1EDD6Bull, + 0x0600B6BB5D879804ull, 0xDB4DE007F22191A0ull}, + {0xD76CC9EFF0CE9392ull, 0xF5E0A772B59BA49Aull, 0x7D1AE1ED0C1261B5ull, + 0x79224A33B5EA4F4Aull, 0x6DD825D80C40EA60ull, 0x47FC8E747E51C953ull, + 0x695C05F72888BF98ull, 0x1A012428440B9015ull}, + {0xD754DD61F9B772BFull, 0xC4A2FCF4C0F9D4EBull, 0x461167CDF67A24A2ull, + 0x434748490EBCB9D4ull, 0x274DD9CDCA5781DEull, 0x36BAC63BA9A85209ull, + 0x30324DAFDA36B70Full, 0x337570DB4FE6DAB3ull}, + {0xF46CBDD57C551546ull, 0x8E02507E676DA3E3ull, 0xD826245A8C15406Dull, + 0xDFB38A5B71113B72ull, 0x5EA38454C95B16B5ull, 0x28C054FB87ABF3E1ull, + 0xAA2724C0BA1A8096ull, 0xECA83EC980304F2Full}, + {0x6AA76EC294EB3303ull, 0x42D4CDB2A8032E3Bull, 0x7999EDF75DCD8735ull, + 0xB422BFFE696CCDCCull, 0x8F721461FD7CCDFEull, 0x148E1A5814FDE253ull, + 0x4DC941F4375EF8FFull, 0x27B2A9E0EB5B49CFull}, + {0xCEA592EF9343EBE1ull, 0xF7D38B5FA7698903ull, 0x6CCBF352203FEAB6ull, + 0x830F3095FCCDA9C5ull, 0xDBEEF4B81B81C8F4ull, 0x6D7EB9BCEECA5CF9ull, + 0xC58ABB0FBE436C69ull, 0xE4B97E6DB2041A4Bull}, + {0x7E40FC772978AF14ull, 0xCDDA4BBAE28354A1ull, 0xE4F993B832C32613ull, + 0xD3608093C68A4B35ull, 0x9A3B60E01BEE3699ull, 0x03BEF248F3288713ull, + 0x70B9294318F3E9B4ull, 0x8D2ABB913B8610DEull}, + {0x37F209128E7D8B2Cull, 0x81D2AB375BD874BCull, 0xA716A1B7373F7408ull, + 0x0CEE97BEC4706540ull, 0xA40C5FD9CDBC1512ull, 0x73CAF6C8918409E7ull, + 0x45E11BCEDF0BBAA1ull, 0x612C612BFF6E6605ull}, + {0xF8ECB14A12D0F649ull, 0xDA683CD7C01BA1ACull, 0xA2203F7510E124C1ull, + 0x7F83E52E162F3C78ull, 0x77D2BB73456ACADBull, 0x37FC34FC840BBA6Full, + 0x3076BC7D4C6EBC1Full, 0x4F514123632B5FA9ull}, + {0x44D789DED935E884ull, 0xF8291591E09FEC9Full, 0xD9CED2CF32A2E4B7ull, + 0x95F70E1EB604904Aull, 0xDE438FE43C14F6ABull, 0x4C8D23E4FAFCF8D8ull, + 0xC716910A3067EB86ull, 0x3D6B7915315095D3ull}, + {0x3170FDBADAB92095ull, 0x8F1963933FC5650Bull, 0x72F94F00ABECFEABull, + 0x6E3AE826C6AAB4CEull, 0xA677A2BF31068258ull, 0x9660CDC4F363AF10ull, + 
0xD81A15A152379EF1ull, 0x5D7D285E1080A3F9ull}, + {0xDAD5DDFF9A2249B3ull, 0x6F9721D926103FAEull, 0x1418CBB83FFA349Aull, + 0xE71A30AD48C012B2ull, 0xBE76376C63751132ull, 0x3496467ACA713AE6ull, + 0x8D7EC01369F991A3ull, 0xD8C73A88B96B154Eull}, + {0x8B5D9C74AEB4833Aull, 0xF914FB3F867B912Full, 0xB894EA034936B1DCull, + 0x8A16D21BE51C4F5Bull, 0x31FF048ED582D98Eull, 0xB95AB2F4DC65B820ull, + 0x04082B9170561AF7ull, 0xA215610A5DC836FAull}, + {0xB2ADE592C092FAACull, 0x7A1E683BCBF13294ull, 0xC7A4DBF86858C096ull, + 0x3A49940F97BFF316ull, 0xCAE5C06B82C46703ull, 0xC7F413A0F951E2BDull, + 0x6665E7BB10EB5916ull, 0x86F84A5A94EDE319ull}, + {0x4EA199D8FAA79CA3ull, 0xDFA26E5BF1981704ull, 0x0F5E081D37FA4E01ull, + 0x9CB632F89CD675CDull, 0x4A09DB89D48C0304ull, 0x88142742EA3C7672ull, + 0xAC4F149E6D2E9BDBull, 0x6D9E1C23F8B1C6C6ull}, + {0xD58BE47B92DEC0E9ull, 0x8E57573645E34328ull, 0x4CC094CCB5FB5126ull, + 0x5F1D66AF6FB40E3Cull, 0x2BA15509132D3B00ull, 0x0D6545646120E567ull, + 0x3CF680C45C223666ull, 0x96B28E32930179DAull}, + {0x5900C45853AC7990ull, 0x61881E3E8B7FF169ull, 0x4DE5F835DF2230FFull, + 0x4427A9E7932F73FFull, 0x9B641BAD379A8C8Dull, 0xDF271E5BF98F4E5Cull, + 0xDFDA16DB830FF5EEull, 0x371C7E7CFB89C0E9ull}, + {0x4410A8576247A250ull, 0x6AD2DA12B45AC0D9ull, 0x18DFC72AAC85EECCull, + 0x06FC8BB2A0EF25C8ull, 0xEB287619C85E6118ull, 0x19553ECA67F25A2Cull, + 0x3B9557F1DCEC5BAAull, 0x7BAD9E8B710D1079ull}, + {0x34F365D66BD22B28ull, 0xE6E124B9F10F835Dull, 0x0573C38ABF2B24DCull, + 0xD32E6AF10A0125AEull, 0x383590ACEA979519ull, 0x8376ED7A39E28205ull, + 0xF0B7F184DCBDA435ull, 0x062A203390E31794ull}, + {0xA2AFFD7E41918760ull, 0x7F90FC1BD0819C86ull, 0x5033C08E5A969533ull, + 0x2707AF5C6D039590ull, 0x57BBD5980F17DF9Cull, 0xD3FE6E61D763268Aull, + 0x9E0A0AE40F335A3Bull, 0x43CF4EB0A99613C5ull}, + {0xD4D2A397CE1A7C2Eull, 0x3DF7CE7CC3212DADull, 0x0880F0D5D356C75Aull, + 0xA8AFC44DD03B1346ull, 0x79263B46C13A29E0ull, 0x11071B3C0ED58E7Aull, + 0xED46DC9F538406BFull, 0x2C94974F2B94843Dull}, + {0xE246E13C39AB5D5Eull, 
0xAC1018489D955B20ull, 0x8601B558771852B8ull, + 0x110BD4C06DB40173ull, 0x738FC8A18CCA0EBBull, 0x6673E09BE0EA76E5ull, + 0x024BC7A0C7527877ull, 0x45E6B4652E2EC34Eull}, + {0xD1ED26A1A375CDC8ull, 0xAABC4E896A617CB8ull, 0x0A9C9E8E57D753C6ull, + 0xA3774A75FEB4C30Eull, 0x30B816C01C93E49Eull, 0xF405BABC06D2408Cull, + 0xCC0CE6B4CE788ABCull, 0x75E7922D0447956Cull}, + {0xD07C1676A698BC95ull, 0x5F9AEA4840E2D860ull, 0xD5FC10D58BDF6F02ull, + 0xF190A2AD4BC2EEA7ull, 0x0C24D11F51726931ull, 0xDB646899A16B6512ull, + 0x7BC10670047B1DD8ull, 0x2413A5ABCD45F092ull}, + {0x4E66892190CFD923ull, 0xF10162440365EC8Eull, 0x158ACA5A6A2280AEull, + 0x0D60ED11C0224166ull, 0x7CD2E9A71B9D7488ull, 0x450D7289706AB2A3ull, + 0x88FAE34EC9A0D7DCull, 0x96FF9103575A97DAull}, + {0x77990FAC6046C446ull, 0xB174B5FB30C76676ull, 0xE352CE3EB56CF82Aull, + 0xC6039B6873A9A082ull, 0xE3F80F3AE333148Aull, 0xB853BA24BA3539B9ull, + 0xE8863E52ECCB0C74ull, 0x309B4CC1092CC245ull}, + {0xBC2B70BEE8388D9Full, 0xE48D92AE22216DCEull, 0xF15F3BF3E2C15D8Full, + 0x1DD964D4812D8B24ull, 0xD56AF02FB4665E4Cull, 0x98002200595BD9A3ull, + 0x049246D50BB8FA12ull, 0x1B542DF485B579B9ull}, + {0x2347409ADFA8E497ull, 0x36015C2211D62498ull, 0xE9F141F32EB82690ull, + 0x1F839912D0449FB9ull, 0x4E4DCFFF2D02D97Cull, 0xF8A03AB4C0F625C9ull, + 0x0605F575795DAC5Cull, 0x4746C9BEA0DDA6B1ull}, + {0xCA5BB519ECE7481Bull, 0xFD496155E55CA945ull, 0xF753B9DBB1515F81ull, + 0x50549E8BAC0F70E7ull, 0x8614FB0271E21C60ull, 0x60C72947EB0F0070ull, + 0xA6511C10AEE742B6ull, 0x48FB48F2CACCB43Eull}}; + +#endif // PRINT_RESULTS + +// Ensures Xorshift128+ returns consistent and unchanging values. 
+void TestGolden() { + HWY_ALIGN Xorshift128Plus rng(12345); + for (uint64_t vector = 0; vector < kVectors; ++vector) { + HWY_ALIGN uint64_t lanes[Xorshift128Plus::N]; + rng.Fill(lanes); +#if PRINT_RESULTS + Print(lanes); +#else + for (size_t i = 0; i < Xorshift128Plus::N; ++i) { + ASSERT_EQ(kExpected[vector][i], lanes[i]) + << "Where vector=" << vector << " i=" << i; + } +#endif + } +} + +// Output changes when given different seeds +void TestSeedChanges() { + HWY_ALIGN uint64_t lanes[Xorshift128Plus::N]; + + std::vector first; + constexpr size_t kNumSeeds = 16384; + first.reserve(kNumSeeds); + + // All 14-bit seeds + for (size_t seed = 0; seed < kNumSeeds; ++seed) { + HWY_ALIGN Xorshift128Plus rng(seed); + + rng.Fill(lanes); + first.push_back(lanes[0]); + } + + // All outputs are unique + ASSERT_EQ(kNumSeeds, first.size()); + std::sort(first.begin(), first.end()); + first.erase(std::unique(first.begin(), first.end()), first.end()); + EXPECT_EQ(kNumSeeds, first.size()); +} + +void TestFloat() { + ThreadPoolInternal pool(8); + +#ifdef JXL_DISABLE_SLOW_TESTS + const uint32_t kMaxSeed = 2048; +#else // JXL_DISABLE_SLOW_TESTS + const uint32_t kMaxSeed = 16384; // All 14-bit seeds +#endif // JXL_DISABLE_SLOW_TESTS + pool.Run(0, kMaxSeed, ThreadPool::SkipInit(), + [](const int seed, const int /*thread*/) { + HWY_ALIGN Xorshift128Plus rng(seed); + + const HWY_FULL(uint32_t) du; + const HWY_FULL(float) df; + HWY_ALIGN uint64_t batch[Xorshift128Plus::N]; + HWY_ALIGN float lanes[MaxLanes(df)]; + double sum = 0.0; + size_t count = 0; + const size_t kReps = 2000; + for (size_t reps = 0; reps < kReps; ++reps) { + rng.Fill(batch); + for (size_t i = 0; i < Xorshift128Plus::N * 2; i += Lanes(df)) { + const auto bits = + Load(du, reinterpret_cast(batch) + i); + // 1.0 + 23 random mantissa bits = [1, 2) + const auto rand12 = + BitCast(df, ShiftRight<9>(bits) | Set(du, 0x3F800000)); + const auto rand01 = rand12 - Set(df, 1.0f); + Store(rand01, df, lanes); + for (float lane : lanes) 
{ + sum += lane; + count += 1; + EXPECT_LE(lane, 1.0f); + EXPECT_GE(lane, 0.0f); + } + } + } + + // Verify average (uniform distribution) + EXPECT_NEAR(0.5, sum / count, 0.00702); + }); +} + +// Not more than one 64-bit zero +void TestNotZero() { + ThreadPoolInternal pool(8); + +#ifdef JXL_DISABLE_SLOW_TESTS + const uint32_t kMaxSeed = 500; +#else // JXL_DISABLE_SLOW_TESTS + const uint32_t kMaxSeed = 2000; +#endif // JXL_DISABLE_SLOW_TESTS + pool.Run(0, kMaxSeed, ThreadPool::SkipInit(), + [](const int task, const int /*thread*/) { + HWY_ALIGN uint64_t lanes[Xorshift128Plus::N]; + + HWY_ALIGN Xorshift128Plus rng(task); + size_t num_zero = 0; + for (size_t vectors = 0; vectors < 10000; ++vectors) { + rng.Fill(lanes); + for (uint64_t lane : lanes) { + num_zero += static_cast(lane == 0); + } + } + EXPECT_LE(num_zero, 1); + }); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { + +class Xorshift128Test : public hwy::TestWithParamTarget {}; + +HWY_TARGET_INSTANTIATE_TEST_SUITE_P(Xorshift128Test); + +HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestNotZero); +HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestGolden); +HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestSeedChanges); +HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestFloat); + +} // namespace jxl +#endif diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/profiler/profiler.cc b/codec/L2/demos/jxlEnc/third_partys/lib/profiler/profiler.cc new file mode 100644 index 0000000000..d21ee098f8 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/profiler/profiler.cc @@ -0,0 +1,459 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/jxl/base/profiler.h" + +#if PROFILER_ENABLED + +#include +#include +#include // memcpy + +#include // sort +#include +#include // PRIu64 +#include +#include + +#include "lib/jxl/base/robust_statistics.h" // HalfSampleMode + +// Optionally use SIMD in StreamCacheLine if available. +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/profiler/profiler.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace profiler { +namespace HWY_NAMESPACE { + +// Overwrites `to` without loading it into cache (read-for-ownership). +// Copies 64 bytes from/to naturally aligned addresses. +void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) { +#if HWY_TARGET == HWY_SCALAR + hwy::CopyBytes<64>(from, to); +#else + const HWY_CAPPED(uint64_t, 2) d; + HWY_FENCE; + const uint64_t* HWY_RESTRICT from64 = reinterpret_cast(from); + const auto v0 = Load(d, from64 + 0); + const auto v1 = Load(d, from64 + 2); + const auto v2 = Load(d, from64 + 4); + const auto v3 = Load(d, from64 + 6); + // Fences prevent the compiler from reordering loads/stores, which may + // interfere with write-combining. + HWY_FENCE; + uint64_t* HWY_RESTRICT to64 = reinterpret_cast(to); + Stream(v0, d, to64 + 0); + Stream(v1, d, to64 + 2); + Stream(v2, d, to64 + 4); + Stream(v3, d, to64 + 6); + HWY_FENCE; +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace profiler +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace profiler { + +HWY_EXPORT(StreamCacheLine); + +namespace { + +// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that +// enters at least one zone. Once this buffer is full, the thread will analyze +// packets (two per zone), which introduces observer overhead. 
+#ifndef PROFILER_THREAD_STORAGE +#define PROFILER_THREAD_STORAGE 32ULL +#endif + +#define PROFILER_PRINT_OVERHEAD 0 + +// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT): +constexpr size_t kMaxDepth = 64; // Maximum nesting of zones. +constexpr size_t kMaxZones = 256; // Total number of zones. + +// Stack of active (entered but not exited) zones. POD, uninitialized. +// Used to deduct child duration from the parent's self time. +struct ActiveZone { + const char* name; + uint64_t entry_timestamp; + uint64_t child_total; +}; + +// Totals for all Zones with the same name. POD, must be zero-initialized. +struct ZoneTotals { + uint64_t total_duration; + const char* name; + uint64_t num_calls; +}; + +template +inline T ClampedSubtract(const T minuend, const T subtrahend) { + if (subtrahend > minuend) { + return 0; + } + return minuend - subtrahend; +} + +} // namespace + +// Per-thread call graph (stack) and ZoneTotals for each zone. +class Results { + public: + Results() { + // Zero-initialize all accumulators (avoids a check for num_zones_ == 0). + memset(zones_, 0, sizeof(zones_)); + } + + // Used for computing overhead when this thread encounters its first Zone. + // This has no observable effect apart from increasing "analyze_elapsed_". + uint64_t ZoneDuration(const Packet* packets) { + HWY_ASSERT(depth_ == 0); + HWY_ASSERT(num_zones_ == 0); + AnalyzePackets(packets, 2); + const uint64_t duration = zones_[0].total_duration; + zones_[0].num_calls = 0; + zones_[0].total_duration = 0; + HWY_ASSERT(depth_ == 0); + num_zones_ = 0; + return duration; + } + + void SetSelfOverhead(const uint64_t self_overhead) { + self_overhead_ = self_overhead; + } + + void SetChildOverhead(const uint64_t child_overhead) { + child_overhead_ = child_overhead; + } + + // Draw all required information from the packets, which can be discarded + // afterwards. Called whenever this thread's storage is full. 
+ void AnalyzePackets(const Packet* HWY_RESTRICT packets, + const size_t num_packets) { + // Ensures prior weakly-ordered streaming stores are globally visible. + hwy::StoreFence(); + + const uint64_t t0 = TicksBefore(); + + for (size_t i = 0; i < num_packets; ++i) { + const uint64_t timestamp = packets[i].timestamp; + // Entering a zone + if (packets[i].name != nullptr) { + HWY_ASSERT(depth_ < kMaxDepth); + zone_stack_[depth_].name = packets[i].name; + zone_stack_[depth_].entry_timestamp = timestamp; + zone_stack_[depth_].child_total = 0; + ++depth_; + continue; + } + + HWY_ASSERT(depth_ != 0); + const ActiveZone& active = zone_stack_[depth_ - 1]; + const uint64_t duration = timestamp - active.entry_timestamp; + const uint64_t self_duration = ClampedSubtract( + duration, self_overhead_ + child_overhead_ + active.child_total); + + UpdateOrAdd(active.name, 1, self_duration); + --depth_; + + // "Deduct" the nested time from its parent's self_duration. + if (depth_ != 0) { + zone_stack_[depth_ - 1].child_total += duration + child_overhead_; + } + } + + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0; + } + + // Incorporates results from another thread. Call after all threads have + // exited any zones. + void Assimilate(const Results& other) { + const uint64_t t0 = TicksBefore(); + HWY_ASSERT(depth_ == 0); + HWY_ASSERT(other.depth_ == 0); + + for (size_t i = 0; i < other.num_zones_; ++i) { + const ZoneTotals& zone = other.zones_[i]; + UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration); + } + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; + } + + // Single-threaded. + void Print() { + const uint64_t t0 = TicksBefore(); + MergeDuplicates(); + + // Sort by decreasing total (self) cost. 
+ std::sort(zones_, zones_ + num_zones_, + [](const ZoneTotals& r1, const ZoneTotals& r2) { + return r1.total_duration > r2.total_duration; + }); + + uint64_t total_visible_duration = 0; + for (size_t i = 0; i < num_zones_; ++i) { + const ZoneTotals& r = zones_[i]; + if (r.name[0] != '@') { + total_visible_duration += r.total_duration; + printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name, + r.num_calls, r.total_duration / r.num_calls, r.total_duration); + } + } + + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0; + printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_); + printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration); + } + + // Single-threaded. Clears all results as if no zones had been recorded. + void Reset() { + analyze_elapsed_ = 0; + HWY_ASSERT(depth_ == 0); + num_zones_ = 0; + memset(zone_stack_, 0, sizeof(zone_stack_)); + memset(zones_, 0, sizeof(zones_)); + } + + private: + // Updates ZoneTotals of the same name, or inserts a new one if this thread + // has not yet seen that name. Uses a self-organizing list data structure, + // which avoids dynamic memory allocations and is faster than unordered_map. + void UpdateOrAdd(const char* name, const uint64_t num_calls, + const uint64_t duration) { + // Special case for first zone: (maybe) update, without swapping. + if (zones_[0].name == name) { + zones_[0].total_duration += duration; + zones_[0].num_calls += num_calls; + return; + } + + // Look for a zone with the same name. + for (size_t i = 1; i < num_zones_; ++i) { + if (zones_[i].name == name) { + zones_[i].total_duration += duration; + zones_[i].num_calls += num_calls; + // Swap with predecessor (more conservative than move to front, + // but at least as successful). + std::swap(zones_[i - 1], zones_[i]); + return; + } + } + + // Not found; create a new ZoneTotals. 
+ HWY_ASSERT(num_zones_ < kMaxZones); + ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_; + zone->name = name; + zone->num_calls = num_calls; + zone->total_duration = duration; + ++num_zones_; + } + + // Each instantiation of a function template seems to get its own copy of + // __func__ and GCC doesn't merge them. An N^2 search for duplicates is + // acceptable because we only expect a few dozen zones. + void MergeDuplicates() { + for (size_t i = 0; i < num_zones_; ++i) { + // Add any subsequent duplicates to num_calls and total_duration. + for (size_t j = i + 1; j < num_zones_;) { + if (!strcmp(zones_[i].name, zones_[j].name)) { + zones_[i].num_calls += zones_[j].num_calls; + zones_[i].total_duration += zones_[j].total_duration; + // Fill hole with last item. + zones_[j] = zones_[--num_zones_]; + } else { // Name differed, try next ZoneTotals. + ++j; + } + } + } + } + + uint64_t analyze_elapsed_ = 0; + uint64_t self_overhead_ = 0; + uint64_t child_overhead_ = 0; + + size_t depth_ = 0; // Number of active zones <= kMaxDepth. + size_t num_zones_ = 0; // Number of unique zones <= kMaxZones. + + // After other members to avoid large pointer offsets. + alignas(64) ActiveZone zone_stack_[kMaxDepth]; // Last = newest + alignas(64) ZoneTotals zones_[kMaxZones]; // Self-organizing list +}; + +ThreadSpecific::ThreadSpecific() + : max_packets_(PROFILER_THREAD_STORAGE << 16), // MiB / sizeof(Packet) + packets_(hwy::AllocateAligned(max_packets_)), + num_packets_(0), + results_(hwy::MakeUniqueAligned()) {} + +ThreadSpecific::~ThreadSpecific() {} + +void ThreadSpecific::FlushBuffer() { + if (num_packets_ + kBufferCapacity > max_packets_) { + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; + } + // This buffering halves observer overhead and decreases the overall + // runtime by about 3%. 
+ HWY_DYNAMIC_DISPATCH(StreamCacheLine) + (buffer_, packets_.get() + num_packets_); + num_packets_ += kBufferCapacity; + buffer_size_ = 0; +} + +void ThreadSpecific::AnalyzeRemainingPackets() { + // Storage full => empty it. + if (num_packets_ + buffer_size_ > max_packets_) { + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; + } + + // Move buffer to storage + memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet)); + num_packets_ += buffer_size_; + buffer_size_ = 0; + + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; +} + +void ThreadSpecific::ComputeOverhead() { + // Delay after capturing timestamps before/after the actual zone runs. Even + // with frequency throttling disabled, this has a multimodal distribution, + // including 32, 34, 48, 52, 59, 62. + uint64_t self_overhead; + { + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 1024; + uint32_t durations[kNumDurations]; + + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + { // + PROFILER_ZONE("Dummy Zone (never shown)"); + } + const uint64_t duration = results_->ZoneDuration(buffer_); + buffer_size_ = 0; + durations[idx_duration] = static_cast(duration); + HWY_ASSERT(num_packets_ == 0); + } + jxl::CountingSort(durations, durations + kNumDurations); + samples[idx_sample] = jxl::HalfSampleMode()(durations, kNumDurations); + } + // Median. + jxl::CountingSort(samples, samples + kNumSamples); + self_overhead = samples[kNumSamples / 2]; +#if PROFILER_PRINT_OVERHEAD + printf("Overhead: %zu\n", self_overhead); +#endif + results_->SetSelfOverhead(self_overhead); + } + + // Delay before capturing start timestamp / after end timestamp. 
+ const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 16; + uint32_t durations[kNumDurations]; + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + const size_t kReps = 10000; + // Analysis time should not be included => must fit within buffer. + HWY_ASSERT(kReps * 2 < max_packets_); + hwy::StoreFence(); + const uint64_t t0 = TicksBefore(); + for (size_t i = 0; i < kReps; ++i) { + PROFILER_ZONE("Dummy"); + } + hwy::StoreFence(); + const uint64_t t1 = TicksAfter(); + HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2); + buffer_size_ = 0; + num_packets_ = 0; + const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; + durations[idx_duration] = + static_cast(ClampedSubtract(avg_duration, self_overhead)); + } + jxl::CountingSort(durations, durations + kNumDurations); + samples[idx_sample] = jxl::HalfSampleMode()(durations, kNumDurations); + } + jxl::CountingSort(samples, samples + kNumSamples); + const uint64_t child_overhead = samples[9 * kNumSamples / 10]; +#if PROFILER_PRINT_OVERHEAD + printf("Child overhead: %zu\n", child_overhead); +#endif + results_->SetChildOverhead(child_overhead); +} + +namespace { + +// Could be a static member of Zone, but that would expose in header. +std::atomic& GetHead() { + static std::atomic head_{nullptr}; // Owning + return head_; +} + +} // namespace + +// Thread-safe. 
+ThreadSpecific* Zone::InitThreadSpecific() { + ThreadSpecific* thread_specific = + hwy::MakeUniqueAligned().release(); + + // Insert into unordered list + std::atomic& head = GetHead(); + ThreadSpecific* old_head = head.load(std::memory_order_relaxed); + thread_specific->SetNext(old_head); + while (!head.compare_exchange_weak(old_head, thread_specific, + std::memory_order_release, + std::memory_order_relaxed)) { + thread_specific->SetNext(old_head); + // TODO(janwas): pause + } + + // ComputeOverhead also creates a Zone, so this needs to be set before that + // to prevent infinite recursion. + GetThreadSpecific() = thread_specific; + + thread_specific->ComputeOverhead(); + return thread_specific; +} + +// Single-threaded. +/*static*/ void Zone::PrintResults() { + ThreadSpecific* head = GetHead().load(std::memory_order_relaxed); + ThreadSpecific* p = head; + while (p) { + p->AnalyzeRemainingPackets(); + + // Combine all threads into a single Result. + if (p != head) { + head->GetResults().Assimilate(p->GetResults()); + p->GetResults().Reset(); + } + + p = p->GetNext(); + } + + if (head != nullptr) { + head->GetResults().Print(); + head->GetResults().Reset(); + } +} + +} // namespace profiler + +#endif // HWY_ONCE +#endif // PROFILER_ENABLED diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/profiler/profiler.h b/codec/L2/demos/jxlEnc/third_partys/lib/profiler/profiler.h new file mode 100644 index 0000000000..c71f63cb3f --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/profiler/profiler.h @@ -0,0 +1,165 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_PROFILER_PROFILER_H_ +#define LIB_PROFILER_PROFILER_H_ + +// High precision, low overhead time measurements. Returns exact call counts and +// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). 
+// +// Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or +// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. +// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to +// print call counts and average durations [CPU cycles] to stdout, sorted in +// descending order of total duration. + +// If zero, this file has no effect and no measurements will be recorded. +#ifndef PROFILER_ENABLED +#define PROFILER_ENABLED 0 +#endif +#if PROFILER_ENABLED + +#include +#include + +#include +#include + +#include "lib/profiler/tsc_timer.h" + +#if HWY_COMPILER_MSVC +#define PROFILER_PUBLIC +#else +#define PROFILER_PUBLIC __attribute__((visibility("default"))) +#endif + +namespace profiler { + +// Represents zone entry/exit events. POD. +#pragma pack(push, 1) +struct Packet { + // Computing a hash or string table is likely too expensive, and offsets + // from other libraries' string literals can be too large to combine them and + // a full-resolution timestamp into 64 bits. + uint64_t timestamp; + const char* name; // nullptr for exit packets +#if UINTPTR_MAX <= 0xFFFFFFFFu + uint32_t padding; +#endif +}; +#pragma pack(pop) +static_assert(sizeof(Packet) == 16, "Wrong Packet size"); + +class Results; // pImpl + +// Per-thread packet storage, dynamically allocated and aligned. +class ThreadSpecific { + static constexpr size_t kBufferCapacity = 64 / sizeof(Packet); + + public: + PROFILER_PUBLIC explicit ThreadSpecific(); + PROFILER_PUBLIC ~ThreadSpecific(); + + // Depends on Zone => defined out of line. + PROFILER_PUBLIC void ComputeOverhead(); + + HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); } + HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); } + + PROFILER_PUBLIC void AnalyzeRemainingPackets(); + + // Accessors instead of public member for well-defined data layout. 
+ void SetNext(ThreadSpecific* next) { next_ = next; } + ThreadSpecific* GetNext() const { return next_; } + + Results& GetResults() { return *results_; } + + private: + PROFILER_PUBLIC void FlushBuffer(); + + // Write packet to buffer/storage, emptying them as needed. + void Write(const char* name, const uint64_t timestamp) { + if (buffer_size_ == kBufferCapacity) { // Full + FlushBuffer(); + } + buffer_[buffer_size_].name = name; + buffer_[buffer_size_].timestamp = timestamp; + ++buffer_size_; + } + + // Write-combining buffer to avoid cache pollution. Must be the first + // non-static member to ensure cache-line alignment. + Packet buffer_[kBufferCapacity]; + size_t buffer_size_ = 0; + + // Contiguous storage for zone enter/exit packets. + const size_t max_packets_; + hwy::AlignedFreeUniquePtr packets_; + size_t num_packets_; + + // Linked list of all threads. + ThreadSpecific* next_ = nullptr; // Owned, never released. + + hwy::AlignedUniquePtr results_; +}; + +// RAII zone enter/exit recorder constructed by PROFILER_ZONE; also +// responsible for initializing ThreadSpecific. +class Zone { + public: + HWY_NOINLINE explicit Zone(const char* name) { + HWY_FENCE; + ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific(); + if (HWY_UNLIKELY(thread_specific == nullptr)) { + thread_specific = InitThreadSpecific(); + } + + thread_specific->WriteEntry(name); + } + + HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); } + + // Call exactly once after all threads have exited all zones. + PROFILER_PUBLIC static void PrintResults(); + + private: + // Returns reference to the thread's ThreadSpecific pointer (initially null). + // Function-local static avoids needing a separate definition. + static ThreadSpecific*& GetThreadSpecific() { + static thread_local ThreadSpecific* thread_specific; + return thread_specific; + } + + // Non time-critical. 
+ PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific(); +}; + +// Creates a zone starting from here until the end of the current scope. +// Timestamps will be recorded when entering and exiting the zone. +// To ensure the name pointer remains valid, we require it to be a string +// literal (by merging with ""). We also compare strings by address. +#define PROFILER_ZONE(name) \ + HWY_FENCE; \ + const ::profiler::Zone zone("" name); \ + HWY_FENCE + +// Creates a zone for an entire function (when placed at its beginning). +// Shorter/more convenient than ZONE. +#define PROFILER_FUNC \ + HWY_FENCE; \ + const ::profiler::Zone zone(__func__); \ + HWY_FENCE + +#define PROFILER_PRINT_RESULTS ::profiler::Zone::PrintResults + +} // namespace profiler + +#else // !PROFILER_ENABLED +#define PROFILER_ZONE(name) +#define PROFILER_FUNC +#define PROFILER_PRINT_RESULTS() +#endif + +#endif // LIB_PROFILER_PROFILER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/profiler/tsc_timer.h b/codec/L2/demos/jxlEnc/third_partys/lib/profiler/tsc_timer.h new file mode 100644 index 0000000000..61ccd5af59 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/profiler/tsc_timer.h @@ -0,0 +1,131 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_PROFILER_TSC_TIMER_H_ +#define LIB_PROFILER_TSC_TIMER_H_ + +// High-resolution (~10 ns) timestamps, using fences to prevent reordering and +// ensure exactly the desired regions are measured. + +#include + +#include +#include +#include // LoadFence + +namespace profiler { + +// TicksBefore/After return absolute timestamps and must be placed immediately +// before and after the region to measure. The functions are distinct because +// they use different fences. +// +// Background: RDTSC is not 'serializing'; earlier instructions may complete +// after it, and/or later instructions may complete before it. 
'Fences' ensure +// regions' elapsed times are independent of such reordering. The only +// documented unprivileged serializing instruction is CPUID, which acts as a +// full fence (no reordering across it in either direction). Unfortunately +// the latency of CPUID varies wildly (perhaps made worse by not initializing +// its EAX input). Because it cannot reliably be deducted from the region's +// elapsed time, it must not be included in the region to measure (i.e. +// between the two RDTSC). +// +// The newer RDTSCP is sometimes described as serializing, but it actually +// only serves as a half-fence with release semantics. Although all +// instructions in the region will complete before the final timestamp is +// captured, subsequent instructions may leak into the region and increase the +// elapsed time. Inserting another fence after the final RDTSCP would prevent +// such reordering without affecting the measured region. +// +// Fortunately, such a fence exists. The LFENCE instruction is only documented +// to delay later loads until earlier loads are visible. However, Intel's +// reference manual says it acts as a full fence (waiting until all earlier +// instructions have completed, and delaying later instructions until it +// completes). AMD assigns the same behavior to MFENCE. +// +// We need a fence before the initial RDTSC to prevent earlier instructions +// from leaking into the region, and arguably another after RDTSC to avoid +// region instructions from completing before the timestamp is recorded. +// When surrounded by fences, the additional RDTSCP half-fence provides no +// benefit, so the initial timestamp can be recorded via RDTSC, which has +// lower overhead than RDTSCP because it does not read TSC_AUX. In summary, +// we define Before = LFENCE/RDTSC/LFENCE; After = RDTSCP/LFENCE. +// +// Using Before+Before leads to higher variance and overhead than After+After. 
+// However, After+After includes an LFENCE in the region measurements, which +// adds a delay dependent on earlier loads. The combination of Before+After +// is faster than Before+Before and more consistent than Stop+Stop because +// the first LFENCE already delayed subsequent loads before the measured +// region. This combination seems not to have been considered in prior work: +// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c +// +// Note: performance counters can measure 'exact' instructions-retired or +// (unhalted) cycle counts. The RDPMC instruction is not serializing and also +// requires fences. Unfortunately, it is not accessible on all OSes and we +// prefer to avoid kernel-mode drivers. Performance counters are also affected +// by several under/over-count errata, so we use the TSC instead. + +// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, +// divide by InvariantTicksPerSecond. Although 32-bit ticks are faster to read, +// they overflow too quickly to measure long regions. +static HWY_INLINE HWY_MAYBE_UNUSED uint64_t TicksBefore() { + uint64_t t; +#if HWY_ARCH_PPC + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HWY_ARCH_X86_64 && HWY_COMPILER_MSVC + hwy::LoadFence(); + HWY_FENCE; + t = __rdtsc(); + hwy::LoadFence(); + HWY_FENCE; +#elif HWY_ARCH_X86_64 && (HWY_COMPILER_CLANG || HWY_COMPILER_GCC) + asm volatile( + "lfence\n\t" + "rdtsc\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rdx", "memory", "cc"); +#else + // Fall back to OS - unsure how to reliably query cntvct_el0 frequency. 
+ timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + t = ts.tv_sec * 1000000000LL + ts.tv_nsec; +#endif + return t; +} + +static HWY_INLINE HWY_MAYBE_UNUSED uint64_t TicksAfter() { + uint64_t t; +#if HWY_ARCH_X86_64 && HWY_COMPILER_MSVC + HWY_FENCE; + unsigned aux; + t = __rdtscp(&aux); + hwy::LoadFence(); + HWY_FENCE; +#elif HWY_ARCH_X86_64 && (HWY_COMPILER_CLANG || HWY_COMPILER_GCC) + // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). + asm volatile( + "rdtscp\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rcx", "rdx", "memory", "cc"); +#else + t = TicksBefore(); // no difference on other platforms. +#endif + return t; +} + +} // namespace profiler + +#endif // LIB_PROFILER_TSC_TIMER_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/threads/libjxl_threads.pc.in b/codec/L2/demos/jxlEnc/third_partys/lib/threads/libjxl_threads.pc.in new file mode 100644 index 0000000000..8a3275cf1c --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/threads/libjxl_threads.pc.in @@ -0,0 +1,12 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ +includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ + +Name: libjxl_threads +Description: JPEG XL multi-thread runner using std::threads. +Version: @JPEGXL_LIBRARY_VERSION@ +Requires.private: @JPEGXL_THREADS_LIBRARY_REQUIRES@ +Libs: -L${libdir} -ljxl_threads +Libs.private: -lm +Cflags: -I${includedir} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/threads/resizable_parallel_runner.cc b/codec/L2/demos/jxlEnc/third_partys/lib/threads/resizable_parallel_runner.cc new file mode 100644 index 0000000000..1208a3856e --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/threads/resizable_parallel_runner.cc @@ -0,0 +1,195 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "jxl/resizable_parallel_runner.h" + +#include +#include +#include +#include +#include +#include + +namespace jpegxl { +namespace { + +// A thread pool that allows changing the number of threads it runs. It also +// runs tasks on the calling thread, which can work better on schedulers for +// heterogeneous architectures. +struct ResizeableParallelRunner { + void SetNumThreads(size_t num) { + if (num > 0) { + num -= 1; + } + { + std::unique_lock l(state_mutex_); + num_desired_workers_ = num; + workers_can_proceed_.notify_all(); + } + if (workers_.size() < num) { + for (size_t i = workers_.size(); i < num; i++) { + workers_.emplace_back([this, i]() { WorkerBody(i); }); + } + } + if (workers_.size() > num) { + for (size_t i = num; i < workers_.size(); i++) { + workers_[i].join(); + } + workers_.resize(num); + } + } + + ~ResizeableParallelRunner() { SetNumThreads(0); } + + JxlParallelRetCode Run(void* jxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start, + uint32_t end) { + if (start + 1 == end) { + JxlParallelRetCode ret = init(jxl_opaque, 1); + if (ret != 0) return ret; + + func(jxl_opaque, start, 0); + return ret; + } + + size_t num_workers = std::min(workers_.size() + 1, end - start); + JxlParallelRetCode ret = init(jxl_opaque, num_workers); + if (ret != 0) { + return ret; + } + + { + std::unique_lock l(state_mutex_); + // Avoid waking up more workers than needed. 
+ max_running_workers_ = end - start - 1; + next_task_ = start; + end_task_ = end; + func_ = func; + jxl_opaque_ = jxl_opaque; + work_available_ = true; + num_running_workers_++; + workers_can_proceed_.notify_all(); + } + + DequeueTasks(0); + + while (true) { + std::unique_lock l(state_mutex_); + if (num_running_workers_ == 0) break; + work_done_.wait(l); + } + + return ret; + } + + private: + void WorkerBody(size_t worker_id) { + while (true) { + { + std::unique_lock l(state_mutex_); + // Worker pool was reduced, resize down. + if (worker_id >= num_desired_workers_) { + return; + } + // Nothing to do this time. + if (!work_available_ || worker_id >= max_running_workers_) { + workers_can_proceed_.wait(l); + continue; + } + num_running_workers_++; + } + DequeueTasks(worker_id + 1); + } + } + + void DequeueTasks(size_t thread_id) { + while (true) { + uint32_t task = next_task_++; + if (task >= end_task_) { + std::unique_lock l(state_mutex_); + num_running_workers_--; + work_available_ = false; + if (num_running_workers_ == 0) { + work_done_.notify_all(); + } + break; + } + func_(jxl_opaque_, task, thread_id); + } + } + + // Checks when the worker has something to do, which can be one of: + // - quitting (when worker_id >= num_desired_workers_) + // - having work available for them (work_available_ is true and worker_id >= + // max_running_workers_) + std::condition_variable workers_can_proceed_; + + // Workers are done, and the main thread can proceed (num_running_workers_ == + // 0) + std::condition_variable work_done_; + + std::vector workers_; + + // Protects all the remaining variables, except for func_, jxl_opaque_ and + // end_task_ (for which only the write by the main thread is protected, and + // subsequent uses by workers happen-after it) and next_task_ (which is + // atomic). + std::mutex state_mutex_; + + // Range of tasks still need to be done. + std::atomic next_task_; + uint32_t end_task_; + + // Function to run and its argument. 
+ JxlParallelRunFunction func_; + void* jxl_opaque_; // not owned + + // Variables that control the workers: + // - work_available_ is set to true after a call to Run() and to false at the + // end of it. + // - num_desired_workers_ represents the number of workers that should be + // present. + // - max_running_workers_ represents the number of workers that should be + // executing tasks. + // - num_running_workers_ represents the number of workers that are executing + // tasks. + size_t num_desired_workers_ = 0; + size_t max_running_workers_ = 0; + size_t num_running_workers_ = 0; + bool work_available_ = false; +}; +} // namespace +} // namespace jpegxl + +extern "C" { +JXL_THREADS_EXPORT JxlParallelRetCode JxlResizableParallelRunner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) { + return static_cast(runner_opaque) + ->Run(jpegxl_opaque, init, func, start_range, end_range); +} + +JXL_THREADS_EXPORT void* JxlResizableParallelRunnerCreate( + const JxlMemoryManager* memory_manager) { + return new jpegxl::ResizeableParallelRunner(); +} + +JXL_THREADS_EXPORT void JxlResizableParallelRunnerSetThreads( + void* runner_opaque, size_t num_threads) { + static_cast(runner_opaque) + ->SetNumThreads(num_threads); +} + +JXL_THREADS_EXPORT void JxlResizableParallelRunnerDestroy(void* runner_opaque) { + delete static_cast(runner_opaque); +} + +JXL_THREADS_EXPORT uint32_t +JxlResizableParallelRunnerSuggestThreads(uint64_t xsize, uint64_t ysize) { + // ~one thread per group. 
+ return std::min(std::thread::hardware_concurrency(), + xsize * ysize / (256 * 256)); +} +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner.cc b/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner.cc new file mode 100644 index 0000000000..b9cf4aa6cd --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner.cc @@ -0,0 +1,101 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "jxl/thread_parallel_runner.h" + +#include + +#include "lib/threads/thread_parallel_runner_internal.h" + +namespace { + +// Default JxlMemoryManager using malloc and free for the jpegxl_threads +// library. Same as the default JxlMemoryManager for the jpegxl library +// itself. + +// Default alloc and free functions. +void* ThreadMemoryManagerDefaultAlloc(void* opaque, size_t size) { + return malloc(size); +} + +void ThreadMemoryManagerDefaultFree(void* opaque, void* address) { + free(address); +} + +// Initializes the memory manager instance with the passed one. The +// MemoryManager passed in |memory_manager| may be NULL or contain NULL +// functions which will be initialized with the default ones. If either alloc +// or free are NULL, then both must be NULL, otherwise this function returns an +// error. 
+bool ThreadMemoryManagerInit(JxlMemoryManager* self, + const JxlMemoryManager* memory_manager) { + if (memory_manager) { + *self = *memory_manager; + } else { + memset(self, 0, sizeof(*self)); + } + if (!self->alloc != !self->free) { + return false; + } + if (!self->alloc) self->alloc = ThreadMemoryManagerDefaultAlloc; + if (!self->free) self->free = ThreadMemoryManagerDefaultFree; + + return true; +} + +void* ThreadMemoryManagerAlloc(const JxlMemoryManager* memory_manager, + size_t size) { + return memory_manager->alloc(memory_manager->opaque, size); +} + +void ThreadMemoryManagerFree(const JxlMemoryManager* memory_manager, + void* address) { + return memory_manager->free(memory_manager->opaque, address); +} + +} // namespace + +JxlParallelRetCode JxlThreadParallelRunner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) { + return jpegxl::ThreadParallelRunner::Runner( + runner_opaque, jpegxl_opaque, init, func, start_range, end_range); +} + +/// Starts the given number of worker threads and blocks until they are ready. +/// "num_worker_threads" defaults to one per hyperthread. If zero, all tasks +/// run on the main thread. 
+void* JxlThreadParallelRunnerCreate(const JxlMemoryManager* memory_manager, + size_t num_worker_threads) { + JxlMemoryManager local_memory_manager; + if (!ThreadMemoryManagerInit(&local_memory_manager, memory_manager)) + return nullptr; + + void* alloc = ThreadMemoryManagerAlloc(&local_memory_manager, + sizeof(jpegxl::ThreadParallelRunner)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + jpegxl::ThreadParallelRunner* runner = + new (alloc) jpegxl::ThreadParallelRunner(num_worker_threads); + runner->memory_manager = local_memory_manager; + + return runner; +} + +void JxlThreadParallelRunnerDestroy(void* runner_opaque) { + jpegxl::ThreadParallelRunner* runner = + reinterpret_cast(runner_opaque); + if (runner) { + // Call destructor directly since custom free function is used. + runner->~ThreadParallelRunner(); + ThreadMemoryManagerFree(&runner->memory_manager, runner); + } +} + +// Get default value for num_worker_threads parameter of +// InitJxlThreadParallelRunner. +size_t JxlThreadParallelRunnerDefaultNumWorkerThreads() { + return std::thread::hardware_concurrency(); +} diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc b/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc new file mode 100644 index 0000000000..5ceede42af --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.cc @@ -0,0 +1,217 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "lib/threads/thread_parallel_runner_internal.h" + +#include + +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) +#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace +#endif // defined(*_SANITIZER) + +#include "jxl/thread_parallel_runner.h" +#include "lib/jxl/base/profiler.h" + +namespace { + +// Exits the program after printing a stack trace when possible. +bool Abort() { +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) + // If compiled with any sanitizer print a stack trace. This call doesn't crash + // the program, instead the trap below will crash it also allowing gdb to + // break there. + __sanitizer_print_stack_trace(); +#endif // defined(*_SANITIZER) + +#ifdef _MSC_VER + __debugbreak(); + abort(); +#else + __builtin_trap(); +#endif +} + +// Does not guarantee running the code, use only for debug mode checks. +#if JXL_ENABLE_ASSERT +#define JXL_ASSERT(condition) \ + do { \ + if (!(condition)) { \ + Abort(); \ + } \ + } while (0) +#else +#define JXL_ASSERT(condition) \ + do { \ + } while (0) +#endif +} // namespace + +namespace jpegxl { + +// static +JxlParallelRetCode ThreadParallelRunner::Runner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) { + ThreadParallelRunner* self = + static_cast(runner_opaque); + if (start_range > end_range) return -1; + if (start_range == end_range) return 0; + + int ret = init(jpegxl_opaque, std::max(self->num_worker_threads_, 1)); + if (ret != 0) return ret; + + // Use a sequential run when num_worker_threads_ is zero since we have no + // worker threads. 
+ if (self->num_worker_threads_ == 0) { + const size_t thread = 0; + for (uint32_t task = start_range; task < end_range; ++task) { + func(jpegxl_opaque, task, thread); + } + return 0; + } + + if (self->depth_.fetch_add(1, std::memory_order_acq_rel) != 0) { + return -1; // Must not re-enter. + } + + const WorkerCommand worker_command = + (static_cast(start_range) << 32) + end_range; + // Ensure the inputs do not result in a reserved command. + JXL_ASSERT(worker_command != kWorkerWait); + JXL_ASSERT(worker_command != kWorkerOnce); + JXL_ASSERT(worker_command != kWorkerExit); + + self->data_func_ = func; + self->jpegxl_opaque_ = jpegxl_opaque; + self->num_reserved_.store(0, std::memory_order_relaxed); + + self->StartWorkers(worker_command); + self->WorkersReadyBarrier(); + + if (self->depth_.fetch_add(-1, std::memory_order_acq_rel) != 1) { + return -1; + } + return 0; +} + +// static +void ThreadParallelRunner::RunRange(ThreadParallelRunner* self, + const WorkerCommand command, + const int thread) { + const uint32_t begin = command >> 32; + const uint32_t end = command & 0xFFFFFFFF; + const uint32_t num_tasks = end - begin; + const uint32_t num_worker_threads = self->num_worker_threads_; + + // OpenMP introduced several "schedule" strategies: + // "single" (static assignment of exactly one chunk per thread): slower. + // "dynamic" (allocates k tasks at a time): competitive for well-chosen k. + // "guided" (allocates k tasks, decreases k): computing k = remaining/n + // is faster than halving k each iteration. We prefer this strategy + // because it avoids user-specified parameters. 
+ + for (;;) { +#if 0 + // dynamic + const uint32_t my_size = std::max(num_tasks / (num_worker_threads * 4), 1); +#else + // guided + const uint32_t num_reserved = + self->num_reserved_.load(std::memory_order_relaxed); + const uint32_t num_remaining = num_tasks - num_reserved; + const uint32_t my_size = + std::max(num_remaining / (num_worker_threads * 4), 1u); +#endif + const uint32_t my_begin = begin + self->num_reserved_.fetch_add( + my_size, std::memory_order_relaxed); + const uint32_t my_end = std::min(my_begin + my_size, begin + num_tasks); + // Another thread already reserved the last task. + if (my_begin >= my_end) { + break; + } + for (uint32_t task = my_begin; task < my_end; ++task) { + self->data_func_(self->jpegxl_opaque_, task, thread); + } + } +} + +// static +void ThreadParallelRunner::ThreadFunc(ThreadParallelRunner* self, + const int thread) { + // Until kWorkerExit command received: + for (;;) { + std::unique_lock lock(self->mutex_); + // Notify main thread that this thread is ready. + if (++self->workers_ready_ == self->num_threads_) { + self->workers_ready_cv_.notify_one(); + } + RESUME_WAIT: + // Wait for a command. + self->worker_start_cv_.wait(lock); + const WorkerCommand command = self->worker_start_command_; + switch (command) { + case kWorkerWait: // spurious wakeup: + goto RESUME_WAIT; // lock still held, avoid incrementing ready. + case kWorkerOnce: + lock.unlock(); + self->data_func_(self->jpegxl_opaque_, thread, thread); + break; + case kWorkerExit: + return; // exits thread + default: + lock.unlock(); + RunRange(self, command, thread); + break; + } + } +} + +ThreadParallelRunner::ThreadParallelRunner(const int num_worker_threads) +#if defined(__EMSCRIPTEN__) + : num_worker_threads_(0), num_threads_(1) { + // TODO(eustas): find out if pthreads would work for us. 
+ (void)num_worker_threads; +#else + : num_worker_threads_(num_worker_threads), + num_threads_(std::max(num_worker_threads, 1)) { +#endif + PROFILER_ZONE("ThreadParallelRunner ctor"); + + threads_.reserve(num_worker_threads_); + + // Suppress "unused-private-field" warning. + (void)padding1; + (void)padding2; + + // Safely handle spurious worker wakeups. + worker_start_command_ = kWorkerWait; + + for (uint32_t i = 0; i < num_worker_threads_; ++i) { + threads_.emplace_back(ThreadFunc, this, i); + } + + if (num_worker_threads_ != 0) { + WorkersReadyBarrier(); + } + + // Warm up profiler on worker threads so its expensive initialization + // doesn't count towards other timer measurements. + RunOnEachThread( + [](const int task, const int thread) { PROFILER_ZONE("@InitWorkers"); }); +} + +ThreadParallelRunner::~ThreadParallelRunner() { + if (num_worker_threads_ != 0) { + StartWorkers(kWorkerExit); + } + + for (std::thread& thread : threads_) { + JXL_ASSERT(thread.joinable()); + thread.join(); + } +} +} // namespace jpegxl diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.h b/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.h new file mode 100644 index 0000000000..372c6a8950 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_internal.h @@ -0,0 +1,172 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +// C++ implementation using std::thread of a ::JxlParallelRunner. + +// The main class in this module, ThreadParallelRunner, implements a static +// method ThreadParallelRunner::Runner that can be passed as a +// JxlParallelRunner when using the JPEG XL library. This uses std::thread +// internally and related synchronization functions.
The number of threads +// created is fixed at construction time and the threads are re-used for every +// ThreadParallelRunner::Runner call. Only one concurrent Runner() call per +// instance is allowed at a time. +// +// This is a scalable, lower-overhead thread pool runner, especially suitable +// for data-parallel computations in the fork-join model, where clients need to +// know when all tasks have completed. +// +// This thread pool can efficiently load-balance millions of tasks using an +// atomic counter, thus avoiding per-task virtual or system calls. With 48 +// hyperthreads and 1M tasks that add to an atomic counter, overall runtime is +// 10-20x higher when using std::async, and ~200x for a queue-based thread +// pool. +// +// Usage: +// ThreadParallelRunner runner; +// JxlDecode( +// ... , &ThreadParallelRunner::Runner, static_cast(&runner)); + +#ifndef LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_ +#define LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_ + +#include +#include +#include + +#include +#include //NOLINT +#include //NOLINT +#include //NOLINT +#include + +#include "jxl/memory_manager.h" +#include "jxl/parallel_runner.h" + +namespace jpegxl { + +// Main helper class implementing the ::JxlParallelRunner interface. +class ThreadParallelRunner { + public: + // ::JxlParallelRunner interface. + static JxlParallelRetCode Runner(void* runner_opaque, void* jpegxl_opaque, + JxlParallelRunInit init, + JxlParallelRunFunction func, + uint32_t start_range, uint32_t end_range); + + // Starts the given number of worker threads and blocks until they are ready. + // "num_worker_threads" defaults to one per hyperthread. If zero, all tasks + // run on the main thread. + explicit ThreadParallelRunner( + int num_worker_threads = std::thread::hardware_concurrency()); + + // Waits for all threads to exit. + ~ThreadParallelRunner(); + + // Returns number of worker threads created (some may be sleeping and never + // wake up in time to participate in Run). 
Useful for characterizing + // performance; 0 means "run on main thread". + size_t NumWorkerThreads() const { return num_worker_threads_; } + + // Returns maximum number of main/worker threads that may call Func. Useful + // for allocating per-thread storage. + size_t NumThreads() const { return num_threads_; } + + // Runs func(thread, thread) on all thread(s) that may participate in Run. + // If NumWorkerThreads() == 0, runs on the main thread with thread == 0, + // otherwise concurrently called by each worker thread in [0, NumThreads()). + template + void RunOnEachThread(const Func& func) { + if (num_worker_threads_ == 0) { + const int thread = 0; + func(thread, thread); + return; + } + + data_func_ = reinterpret_cast(&CallClosure); + jpegxl_opaque_ = const_cast(static_cast(&func)); + StartWorkers(kWorkerOnce); + WorkersReadyBarrier(); + } + + JxlMemoryManager memory_manager; + + private: + // After construction and between calls to Run, workers are "ready", i.e. + // waiting on worker_start_cv_. They are "started" by sending a "command" + // and notifying all worker_start_cv_ waiters. (That is why all workers + // must be ready/waiting - otherwise, the notification will not reach all of + // them and the main thread waits in vain for them to report readiness.) + using WorkerCommand = uint64_t; + + // Special values; all others encode the begin/end parameters. Note that all + // these are no-op ranges (begin >= end) and therefore never used to encode + // ranges. + static constexpr WorkerCommand kWorkerWait = ~1ULL; + static constexpr WorkerCommand kWorkerOnce = ~2ULL; + static constexpr WorkerCommand kWorkerExit = ~3ULL; + + // Calls f(task, thread). Used for type erasure of Func arguments. The + // signature must match JxlParallelRunFunction, hence a void* argument.
+ template + static void CallClosure(void* f, const uint32_t task, const size_t thread) { + (*reinterpret_cast(f))(task, thread); + } + + void WorkersReadyBarrier() { + std::unique_lock lock(mutex_); + // Typically only a single iteration. + while (workers_ready_ != threads_.size()) { + workers_ready_cv_.wait(lock); + } + workers_ready_ = 0; + + // Safely handle spurious worker wakeups. + worker_start_command_ = kWorkerWait; + } + + // Precondition: all workers are ready. + void StartWorkers(const WorkerCommand worker_command) { + mutex_.lock(); + worker_start_command_ = worker_command; + // Workers will need this lock, so release it before they wake up. + mutex_.unlock(); + worker_start_cv_.notify_all(); + } + + // Attempts to reserve and perform some work from the global range of tasks, + // which is encoded within "command". Returns after all tasks are reserved. + static void RunRange(ThreadParallelRunner* self, const WorkerCommand command, + const int thread); + + static void ThreadFunc(ThreadParallelRunner* self, int thread); + + // Unmodified after ctor, but cannot be const because we call thread::join(). + std::vector threads_; + + const uint32_t num_worker_threads_; // == threads_.size() + const uint32_t num_threads_; + + std::atomic depth_{0}; // detects if Run is re-entered (not supported). + + std::mutex mutex_; // guards both cv and their variables. + std::condition_variable workers_ready_cv_; + uint32_t workers_ready_ = 0; + std::condition_variable worker_start_cv_; + WorkerCommand worker_start_command_; + + // Written by main thread, read by workers (after mutex lock/unlock). + JxlParallelRunFunction data_func_; + void* jpegxl_opaque_; + + // Updated by workers; padding avoids false sharing. 
+ uint8_t padding1[64]; + std::atomic num_reserved_{0}; + uint8_t padding2[64]; +}; + +} // namespace jpegxl + +#endif // LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_ diff --git a/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_test.cc b/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_test.cc new file mode 100644 index 0000000000..7ff260e2f1 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/lib/threads/thread_parallel_runner_test.cc @@ -0,0 +1,115 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "gtest/gtest.h" +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/base/thread_pool_internal.h" + +namespace jpegxl { +namespace { + +int PopulationCount(uint64_t bits) { + int num_set = 0; + while (bits != 0) { + num_set += bits & 1; + bits >>= 1; + } + return num_set; +} + +// Ensures task parameter is in bounds, every parameter is reached, +// pool can be reused (multiple consecutive Run calls), pool can be destroyed +// (joining with its threads), num_threads=0 works (runs on current thread). +TEST(ThreadParallelRunnerTest, TestPool) { + for (int num_threads = 0; num_threads <= 18; ++num_threads) { + jxl::ThreadPoolInternal pool(num_threads); + for (int num_tasks = 0; num_tasks < 32; ++num_tasks) { + std::vector mementos(num_tasks); + for (int begin = 0; begin < 32; ++begin) { + std::fill(mementos.begin(), mementos.end(), 0); + pool.Run( + begin, begin + num_tasks, jxl::ThreadPool::SkipInit(), + [begin, num_tasks, &mementos](const int task, const int thread) { + // Parameter is in the given range + EXPECT_GE(task, begin); + EXPECT_LT(task, begin + num_tasks); + + // Store mementos to be sure we visited each task. 
+ mementos.at(task - begin) = 1000 + task; + }); + for (int task = begin; task < begin + num_tasks; ++task) { + EXPECT_EQ(1000 + task, mementos.at(task - begin)); + } + } + } + } +} + +// Verify "thread" parameter when processing few tasks. +TEST(ThreadParallelRunnerTest, TestSmallAssignments) { + // WARNING: cumulative total threads must not exceed profiler.h kMaxThreads. + const int kMaxThreads = 8; + for (int num_threads = 1; num_threads <= kMaxThreads; ++num_threads) { + jxl::ThreadPoolInternal pool(num_threads); + + // (Avoid mutex because it may perturb the worker thread scheduling) + std::atomic id_bits{0}; + std::atomic num_calls{0}; + + pool.Run( + 0, num_threads, jxl::ThreadPool::SkipInit(), + [&num_calls, num_threads, &id_bits](const int task, const int thread) { + num_calls.fetch_add(1, std::memory_order_relaxed); + + EXPECT_LT(thread, num_threads); + uint64_t bits = id_bits.load(std::memory_order_relaxed); + while ( + !id_bits.compare_exchange_weak(bits, bits | (1ULL << thread))) { + } + }); + + // Correct number of tasks. + EXPECT_EQ(num_threads, num_calls.load()); + + const int num_participants = PopulationCount(id_bits.load()); + // Can't expect equality because other workers may have woken up too late. + EXPECT_LE(num_participants, num_threads); + } +} + +struct Counter { + Counter() { + // Suppress "unused-field" warning. 
+ (void)padding; + } + void Assimilate(const Counter& victim) { counter += victim.counter; } + int counter = 0; + int padding[31]; +}; + +TEST(ThreadParallelRunnerTest, TestCounter) { + const int kNumThreads = 12; + jxl::ThreadPoolInternal pool(kNumThreads); + alignas(128) Counter counters[kNumThreads]; + + const int kNumTasks = kNumThreads * 19; + pool.Run(0, kNumTasks, jxl::ThreadPool::SkipInit(), + [&counters](const int task, const int thread) { + counters[thread].counter += task; + }); + + int expected = 0; + for (int i = 0; i < kNumTasks; ++i) { + expected += i; + } + + for (int i = 1; i < kNumThreads; ++i) { + counters[0].Assimilate(counters[i]); + } + EXPECT_EQ(expected, counters[0].counter); +} + +} // namespace +} // namespace jpegxl diff --git a/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/LICENSE b/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/LICENSE new file mode 100644 index 0000000000..33b7cdd2db --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/constants.h b/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/constants.h new file mode 100644 index 0000000000..f6e44dc7b7 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/constants.h @@ -0,0 +1,184 @@ +/* Copyright 2016 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +/** + * @file + * Common constants used in decoder and encoder API. + */ + +#ifndef BROTLI_COMMON_CONSTANTS_H_ +#define BROTLI_COMMON_CONSTANTS_H_ + +#include "./platform.h" +#include + +/* Specification: 7.3. Encoding of the context map */ +#define BROTLI_CONTEXT_MAP_MAX_RLE 16 + +/* Specification: 2. Compressed representation overview */ +#define BROTLI_MAX_NUMBER_OF_BLOCK_TYPES 256 + +/* Specification: 3.3. Alphabet sizes: insert-and-copy length */ +#define BROTLI_NUM_LITERAL_SYMBOLS 256 +#define BROTLI_NUM_COMMAND_SYMBOLS 704 +#define BROTLI_NUM_BLOCK_LEN_SYMBOLS 26 +#define BROTLI_MAX_CONTEXT_MAP_SYMBOLS (BROTLI_MAX_NUMBER_OF_BLOCK_TYPES + \ + BROTLI_CONTEXT_MAP_MAX_RLE) +#define BROTLI_MAX_BLOCK_TYPE_SYMBOLS (BROTLI_MAX_NUMBER_OF_BLOCK_TYPES + 2) + +/* Specification: 3.5. Complex prefix codes */ +#define BROTLI_REPEAT_PREVIOUS_CODE_LENGTH 16 +#define BROTLI_REPEAT_ZERO_CODE_LENGTH 17 +#define BROTLI_CODE_LENGTH_CODES (BROTLI_REPEAT_ZERO_CODE_LENGTH + 1) +/* "code length of 8 is repeated" */ +#define BROTLI_INITIAL_REPEATED_CODE_LENGTH 8 + +/* "Large Window Brotli" */ + +/** + * The theoretical maximum number of distance bits specified for large window + * brotli, for 64-bit encoders and decoders. 
Even when in practice 32-bit + * encoders and decoders only support up to 30 max distance bits, the value is + * set to 62 because it affects the large window brotli file format. + * Specifically, it affects the encoding of simple huffman tree for distances, + * see Specification RFC 7932 chapter 3.4. + */ +#define BROTLI_LARGE_MAX_DISTANCE_BITS 62U +#define BROTLI_LARGE_MIN_WBITS 10 +/** + * The maximum supported large brotli window bits by the encoder and decoder. + * Large window brotli allows up to 62 bits, however the current encoder and + * decoder, designed for 32-bit integers, only support up to 30 bits maximum. + */ +#define BROTLI_LARGE_MAX_WBITS 30 + +/* Specification: 4. Encoding of distances */ +#define BROTLI_NUM_DISTANCE_SHORT_CODES 16 +/** + * Maximal number of "postfix" bits. + * + * Number of "postfix" bits is stored as 2 bits in meta-block header. + */ +#define BROTLI_MAX_NPOSTFIX 3 +#define BROTLI_MAX_NDIRECT 120 +#define BROTLI_MAX_DISTANCE_BITS 24U +#define BROTLI_DISTANCE_ALPHABET_SIZE(NPOSTFIX, NDIRECT, MAXNBITS) ( \ + BROTLI_NUM_DISTANCE_SHORT_CODES + (NDIRECT) + \ + ((MAXNBITS) << ((NPOSTFIX) + 1))) +/* BROTLI_NUM_DISTANCE_SYMBOLS == 1128 */ +#define BROTLI_NUM_DISTANCE_SYMBOLS \ + BROTLI_DISTANCE_ALPHABET_SIZE( \ + BROTLI_MAX_NDIRECT, BROTLI_MAX_NPOSTFIX, BROTLI_LARGE_MAX_DISTANCE_BITS) + +/* ((1 << 26) - 4) is the maximal distance that can be expressed in RFC 7932 + brotli stream using NPOSTFIX = 0 and NDIRECT = 0. With other NPOSTFIX and + NDIRECT values distances up to ((1 << 29) + 88) could be expressed. */ +#define BROTLI_MAX_DISTANCE 0x3FFFFFC + +/* ((1 << 31) - 4) is the safe distance limit. Using this number as a limit + allows safe distance calculation without overflows, given the distance + alphabet size is limited to corresponding size + (see kLargeWindowDistanceCodeLimits). */ +#define BROTLI_MAX_ALLOWED_DISTANCE 0x7FFFFFFC + +/* 7.1. 
Context modes and context ID lookup for literals */ +/* "context IDs for literals are in the range of 0..63" */ +#define BROTLI_LITERAL_CONTEXT_BITS 6 + +/* 7.2. Context ID for distances */ +#define BROTLI_DISTANCE_CONTEXT_BITS 2 + +/* 9.1. Format of the Stream Header */ +/* Number of slack bytes for window size. Don't confuse + with BROTLI_NUM_DISTANCE_SHORT_CODES. */ +#define BROTLI_WINDOW_GAP 16 +#define BROTLI_MAX_BACKWARD_LIMIT(W) (((size_t)1 << (W)) - BROTLI_WINDOW_GAP) + +typedef struct BrotliDistanceCodeLimit { + uint32_t max_alphabet_size; + uint32_t max_distance; +} BrotliDistanceCodeLimit; + +/* This function calculates maximal size of distance alphabet, such that the + distances greater than the given values can not be represented. + + These limits are designed to support fast and safe 32-bit decoders. + "32-bit" means that signed integer values up to ((1 << 31) - 1) could be + safely expressed. + + Brotli distance alphabet symbols do not represent consecutive distance + ranges. Each distance alphabet symbol (excluding direct distances and short + codes), represents interleaved (for NPOSTFIX > 0) range of distances. + A "group" of consecutive (1 << NPOSTFIX) symbols represent non-interleaved + range. Two consecutive groups require the same amount of "extra bits". + + It is important that distance alphabet represents complete "groups". + To avoid complex logic on encoder side about interleaved ranges + it was decided to restrict both sides to complete distance code "groups". + */ +BROTLI_UNUSED_FUNCTION BrotliDistanceCodeLimit BrotliCalculateDistanceCodeLimit( + uint32_t max_distance, uint32_t npostfix, uint32_t ndirect) { + BrotliDistanceCodeLimit result; + /* Marking this function as unused, because not all files + including "constants.h" use it -> compiler warns about that. */ + BROTLI_UNUSED(&BrotliCalculateDistanceCodeLimit); + if (max_distance <= ndirect) { + /* This case never happens / exists only for the sake of completeness.
*/ + result.max_alphabet_size = max_distance + BROTLI_NUM_DISTANCE_SHORT_CODES; + result.max_distance = max_distance; + return result; + } else { + /* The first prohibited value. */ + uint32_t forbidden_distance = max_distance + 1; + /* Subtract "directly" encoded region. */ + uint32_t offset = forbidden_distance - ndirect - 1; + uint32_t ndistbits = 0; + uint32_t tmp; + uint32_t half; + uint32_t group; + /* Postfix for the last dcode in the group. */ + uint32_t postfix = (1u << npostfix) - 1; + uint32_t extra; + uint32_t start; + /* Remove postfix and "head-start". */ + offset = (offset >> npostfix) + 4; + /* Calculate the number of distance bits. */ + tmp = offset / 2; + /* Poor-man's log2floor, to avoid extra dependencies. */ + while (tmp != 0) {ndistbits++; tmp = tmp >> 1;} + /* One bit is covered with subrange addressing ("half"). */ + ndistbits--; + /* Find subrange. */ + half = (offset >> ndistbits) & 1; + /* Calculate the "group" part of dcode. */ + group = ((ndistbits - 1) << 1) | half; + /* Calculated "group" covers the prohibited distance value. */ + if (group == 0) { + /* This case is added for correctness; does not occur for limit > 128. */ + result.max_alphabet_size = ndirect + BROTLI_NUM_DISTANCE_SHORT_CODES; + result.max_distance = ndirect; + return result; + } + /* Decrement "group", so it is the last permitted "group". */ + group--; + /* After group was decremented, ndistbits and half must be recalculated. */ + ndistbits = (group >> 1) + 1; + /* The last available distance in the subrange has all extra bits set. */ + extra = (1u << ndistbits) - 1; + /* Calculate region start. NB: ndistbits >= 1. */ + start = (1u << (ndistbits + 1)) - 4; + /* Move to subregion. */ + start += (group & 1) << ndistbits; + /* Calculate the alphabet size. */ + result.max_alphabet_size = ((group << npostfix) | postfix) + ndirect + + BROTLI_NUM_DISTANCE_SHORT_CODES + 1; + /* Calculate the maximal distance representable by alphabet. 
*/ + result.max_distance = ((start + extra) << npostfix) + postfix + ndirect + 1; + return result; + } +} + +#endif /* BROTLI_COMMON_CONSTANTS_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/context.h b/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/context.h new file mode 100755 index 0000000000..24b3eb48f5 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/context.h @@ -0,0 +1,261 @@ +/* Copyright 2013 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +/* Lookup table to map the previous two bytes to a context id. + + There are four different context modeling modes defined here: + CONTEXT_LSB6: context id is the least significant 6 bits of the last byte, + CONTEXT_MSB6: context id is the most significant 6 bits of the last byte, + CONTEXT_UTF8: second-order context model tuned for UTF8-encoded text, + CONTEXT_SIGNED: second-order context model tuned for signed integers. + + If |p1| and |p2| are the previous two bytes, and |mode| is current context + mode, we calculate the context as: + + context = ContextLut(mode)[p1] | ContextLut(mode)[p2 + 256]. + + For CONTEXT_UTF8 mode, if the previous two bytes are ASCII characters + (i.e. < 128), this will be equivalent to + + context = 4 * context1(p1) + context2(p2), + + where context1 is based on the previous byte in the following way: + + 0 : non-ASCII control + 1 : \t, \n, \r + 2 : space + 3 : other punctuation + 4 : " ' + 5 : % + 6 : ( < [ { + 7 : ) > ] } + 8 : , ; : + 9 : . 
+ 10 : = + 11 : number + 12 : upper-case vowel + 13 : upper-case consonant + 14 : lower-case vowel + 15 : lower-case consonant + + and context2 is based on the second last byte: + + 0 : control, space + 1 : punctuation + 2 : upper-case letter, number + 3 : lower-case letter + + If the last byte is ASCII, and the second last byte is not (in a valid UTF8 + stream it will be a continuation byte, value between 128 and 191), the + context is the same as if the second last byte was an ASCII control or space. + + If the last byte is a UTF8 lead byte (value >= 192), then the next byte will + be a continuation byte and the context id is 2 or 3 depending on the LSB of + the last byte and to a lesser extent on the second last byte if it is ASCII. + + If the last byte is a UTF8 continuation byte, the second last byte can be: + - continuation byte: the next byte is probably ASCII or lead byte (assuming + 4-byte UTF8 characters are rare) and the context id is 0 or 1. + - lead byte (192 - 207): next byte is ASCII or lead byte, context is 0 or 1 + - lead byte (208 - 255): next byte is continuation byte, context is 2 or 3 + + The possible value combinations of the previous two bytes, the range of + context ids and the type of the next byte is summarized in the table below: + + |--------\-----------------------------------------------------------------| + | \ Last byte | + | Second \---------------------------------------------------------------| + | last byte \ ASCII | cont. byte | lead byte | + | \ (0-127) | (128-191) | (192-) | + |=============|===================|=====================|==================| + | ASCII | next: ASCII/lead | not valid | next: cont. | + | (0-127) | context: 4 - 63 | | context: 2 - 3 | + |-------------|-------------------|---------------------|------------------| + | cont. byte | next: ASCII/lead | next: ASCII/lead | next: cont. 
| + | (128-191) | context: 4 - 63 | context: 0 - 1 | context: 2 - 3 | + |-------------|-------------------|---------------------|------------------| + | lead byte | not valid | next: ASCII/lead | not valid | + | (192-207) | | context: 0 - 1 | | + |-------------|-------------------|---------------------|------------------| + | lead byte | not valid | next: cont. | not valid | + | (208-) | | context: 2 - 3 | | + |-------------|-------------------|---------------------|------------------| +*/ + +#ifndef BROTLI_COMMON_CONTEXT_H_ +#define BROTLI_COMMON_CONTEXT_H_ + +#include + +typedef enum ContextType { + CONTEXT_LSB6 = 0, + CONTEXT_MSB6 = 1, + CONTEXT_UTF8 = 2, + CONTEXT_SIGNED = 3 +} ContextType; + +/* Common context lookup table for all context modes. */ +static const uint8_t kContextLookup[2048] = { + /* CONTEXT_LSB6, last byte. */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + + /* CONTEXT_LSB6, second last byte, */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + /* CONTEXT_MSB6, last byte. */ + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, + 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, + 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, + 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, + 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, + 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43, + 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, + 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, + 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, + 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, + 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63, + + /* CONTEXT_MSB6, second last byte, */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + /* CONTEXT_UTF8, last byte. */ + /* ASCII range. */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12, + 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48, + 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12, + 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56, + 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0, + /* UTF8 continuation byte range. */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + /* UTF8 lead byte range. */ + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + + /* CONTEXT_UTF8 second last byte. */ + /* ASCII range. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0, + /* UTF8 continuation byte range. */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* UTF8 lead byte range. */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + + /* CONTEXT_SIGNED, last byte, same as the above values shifted by 3 bits. */ + 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 56, + + /* 
CONTEXT_SIGNED, second last byte. */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, +}; + +typedef const uint8_t* ContextLut; + +/* typeof(MODE) == ContextType; returns ContextLut */ +#define BROTLI_CONTEXT_LUT(MODE) (&kContextLookup[(MODE) << 9]) + +/* typeof(LUT) == ContextLut */ +#define BROTLI_CONTEXT(P1, P2, LUT) ((LUT)[P1] | ((LUT) + 256)[P2]) + +#endif /* BROTLI_COMMON_CONTEXT_H_ */ diff --git a/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.bin b/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.bin new file mode 100644 index 0000000000..a585c0e292 --- /dev/null +++ b/codec/L2/demos/jxlEnc/third_partys/third_party/brotli/c/common/dictionary.bin @@ -0,0 +1,432 @@ +timedownlifeleftbackcodedatashowonlysitecityopenjustlikefreeworktextyearoverbodyloveformbookplaylivelinehelphomesidemorewordlongthemviewfindpagedaysfullheadtermeachareafromtruemarkableuponhighdatelandnewsevennextcasebothpostusedmadehandherewhatnameLinkblogsizebaseheldmakemainuser') 
+holdendswithNewsreadweresigntakehavegameseencallpathwellplusmenufilmpartjointhislistgoodneedwayswestjobsmindalsologorichuseslastteamarmyfoodkingwilleastwardbestfirePageknowaway.pngmovethanloadgiveselfnotemuchfeedmanyrockicononcelookhidediedHomerulehostajaxinfoclublawslesshalfsomesuchzone100%onescareTimeracebluefourweekfacehopegavehardlostwhenparkkeptpassshiproomHTMLplanTypedonesavekeepflaglinksoldfivetookratetownjumpthusdarkcardfilefearstaykillthatfallautoever.comtalkshopvotedeepmoderestturnbornbandfellroseurl(skinrolecomeactsagesmeetgold.jpgitemvaryfeltthensenddropViewcopy1.0"stopelseliestourpack.gifpastcss?graymean>rideshotlatesaidroadvar feeljohnrickportfast'UA-deadpoorbilltypeU.S.woodmust2px;Inforankwidewantwalllead[0];paulwavesure$('#waitmassarmsgoesgainlangpaid!-- lockunitrootwalkfirmwifexml"songtest20pxkindrowstoolfontmailsafestarmapscorerainflowbabyspansays4px;6px;artsfootrealwikiheatsteptriporg/lakeweaktoldFormcastfansbankveryrunsjulytask1px;goalgrewslowedgeid="sets5px;.js?40pxif (soonseatnonetubezerosentreedfactintogiftharm18pxcamehillboldzoomvoideasyringfillpeakinitcost3px;jacktagsbitsrolleditknewnearironfreddiskwentsoilputs/js/holyT22:ISBNT20:adamsees

json', 'contT21: RSSloopasiamoon

soulLINEfortcartT14:

80px!--<9px;T04:mike:46ZniceinchYorkricezh:ä'));puremageparatonebond:37Z_of_']);000,zh:çtankyardbowlbush:56ZJava30px +|} +%C3%:34ZjeffEXPIcashvisagolfsnowzh:équer.csssickmeatmin.binddellhirepicsrent:36ZHTTP-201fotowolfEND xbox:54ZBODYdick; +} +exit:35Zvarsbeat'});diet999;anne}}sonyguysfuckpipe|- +!002)ndow[1];[]; +Log salt + bangtrimbath){ +00px +});ko:ěfeesad> s:// [];tollplug(){ +{ + .js'200pdualboat.JPG); +}quot); + +'); + +} 201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037201320122011201020092008200720062005200420032002200120001999199819971996199519941993199219911990198919881987198619851984198319821981198019791978197719761975197419731972197119701969196819671966196519641963196219611960195919581957195619551954195319521951195010001024139400009999comomásesteestaperotodohacecadaañobiendĂ­aasĂ­vidacasootroforosolootracualdijosidograntipotemadebealgoquĂ©estonadatrespococasabajotodasinoaguapuesunosantediceluisellamayozonaamorpisoobraclicellodioshoracasiзанаомрарŃтанепоотизнодотожеонихНаеебымыВыŃовывоНообПолиниРФНеМытыОнимдаЗаДаНŃОбтеĐзейнŃммТыŃжŮيأنمامعŮلأŮردياŮىهŮلملŮاŮلهبسالإنهيأيقدهلثمبهلŮليبلايبŮشيامأمنتبيلنحبهممشŮŘ´firstvideolightworldmediawhitecloseblackrightsmallbooksplacemusicfieldorderpointvalueleveltableboardhousegroupworksyearsstatetodaywaterstartstyledeathpowerphonenighterrorinputabouttermstitletoolseventlocaltimeslargewordsgamesshortspacefocusclearmodelblockguideradiosharewomenagainmoneyimagenamesyounglineslatercolorgreenfront&watchforcepricerulesbeginaftervisitissueareasbelowindextotalhourslabelprintpressbuiltlinksspeedstudytradefoundsenseundershownformsrangeaddedstillmovedtakenaboveflashfixedoftenotherviewschecklegalriveritemsquickshapehumanexistgoingmoviethirdbasicpeacestagewidthloginideaswrotepagesusersdrivestorebreaksouthvoicesitesmonthwherebuildwhichearthforumthreesportpartyClicklowerlivesclasslayerentrystoryusagesoundcourtyour 
birthpopuptypesapplyImagebeinguppernoteseveryshowsmeansextramatchtrackknownearlybegansuperpapernorthlearngivennamedendedTermspartsGroupbrandusingwomanfalsereadyaudiotakeswhile.com/livedcasesdailychildgreatjudgethoseunitsneverbroadcoastcoverapplefilescyclesceneplansclickwritequeenpieceemailframeolderphotolimitcachecivilscaleenterthemetheretouchboundroyalaskedwholesincestock namefaithheartemptyofferscopeownedmightalbumthinkbloodarraymajortrustcanonunioncountvalidstoneStyleLoginhappyoccurleft:freshquitefilmsgradeneedsurbanfightbasishoverauto;route.htmlmixedfinalYour slidetopicbrownalonedrawnsplitreachRightdatesmarchquotegoodsLinksdoubtasyncthumballowchiefyouthnovel10px;serveuntilhandsCheckSpacequeryjamesequaltwice0,000Startpanelsongsroundeightshiftworthpostsleadsweeksavoidthesemilesplanesmartalphaplantmarksratesplaysclaimsalestextsstarswrong

thing.org/multiheardPowerstandtokensolid(thisbringshipsstafftriedcallsfullyfactsagentThis //-->adminegyptEvent15px;Emailtrue"crossspentblogsbox">notedleavechinasizesguestrobotheavytrue,sevengrandcrimesignsawaredancephase> + + +name=diegopage swiss--> + +#fff;">Log.com"treatsheet) && 14px;sleepntentfiledja:ăid="cName"worseshots-box-delta +<bears:48Z spendbakershops= "";php">ction13px;brianhellosize=o=%2F joinmaybe, fjsimg" ")[0]MTopBType"newlyDanskczechtrailknowsfaq">zh-cn10); +-1");type=bluestrulydavis.js';> + +form jesus100% menu. + +walesrisksumentddingb-likteachgif" vegasdanskeestishqipsuomisobredesdeentretodospuedeañosestátienehastaotrospartedondenuevohacerformamismomejormundoaquĂ­dĂ­assĂłloayudafechatodastantomenosdatosotrassitiomuchoahoralugarmayorestoshorastenerantesfotosestaspaĂ­snuevasaludforosmedioquienmesespoderchileserávecesdecirjosĂ©estarventagrupohechoellostengoamigocosasnivelgentemismaairesjuliotemashaciafavorjuniolibrepuntobuenoautorabrilbuenatextomarzosaberlistaluegocĂłmoenerojuegoperĂşhaberestoynuncamujervalorfueralibrogustaigualvotoscasosguĂ­apuedosomosavisousteddebennochebuscafaltaeurosseriedichocursoclavecasasleĂłnplazolargoobrasvistaapoyojuntotratavistocrearcampohemoscincocargopisosordenhacenáreadiscopedrocercapuedapapelmenorĂştilclarojorgecalleponertardenadiemarcasigueellassiglocochemotosmadreclaserestoniñoquedapasarbancohijosviajepabloĂ©stevienereinodejarfondocanalnorteletracausatomarmanoslunesautosvillavendopesartipostengamarcollevapadreunidovamoszonasambosbandamariaabusomuchasubirriojavivirgradochicaallĂ­jovendichaestantalessalirsuelopesosfinesllamabuscoĂ©stalleganegroplazahumorpagarjuntadobleislasbolsabañohablaluchaĂreadicenjugarnotasvalleallácargadolorabajoestĂ©gustomentemariofirmacostofichaplatahogarartesleyesaquelmuseobasespocosmitadcielochicomiedoganarsantoetapadebesplayaredessietecortecoreadudasdeseoviejodeseaaguas"domaincommonstatuseventsmastersystemactionbannerremovescrollupdateglobalmediumfilternumberchangeresultpublicscreenchoosen
ormaltravelissuessourcetargetspringmodulemobileswitchphotosborderregionitselfsocialactivecolumnrecordfollowtitle>eitherlengthfamilyfriendlayoutauthorcreatereviewsummerserverplayedplayerexpandpolicyformatdoublepointsseriespersonlivingdesignmonthsforcesuniqueweightpeopleenergynaturesearchfigurehavingcustomoffsetletterwindowsubmitrendergroupsuploadhealthmethodvideosschoolfutureshadowdebatevaluesObjectothersrightsleaguechromesimplenoticesharedendingseasonreportonlinesquarebuttonimagesenablemovinglatestwinterFranceperiodstrongrepeatLondondetailformeddemandsecurepassedtoggleplacesdevicestaticcitiesstreamyellowattackstreetflighthiddeninfo">openedusefulvalleycausesleadersecretseconddamagesportsexceptratingsignedthingseffectfieldsstatesofficevisualeditorvolumeReportmuseummoviesparentaccessmostlymother" id="marketgroundchancesurveybeforesymbolmomentspeechmotioninsidematterCenterobjectexistsmiddleEuropegrowthlegacymannerenoughcareeransweroriginportalclientselectrandomclosedtopicscomingfatheroptionsimplyraisedescapechosenchurchdefinereasoncorneroutputmemoryiframepolicemodelsNumberduringoffersstyleskilledlistedcalledsilvermargindeletebetterbrowselimitsGlobalsinglewidgetcenterbudgetnowrapcreditclaimsenginesafetychoicespirit-stylespreadmakingneededrussiapleaseextentScriptbrokenallowschargedividefactormember-basedtheoryconfigaroundworkedhelpedChurchimpactshouldalwayslogo" bottomlist">){var prefixorangeHeader.push(couplegardenbridgelaunchReviewtakingvisionlittledatingButtonbeautythemesforgotSearchanchoralmostloadedChangereturnstringreloadMobileincomesupplySourceordersviewed courseAbout islandPhilipawardshandleimportOfficeregardskillsnationSportsdegreeweekly (e.g.behinddoctorloggedunitedbeyond-scaleacceptservedmarineFootercamera +_form"leavesstress" /> +.gif" onloadloaderOxfordsistersurvivlistenfemaleDesignsize="appealtext">levelsthankshigherforcedanimalanyoneAfricaagreedrecentPeople
wonderpricesturned|| {};main">inlinesundaywrap">failedcensusminutebeaconquotes150px|estateremoteemail"linkedright;signalformal1.htmlsignupprincefloat:.png" forum.AccesspaperssoundsextendHeightsliderUTF-8"& Before. WithstudioownersmanageprofitjQueryannualparamsboughtfamousgooglelongeri++) {israelsayingdecidehome">headerensurebranchpiecesblock;statedtop">boston.test(avatartested_countforumsschemaindex,filledsharesreaderalert(appearSubmitline">body"> +* TheThoughseeingjerseyNews +System DavidcancertablesprovedApril reallydriveritem">more">boardscolorscampusfirst || [];media.guitarfinishwidth:showedOther .php" assumelayerswilsonstoresreliefswedenCustomeasily your String + +Whiltaylorclear:resortfrenchthough") + "buyingbrandsMembername">oppingsector5px;">vspacepostermajor coffeemartinmaturehappenkansaslink">Images=falsewhile hspace0& + +In powerPolski-colorjordanBottomStart -count2.htmlnews">01.jpgOnline-rightmillerseniorISBN 00,000 guidesvalue)ectionrepair.xml" rights.html-blockregExp:hoverwithinvirginphones using + var >'); + + +bahasabrasilgalegomagyarpolskisrpskiردŮ中文简体çąé«”信ćŻä¸­ĺ›˝ć‘们一个公司管ç†č®şĺť›ĺŹŻä»ĄćśŤĺŠˇć—¶é—´ä¸Şäşşäş§ĺ“自己äĽä¸šćźĄçś‹ĺ·Ąä˝śč”系没有网站所有评论中ĺżć–‡ç« ç”¨ć·é¦–页作者技术问é˘ç›¸ĺ…łä¸‹č˝˝ćśç´˘ä˝żç”¨č˝Żä»¶ĺś¨çşżä¸»é˘čµ„料视频回复注册网络收藏内容推čŤĺ¸‚ĺśşć¶ćŻç©şé—´ĺŹ‘ĺ¸ä»€äąĺĄ˝ĺŹ‹ç”źć´»ĺ›ľç‰‡ĺŹ‘ĺ±•ĺ¦‚ćžść‰‹ćśşć–°é—»ćś€ć–°ć–ąĺĽŹĺŚ—äş¬ćŹäľ›ĺ…łäşŽć›´ĺ¤ščż™ä¸Şçł»ç»źçźĄé“游ćŹĺążĺ‘Šĺ…¶ä»–发表安全第一会ĺ‘进行点击ç‰ćťç”µĺ­ä¸–界设计免费教育加入活动他们商ĺ“博客现在上海如何已经留言详细社区登录本站需č¦ä»·ć Ľć”ŻćŚĺ›˝é™…链接国家建设朋友é…读法律位置经济选择这样当前ĺ†ç±»ćŽ’čˇŚĺ› ä¸şäş¤ć“最ĺŽéźłäąä¸Ťč˝é€ščż‡čˇŚä¸šç§‘技可č˝č®ľĺ¤‡ĺ作大家社会研究专业全é¨éˇąç›®čż™é‡ŚčżćŻĺĽ€ĺ§‹ć…况电脑文件ĺ“牌帮助文化资ćşĺ¤§ĺ­¦ĺ­¦äą ĺś°ĺť€ćµŹč§ćŠ•čµ„ĺ·Ąç¨‹č¦ć±‚怎äąć—¶ĺ€™ĺŠźč˝ä¸»č¦ç›®ĺ‰Ťčµ„讯城市方法电影招č声ćŽä»»ä˝•ĺĄĺş·ć•°ćŤ®çľŽĺ›˝ć±˝č˝¦ä»‹ç»Ťä˝†ćŻäş¤ćµç”źäş§ć‰€ä»Ąç”µčŻťćľç¤şä¸€äş›ĺŤ•位人ĺ‘ĺ†ćžĺś°ĺ›ľć—…游工具学生系ĺ—网友帖ĺ­ĺ݆ç é˘‘é“控ĺ¶ĺś°ĺŚşĺźşćś¬ĺ…¨ĺ›˝ç˝‘ä¸Šé‡Ťč¦ç¬¬äşŚĺ–ść¬˘čż›ĺ…ĄĺŹ‹ć…这些č€čŻ•ĺŹ‘çŽ°ĺźąč®­ä»Ąä¸Šć”żĺşść为环ĺ˘é¦™ć¸ŻĺŚć—¶ĺ¨±äąĺŹ‘é€ä¸€ĺ®šĺĽ€ĺŹ‘ä˝śĺ“标准欢迎解决地方一下以及责任ć–者客ć·ä»Łčˇ¨ç§Żĺ†ĺĄłäşşć•°ç 
é”€ĺ”®ĺ‡şçŽ°ç¦»çşżĺş”ç”¨ĺ—表不ĺŚçĽ–辑统计查询不č¦ćś‰ĺ…łćśşćž„ĺľĺ¤šć’­ć”ľç»„织政策直接č˝ĺŠ›ćťĄćşć™‚é–“çś‹ĺ°ç­é—¨ĺ…łé”®ä¸“区非常英语百度希望美女比čľçźĄčŻ†č§„ĺ®šĺ»şč®®é¨é—¨ć„Źč§ç˛ľĺ˝©ć—Ąćś¬ćŹé«ĺŹ‘č¨€ć–ąéť˘ĺźşé‡‘ĺ¤„ç†ćťé™ĺ˝±ç‰‡é“¶čˇŚčżćś‰ĺ†äş«ç‰©ĺ“经čĄć·»ĺŠ ä¸“ĺ®¶čż™ç§ŤčŻťé˘čµ·ćťĄä¸šĺŠˇĺ…¬ĺ‘Šč®°ĺ˝•ç®€ä»‹č´¨é‡Źç”·äşşĺ˝±ĺ“ŤĺĽ•ç”¨ćŠĄĺ‘Šé¨ĺ†ĺż«é€źĺ’¨čŻ˘ć—¶ĺ°šćł¨ć„Źç”łčŻ·ĺ­¦ć ˇĺş”čŻĄĺŽ†ĺŹ˛ĺŹŞćŻčż”回购买ĺŤç§°ä¸şäş†ć功说ćŽäľ›ĺş”ĺ­©ĺ­ä¸“é˘ç¨‹ĺşŹä¸€č¬ćśĺ“ˇĺŹŞćś‰ĺ…¶ĺ®äżťćŠ¤č€Śä¸”ä»Šĺ¤©çŞ—ĺŹŁĺŠ¨ć€çжć€ç‰ąĺ«č®¤ä¸şĺż…须更新小说ć‘們作为媒体包括那äąä¸€ć ·ĺ›˝ĺ†…ćŻĺ¦ć ąćŤ®ç”µč§†ĺ­¦é™˘ĺ…·ćś‰čż‡ç¨‹ç”±äşŽäşşć‰Ťĺ‡şćťĄä¸Ťčż‡ć­Łĺś¨ćŽćźć•…事关系标é˘ĺ•†ĺŠˇčľ“ĺ…Ąä¸€ç›´ĺźşçˇ€ć•™ĺ­¦äş†č§Łĺ»şç­‘ç»“ćžśĺ…¨ç通知计ĺ’对于艺术相册发生真的建立等级类型经验实现ĺ¶ä˝śćťĄč‡Şć ‡ç­ľä»Ąä¸‹ĺŽźĺ›ć— ćł•其中個人一ĺ‡ćŚ‡ĺŤ—ĺ…łé—­é›†ĺ›˘ç¬¬ä¸‰ĺ…łćł¨ĺ› ć­¤ç…§ç‰‡ć·±ĺśłĺ•†ä¸šĺążĺ·žć—Ąćśźé«çş§ćś€čż‘综ĺ表示专辑行为交通评价觉得精华家庭完ć感觉安装得ĺ°é‚®ä»¶ĺ¶ĺş¦éŁźĺ“虽然转载报价记者方ćˇčˇŚć”żäşşć°‘用ĺ“东西ćŹĺ‡şé…’ĺş—ç„¶ĺŽä»ć¬ľç­ç‚ąä»Ąĺ‰Ťĺ®Śĺ…¨ĺŹ‘ĺ¸–č®ľç˝®é˘†ĺŻĽĺ·Ąä¸šĺŚ»é™˘çś‹çś‹ç»Źĺ…¸ĺŽźĺ› ĺąłĺŹ°ĺ„种增加ćťć–™ć–°ĺ˘žäą‹ĺŽčŚä¸šć•果今年论文ć‘国告诉ç‰ä¸»äż®ć”ąĺŹ‚ä¸Žć‰“ĺŤ°ĺż«äąćśşć˘°č§‚点ĺ­ĺś¨ç˛ľçĄžčŽ·ĺľ—ĺ©ç”¨ç»§ç»­ä˝ ä»¬čż™äąć¨ˇĺĽŹčݭ荀č˝ĺ¤źé›…虎操作风格一起科学体育短信条件治疗čżĺŠ¨äş§ä¸šäĽšč®®ĺŻĽčŞĺ…生č”盟可ćŻĺ•ŹéˇŚç»“构作用č°ćźĄčł‡ć–™č‡ŞĺŠ¨č´źč´Łĺ†śä¸šč®żé—®ĺ®žć–˝ćŽĄĺŹ—č®¨č®şé‚Łä¸ŞĺŹŤé¦ĺŠ ĺĽşĺĄłć€§čŚĺ›´ćśŤĺ‹™äĽ‘闲今日客服觀看参加的话一点保čŻĺ›ľäą¦ćś‰ć•测试移动才č˝ĺ†łĺ®šč‚ˇçĄ¨ä¸Ťć–­éś€ć±‚不得办法之间采用čĄé”€ćŠ•čŻ‰ç›®ć ‡ç±ć…摄影有些複製文学机会数字装修购物农村全面精ĺ“其实事ć…ć°´ĺąłćŹç¤şä¸Šĺ¸‚谢谢普通教ĺ¸ä¸ŠäĽ ç±»ĺ«ć­Ść›˛ć‹Ąćś‰ĺ›ć–°é…Ťä»¶ĺŹŞč¦ć—¶ä»Łčł‡č¨Ščľľĺ°äşşç”źč®˘é…č€ĺ¸ĺ±•示ĺżç†č´´ĺ­ç¶˛ç«™ä¸»éˇŚč‡Şç„¶çş§ĺ«ç®€ĺŤ•改革那些来说打开代ç ĺ é™¤čŻĺ¸čŠ‚ç›®é‡Ťç‚ąć¬ˇć•¸ĺ¤šĺ°‘č§„ĺ’资金找ĺ°ä»ĄĺŽĺ¤§ĺ…¨ä¸»éˇµćś€ä˝łĺ›žç­”天下保障现代检查投票小时沒有正常甚至代ç†ç›®ĺ˝•公开复ĺ¶é‡‘融幸福ç‰ćś¬ĺ˝˘ć准备行ć…回ĺ°ć€ťćłć€Žć ·ĺŤŹč®®č®¤čŻćś€ĺĄ˝äş§ç”źćŚ‰ç…§ćśŤčŁ…ĺążä¸śĺŠ¨ćĽ«é‡‡č´­ć–°ć‰‹ç»„ĺ›ľéť˘ćťżĺŹ‚č€ć”żć˛»ĺ®ąć“天地努力人们升级速度人物č°ć•´ćµčˇŚé€ ć文字韩国贸ć“开展相關表现影视如此美容大小报é“条款ĺżć…许多法规家居书店连接立即举报技巧奥čżç™»ĺ…Ąä»ĄćťĄç†č®şäş‹ä»¶č‡Şç”±ä¸­ĺŤŽĺŠžĺ…¬ĺ¦ĺ¦çśźć­Łä¸Ťé”™ĺ…¨ć–‡ĺĺŚä»·ĺ€Ľĺ«äşşç›‘督具体世纪团éźĺ›ä¸šć‰żć‹…增长有人保ćŚĺ•†ĺ®¶ç»´äż®ĺŹ°ćąľĺ·¦ĺŹłč‚ˇä»˝ç­”ćˇĺ®žé™…电信经ç†ç”źĺ‘˝ĺ®ŁäĽ 
ä»»ĺŠˇć­ŁĺĽŹç‰ąč‰˛ä¸‹ćťĄĺŤŹäĽšĺŹŞč˝ĺ˝“然重新內容指导čżčˇŚć—Ąĺż—賣家超过土地浙江支ä»ćŽ¨ĺ‡şç«™é•żćť­ĺ·žć‰§čˇŚĺ¶é€ äą‹ä¸€ćŽ¨ĺążçŽ°ĺśşćŹŹčż°ĺŹĺŚ–äĽ ç»źć­Ść‰‹äżťé™©čŻľç¨‹ĺŚ»ç–—ç»Źčż‡čż‡ĺŽ»äą‹ĺ‰Ťć”¶ĺ…Ąĺą´ĺş¦ćť‚ĺż—çľŽä¸˝ćś€é«ç™»é™†ćśŞćťĄĺŠ ĺ·Ąĺ…Ťč´Łć•™ç¨‹ç‰ĺť—身体重庆出售ć本形式土豆出ĺąä¸ść–ąé‚®ç®±ĺŤ—京求čŚĺŹ–ĺľ—čŚä˝Ťç›¸äżˇéˇµéť˘ĺ†é’źç˝‘页确定图例网址积ćžé”™čŻŻç›®çš„ĺ®ťč´ťćśşĺ…łéŁŽé™©ćŽćťç—…毒宠物除了評論疾病及时求购站点儿童每天中央认识每个天津字体台çŁç»´ćŠ¤ćś¬éˇµä¸Şć€§ĺ®ć–ąĺ¸¸č§ç›¸ćśşć略应当律ĺ¸ć–ąäľżć ˇĺ›­č‚ˇĺ¸‚ćżĺ±‹ć Źç›®ĺ‘工导致çŞç„¶é“具本网结ĺ档ćˇĺŠłĺŠ¨ĺŹ¦ĺ¤–çľŽĺ…引起改ĺŹç¬¬ĺ››äĽšč®ˇčŞŞćŽéšç§ĺ®ťĺ®ťč§„čŚć¶č´ąĺ…±ĺŚĺżč®°ä˝“系带来ĺŤĺ­—發表开放加盟受ĺ°äşŚć‰‹ĺ¤§é‡Źć人数量共享区域女孩原ĺ™ć‰€ĺś¨ç»“束通信超级配置当时äĽç§€ć€§ć„źćżäş§éŠć˛ĺ‡şĺŹŁćŹäş¤ĺ°±ä¸šäżťĺĄç¨‹ĺş¦ĺŹ‚ć•°äş‹ä¸šć•´ä¸Şĺ±±ä¸ść…感特殊ĺ†éˇžćśĺ°‹ĺ±žäşŽé—¨ć·č´˘ĺŠˇĺŁ°éźłĺŹŠĺ…¶č´˘ç»ŹĺťšćŚĺą˛é¨ćç«‹ĺ©ç›Šč€č™‘ćé˝ĺŚ…čŁ…ç”¨ć¶ćݔ赛㖇ćŽć‹›ĺ•†ĺ®Ść•´çśźćŻçśĽçť›äĽ™äĽ´ĺ¨ćś›é˘†ĺźźĺŤ«ç”źäĽć č«–壇公共良好充ĺ†ç¬¦ĺ附件特点不可英文资产根本ćŽćľĺŻ†ç˘Ľĺ…¬äĽ—ć°‘ć—Źć›´ĺŠ äş«ĺŹ—ĺŚĺ­¦ĺŻĺЍ适ĺ原来问答本文美食绿色稳定ç»äşŽç”źç‰©äľ›ć±‚ćśç‹ĺŠ›é‡Źä¸Ąé‡Ťć°¸čżśĺ†™çśźćś‰é™ç«žäş‰ĺŻąč±ˇč´ąç”¨ä¸ŤĺĄ˝ç»ťĺŻąĺŤĺ†äżčż›ç‚ąčŻ„ĺ˝±éźłäĽĺŠżä¸Ťĺ°‘ć¬ŁčµŹĺą¶ä¸”ćś‰ç‚ąć–ąĺ‘全新信用设施形象资格çŞç ´éšŹçť€é‡Ťĺ¤§äşŽćŻćŻ•ä¸šć™şč˝ĺŚ–ĺ·Ąĺ®ŚçľŽĺ•†ĺźŽç»źä¸€ĺ‡şç‰ć‰“造產ĺ“概况用于保留因素中國ĺ­ĺ‚¨č´´ĺ›ľćś€ć„›é•żćśźĺŹŁä»·ç†č´˘ĺźşĺś°ĺ®‰ćŽ’ć­¦ć±‰é‡Śéť˘ĺ›ĺ»şĺ¤©ç©şé¦–ĺ…完善驱动下面不再诚信意义éłĺ…‰č‹±ĺ›˝ćĽ‚亮军事玩家群众农民即可ĺŤç¨±ĺ®¶ĺ…·ĺŠ¨ç”»ćłĺ°ćł¨ćŽĺ°Źĺ­¦ć€§č˝č€ç ”硬件观看清楚ćžç¬‘首é 
é»„金适用江苏真实主管é¶ć®µč¨»ĺ†Šçż»čŻ‘ćťĺ©ĺšĺĄ˝äĽĽäąŽé€šč®Żć–˝ĺ·Ąç‹€ć…‹äąźč®¸çŽŻäżťĺźąĺ…»ć¦‚ĺżµĺ¤§ĺž‹ćśşçĄ¨ç†č§ŁĺŚżĺŤcuandoenviarmadridbuscariniciotiempoporquecuentaestadopuedenjuegoscontraestánnombretienenperfilmaneraamigosciudadcentroaunquepuedesdentroprimerpreciosegĂşnbuenosvolverpuntossemanahabĂ­aagostonuevosunidoscarlosequiponiñosmuchosalgunacorreoimagenpartirarribamarĂ­ahombreempleoverdadcambiomuchasfueronpasadolĂ­neaparecenuevascursosestabaquierolibroscuantoaccesomiguelvarioscuatrotienesgruposseráneuropamediosfrenteacercademásofertacochesmodeloitalialetrasalgĂşncompracualesexistecuerposiendoprensallegarviajesdineromurciapodrápuestodiariopuebloquieremanuelpropiocrisisciertoseguromuertefuentecerrargrandeefectopartesmedidapropiaofrecetierrae-mailvariasformasfuturoobjetoseguirriesgonormasmismosĂşnicocaminositiosrazĂłndebidopruebatoledotenĂ­ajesĂşsesperococinaorigentiendacientocádizhablarserĂ­alatinafuerzaestiloguerraentrarĂ©xitolĂłpezagendavĂ­deoevitarpaginametrosjavierpadresfácilcabezaáreassalidaenvĂ­ojapĂłnabusosbienestextosllevarpuedanfuertecomĂşnclaseshumanotenidobilbaounidadestáseditarcreadoдлячтокакилиэтовŃеегопритакещеŃжеКакбезбылониВŃеподЭтотомчемнетлетразонагдемнеДляПринаŃнихтемктогодвоттамСШĐмаяЧтоваŃвамемŃТакдванамэтиэтŃВамтехпротŃтнаддняВоттринейВаŃнимŃамтотрŃбОнимирнееОООлицэтаОнанемдоммойдвеоноŃŃдकेहŕĄŕ¤•ीसेकाकोऔरपरनेएककिभीइसकरतोहोआपहीयहयातकथाjagranआजजोअबदोगŕ¤ŕ¤śŕ¤ľŕ¤—एहमइनवहयेथेथीŕ¤ŕ¤°ŕ¤śŕ¤¬ŕ¤¦ŕĄ€ŕ¤•ŕ¤ŕ¤śŕĄ€ŕ¤µŕĄ‡ŕ¤¨ŕ¤ŕ¤¨ŕ¤Źŕ¤ąŕ¤°ŕ¤‰ŕ¤¸ŕ¤®ŕĄ‡ŕ¤•मवोलेसबमŕ¤ŕ¤¦ŕĄ‡ŕ¤“रआमबसभरबनचलमनआगसीलीعلىإلىهذاآخرعددالىهذهصŮرغيرŮانŮلابينعرضذلŮهنايŮمقالعليانالŮنحتىقبلŮحةاخرŮقطعبدرŮنإذاŮمااحدإلاŮيهبعضŮŮŠŮبحثŮمنŮهŮأناجدالهاسلمعندليسعبرصلىمنذبهاأنهمثلŮنتالاحيثمصرشرححŮŮ„ŮŮياذالŮلمرةانتالŮأبŮخاصأنتانهاليعضŮŮقدابنخيربنتلŮمشاءŮهيابŮقصصŮمارقمأحدنحنعدمرأياحةŮتبدŮنيجبمنهتحتجهةسنةيتمŮرةغزةنŮسبيتللهلناتلŮقلبلماعنهأŮلشيءنŮرأماŮŮŠŮبŮلذاترتببأنهمسانŮبيعŮقدحسنلهمشعرأهلشهرقطرطلبprofileservicedefaulthimselfdetailscontentsupportstartedmessagesuccessfashioncountryaccountcreatedstoriesresultsrunnin
gprocesswritingobjectsvisiblewelcomearticleunknownnetworkcompanydynamicbrowserprivacyproblemServicerespectdisplayrequestreservewebsitehistoryfriendsoptionsworkingversionmillionchannelwindow.addressvisitedweathercorrectproductedirectforwardyou canremovedsubjectcontrolarchivecurrentreadinglibrarylimitedmanagerfurthersummarymachineminutesprivatecontextprogramsocietynumberswrittenenabledtriggersourcesloadingelementpartnerfinallyperfectmeaningsystemskeepingculture",journalprojectsurfaces"expiresreviewsbalanceEnglishContentthroughPlease opinioncontactaverageprimaryvillageSpanishgallerydeclinemeetingmissionpopularqualitymeasuregeneralspeciessessionsectionwriterscounterinitialreportsfiguresmembersholdingdisputeearlierexpressdigitalpictureAnothermarriedtrafficleadingchangedcentralvictoryimages/reasonsstudiesfeaturelistingmust beschoolsVersionusuallyepisodeplayinggrowingobviousoverlaypresentactions</ul> +wrapperalreadycertainrealitystorageanotherdesktopofferedpatternunusualDigitalcapitalWebsitefailureconnectreducedAndroiddecadesregular & animalsreleaseAutomatgettingmethodsnothingPopularcaptionletterscapturesciencelicensechangesEngland=1&History = new CentralupdatedSpecialNetworkrequirecommentwarningCollegetoolbarremainsbecauseelectedDeutschfinanceworkersquicklybetweenexactlysettingdiseaseSocietyweaponsexhibit<!--Controlclassescoveredoutlineattacksdevices(windowpurposetitle="Mobile killingshowingItaliandroppedheavilyeffects-1']); +confirmCurrentadvancesharingopeningdrawingbillionorderedGermanyrelated</form>includewhetherdefinedSciencecatalogArticlebuttonslargestuniformjourneysidebarChicagoholidayGeneralpassage,"animatefeelingarrivedpassingnaturalroughly. 
+ +The but notdensityBritainChineselack oftributeIreland" data-factorsreceivethat isLibraryhusbandin factaffairsCharlesradicalbroughtfindinglanding:lang="return leadersplannedpremiumpackageAmericaEdition]"Messageneed tovalue="complexlookingstationbelievesmaller-mobilerecordswant tokind ofFirefoxyou aresimilarstudiedmaximumheadingrapidlyclimatekingdomemergedamountsfoundedpioneerformuladynastyhow to SupportrevenueeconomyResultsbrothersoldierlargelycalling."AccountEdward segmentRobert effortsPacificlearnedup withheight:we haveAngelesnations_searchappliedacquiremassivegranted: falsetreatedbiggestbenefitdrivingStudiesminimumperhapsmorningsellingis usedreversevariant role="missingachievepromotestudentsomeoneextremerestorebottom:evolvedall thesitemapenglishway to AugustsymbolsCompanymattersmusicalagainstserving})(); +paymenttroubleconceptcompareparentsplayersregionsmonitor ''The winningexploreadaptedGalleryproduceabilityenhancecareers). The collectSearch ancientexistedfooter handlerprintedconsoleEasternexportswindowsChannelillegalneutralsuggest_headersigning.html">settledwesterncausing-webkitclaimedJusticechaptervictimsThomas mozillapromisepartieseditionoutside:false,hundredOlympic_buttonauthorsreachedchronicdemandssecondsprotectadoptedprepareneithergreatlygreateroverallimprovecommandspecialsearch.worshipfundingthoughthighestinsteadutilityquarterCulturetestingclearlyexposedBrowserliberal} catchProjectexamplehide();FloridaanswersallowedEmperordefenseseriousfreedomSeveral-buttonFurtherout of != nulltrainedDenmarkvoid(0)/all.jspreventRequestStephen + +When observe</h2> +Modern provide" alt="borders. 
+ +For + +Many artistspoweredperformfictiontype ofmedicalticketsopposedCouncilwitnessjusticeGeorge Belgium...</a>twitternotablywaitingwarfare Other rankingphrasesmentionsurvivescholar</p> + Countryignoredloss ofjust asGeorgiastrange<head><stopped1']); +islandsnotableborder:list ofcarried100,000</h3> + severalbecomesselect wedding00.htmlmonarchoff theteacherhighly biologylife ofor evenrise of»plusonehunting(thoughDouglasjoiningcirclesFor theAncientVietnamvehiclesuch ascrystalvalue =Windowsenjoyeda smallassumed<a id="foreign All rihow theDisplayretiredhoweverhidden;battlesseekingcabinetwas notlook atconductget theJanuaryhappensturninga:hoverOnline French lackingtypicalextractenemieseven ifgeneratdecidedare not/searchbeliefs-image:locatedstatic.login">convertviolententeredfirst">circuitFinlandchemistshe was10px;">as suchdivided</span>will beline ofa greatmystery/index.fallingdue to railwaycollegemonsterdescentit withnuclearJewish protestBritishflowerspredictreformsbutton who waslectureinstantsuicidegenericperiodsmarketsSocial fishingcombinegraphicwinners<br /><by the NaturalPrivacycookiesoutcomeresolveSwedishbrieflyPersianso muchCenturydepictscolumnshousingscriptsnext tobearingmappingrevisedjQuery(-width:title">tooltipSectiondesignsTurkishyounger.match(})(); + +burningoperatedegreessource=Richardcloselyplasticentries</tr> +color:#ul id="possessrollingphysicsfailingexecutecontestlink toDefault<br /> +: true,chartertourismclassicproceedexplain</h1> +online.?xml vehelpingdiamonduse theairlineend -->).attr(readershosting#ffffffrealizeVincentsignals src="/ProductdespitediversetellingPublic held inJoseph theatreaffects<style>a largedoesn'tlater, ElementfaviconcreatorHungaryAirportsee theso thatMichaelSystemsPrograms, and width=e"tradingleft"> +personsGolden Affairsgrammarformingdestroyidea ofcase ofoldest this is.src = cartoonregistrCommonsMuslimsWhat isin manymarkingrevealsIndeed,equally/show_aoutdoorescape(Austriageneticsystem,In the sittingHe alsoIslandsAcademy + 
<!--Daniel bindingblock">imposedutilizeAbraham(except{width:putting).html(|| []; +DATA[ *kitchenmountedactual dialectmainly _blank'installexpertsif(typeIt also© ">Termsborn inOptionseasterntalkingconcerngained ongoingjustifycriticsfactoryits ownassaultinvitedlastinghis ownhref="/" rel="developconcertdiagramdollarsclusterphp?id=alcohol);})();using a><span>vesselsrevivalAddressamateurandroidallegedillnesswalkingcentersqualifymatchesunifiedextinctDefensedied in + <!-- customslinkingLittle Book ofeveningmin.js?are thekontakttoday's.html" target=wearingAll Rig; +})();raising Also, crucialabout">declare--> +<scfirefoxas muchappliesindex, s, but type = + +<!--towardsRecordsPrivateForeignPremierchoicesVirtualreturnsCommentPoweredinline;povertychamberLiving volumesAnthonylogin" RelatedEconomyreachescuttinggravitylife inChapter-shadowNotable</td> + returnstadiumwidgetsvaryingtravelsheld bywho arework infacultyangularwho hadairporttown of + +Some 'click'chargeskeywordit willcity of(this);Andrew unique checkedor more300px; return;rsion="pluginswithin herselfStationFederalventurepublishsent totensionactresscome tofingersDuke ofpeople,exploitwhat isharmonya major":"httpin his menu"> +monthlyofficercouncilgainingeven inSummarydate ofloyaltyfitnessand wasemperorsupremeSecond hearingRussianlongestAlbertalateralset of small">.appenddo withfederalbank ofbeneathDespiteCapitalgrounds), and percentit fromclosingcontainInsteadfifteenas well.yahoo.respondfighterobscurereflectorganic= Math.editingonline paddinga wholeonerroryear ofend of barrierwhen itheader home ofresumedrenamedstrong>heatingretainscloudfrway of March 1knowingin partBetweenlessonsclosestvirtuallinks">crossedEND -->famous awardedLicenseHealth fairly wealthyminimalAfricancompetelabel">singingfarmersBrasil)discussreplaceGregoryfont copursuedappearsmake uproundedboth ofblockedsaw theofficescoloursif(docuwhen heenforcepush(fuAugust UTF-8">Fantasyin mostinjuredUsuallyfarmingclosureobject defenceuse of Medical<body> +evidentbe 
usedkeyCodesixteenIslamic#000000entire widely active (typeofone cancolor =speakerextendsPhysicsterrain<tbody>funeralviewingmiddle cricketprophetshifteddoctorsRussell targetcompactalgebrasocial-bulk ofman and</td> + he left).val()false);logicalbankinghome tonaming Arizonacredits); +}); +founderin turnCollinsbefore But thechargedTitle">CaptainspelledgoddessTag -->Adding:but wasRecent patientback in=false&Lincolnwe knowCounterJudaismscript altered']); + has theunclearEvent',both innot all + +<!-- placinghard to centersort ofclientsstreetsBernardassertstend tofantasydown inharbourFreedomjewelry/about..searchlegendsis mademodern only ononly toimage" linear painterand notrarely acronymdelivershorter00&as manywidth="/* <![Ctitle =of the lowest picked escapeduses ofpeoples PublicMatthewtacticsdamagedway forlaws ofeasy to windowstrong simple}catch(seventhinfoboxwent topaintedcitizenI don'tretreat. Some ww."); +bombingmailto:made in. Many carries||{};wiwork ofsynonymdefeatsfavoredopticalpageTraunless sendingleft"><comScorAll thejQuery.touristClassicfalse" Wilhelmsuburbsgenuinebishops.split(global followsbody ofnominalContactsecularleft tochiefly-hidden-banner</li> + +. When in bothdismissExplorealways via thespañolwelfareruling arrangecaptainhis sonrule ofhe tookitself,=0&(calledsamplesto makecom/pagMartin Kennedyacceptsfull ofhandledBesides//--></able totargetsessencehim to its by common.mineralto takeways tos.org/ladvisedpenaltysimple:if theyLettersa shortHerbertstrikes groups.lengthflightsoverlapslowly lesser social </p> + it intoranked rate oful> + attemptpair ofmake itKontaktAntoniohaving ratings activestreamstrapped").css(hostilelead tolittle groups,Picture--> + + rows=" objectinverse<footerCustomV><\/scrsolvingChamberslaverywoundedwhereas!= 'undfor allpartly -right:Arabianbacked centuryunit ofmobile-Europe,is homerisk ofdesiredClintoncost ofage of become none ofp"Middle ead')[0Criticsstudios>©group">assemblmaking pressedwidget.ps:" ? 
rebuiltby someFormer editorsdelayedCanonichad thepushingclass="but arepartialBabylonbottom carrierCommandits useAs withcoursesa thirddenotesalso inHouston20px;">accuseddouble goal ofFamous ).bind(priests Onlinein Julyst + "gconsultdecimalhelpfulrevivedis veryr'+'iptlosing femalesis alsostringsdays ofarrivalfuture <objectforcingString(" /> + here isencoded. The balloondone by/commonbgcolorlaw of Indianaavoidedbut the2px 3pxjquery.after apolicy.men andfooter-= true;for usescreen.Indian image =family,http://  driverseternalsame asnoticedviewers})(); + is moreseasonsformer the newis justconsent Searchwas thewhy theshippedbr><br>width: height=made ofcuisineis thata very Admiral fixed;normal MissionPress, ontariocharsettry to invaded="true"spacingis mosta more totallyfall of}); + immensetime inset outsatisfyto finddown tolot of Playersin Junequantumnot thetime todistantFinnishsrc = (single help ofGerman law andlabeledforestscookingspace">header-well asStanleybridges/globalCroatia About [0]; + it, andgroupedbeing a){throwhe madelighterethicalFFFFFF"bottom"like a employslive inas seenprintermost ofub-linkrejectsand useimage">succeedfeedingNuclearinformato helpWomen'sNeitherMexicanprotein<table by manyhealthylawsuitdevised.push({sellerssimply Through.cookie Image(older">us.js"> Since universlarger open to!-- endlies in']); + marketwho is ("DOMComanagedone fortypeof Kingdomprofitsproposeto showcenter;made itdressedwere inmixtureprecisearisingsrc = 'make a securedBaptistvoting + var March 2grew upClimate.removeskilledway the</head>face ofacting right">to workreduceshas haderectedshow();action=book ofan area== "htt<header +<html>conformfacing cookie.rely onhosted .customhe wentbut forspread Family a meansout theforums.footage">MobilClements" id="as highintense--><!--female is seenimpliedset thea stateand hisfastestbesidesbutton_bounded"><img Infoboxevents,a youngand areNative cheaperTimeoutand hasengineswon the(mostlyright: find a -bottomPrince area ofmore 
ofsearch_nature,legallyperiod,land ofor withinducedprovingmissilelocallyAgainstthe wayk"px;"> +pushed abandonnumeralCertainIn thismore inor somename isand, incrownedISBN 0-createsOctobermay notcenter late inDefenceenactedwish tobroadlycoolingonload=it. TherecoverMembersheight assumes<html> +people.in one =windowfooter_a good reklamaothers,to this_cookiepanel">London,definescrushedbaptismcoastalstatus title" move tolost inbetter impliesrivalryservers SystemPerhapses and contendflowinglasted rise inGenesisview ofrising seem tobut in backinghe willgiven agiving cities.flow of Later all butHighwayonly bysign ofhe doesdiffersbattery&lasinglesthreatsintegertake onrefusedcalled =US&See thenativesby thissystem.head of:hover,lesbiansurnameand allcommon/header__paramsHarvard/pixel.removalso longrole ofjointlyskyscraUnicodebr /> +AtlantanucleusCounty,purely count">easily build aonclicka givenpointerh"events else { +ditionsnow the, with man whoorg/Webone andcavalryHe diedseattle00,000 {windowhave toif(windand itssolely m"renewedDetroitamongsteither them inSenatorUs</a><King ofFrancis-produche usedart andhim andused byscoringat hometo haverelatesibilityfactionBuffalolink"><what hefree toCity ofcome insectorscountedone daynervoussquare };if(goin whatimg" alis onlysearch/tuesdaylooselySolomonsexual - <a hrmedium"DO NOT France,with a war andsecond take a > + + +market.highwaydone inctivity"last">obligedrise to"undefimade to Early praisedin its for hisathleteJupiterYahoo! termed so manyreally s. 
The a woman?value=direct right" bicycleacing="day andstatingRather,higher Office are nowtimes, when a pay foron this-link">;borderaround annual the Newput the.com" takin toa brief(in thegroups.; widthenzymessimple in late{returntherapya pointbanninginks"> +();" rea place\u003Caabout atr> + ccount gives a<SCRIPTRailwaythemes/toolboxById("xhumans,watchesin some if (wicoming formats Under but hashanded made bythan infear ofdenoted/iframeleft involtagein eacha"base ofIn manyundergoregimesaction </p> +<ustomVa;></importsor thatmostly &re size="</a></ha classpassiveHost = WhetherfertileVarious=[];(fucameras/></td>acts asIn some> + +<!organis <br />BeijingcatalĂ deutscheuropeueuskaragaeilgesvenskaespañamensajeusuariotrabajomĂ©xicopáginasiempresistemaoctubreduranteañadirempresamomentonuestroprimeratravĂ©sgraciasnuestraprocesoestadoscalidadpersonanĂşmeroacuerdomĂşsicamiembroofertasalgunospaĂ­sesejemploderechoademásprivadoagregarenlacesposiblehotelessevillaprimeroĂşltimoeventosarchivoculturamujeresentradaanuncioembargomercadograndesestudiomejoresfebrerodiseñoturismocĂłdigoportadaespaciofamiliaantoniopermiteguardaralgunaspreciosalguiensentidovisitastĂ­tuloconocersegundoconsejofranciaminutossegundatenemosefectosmálagasesiĂłnrevistagranadacompraringresogarcĂ­aacciĂłnecuadorquienesinclusodeberámateriahombresmuestrapodrĂ­amañanaĂşltimaestamosoficialtambienningĂşnsaludospodemosmejorarpositionbusinesshomepagesecuritylanguagestandardcampaignfeaturescategoryexternalchildrenreservedresearchexchangefavoritetemplatemilitaryindustryservicesmaterialproductsz-index:commentssoftwarecompletecalendarplatformarticlesrequiredmovementquestionbuildingpoliticspossiblereligionphysicalfeedbackregisterpicturesdisabledprotocolaudiencesettingsactivityelementslearninganythingabstractprogressoverviewmagazineeconomictrainingpressurevarious 
<strong>propertyshoppingtogetheradvancedbehaviordownloadfeaturedfootballselectedLanguagedistanceremembertrackingpasswordmodifiedstudentsdirectlyfightingnortherndatabasefestivalbreakinglocationinternetdropdownpracticeevidencefunctionmarriageresponseproblemsnegativeprogramsanalysisreleasedbanner">purchasepoliciesregionalcreativeargumentbookmarkreferrerchemicaldivisioncallbackseparateprojectsconflicthardwareinterestdeliverymountainobtained= false;for(var acceptedcapacitycomputeridentityaircraftemployedproposeddomesticincludesprovidedhospitalverticalcollapseapproachpartnerslogo"><adaughterauthor" culturalfamilies/images/assemblypowerfulteachingfinisheddistrictcriticalcgi-bin/purposesrequireselectionbecomingprovidesacademicexerciseactuallymedicineconstantaccidentMagazinedocumentstartingbottom">observed: "extendedpreviousSoftwarecustomerdecisionstrengthdetailedslightlyplanningtextareacurrencyeveryonestraighttransferpositiveproducedheritageshippingabsolutereceivedrelevantbutton" violenceanywherebenefitslaunchedrecentlyalliancefollowedmultiplebulletinincludedoccurredinternal$(this).republic><tr><tdcongressrecordedultimatesolution<ul id="discoverHome</a>websitesnetworksalthoughentirelymemorialmessagescontinueactive">somewhatvictoriaWestern title="LocationcontractvisitorsDownloadwithout right"> +measureswidth = variableinvolvedvirginianormallyhappenedaccountsstandingnationalRegisterpreparedcontrolsaccuratebirthdaystrategyofficialgraphicscriminalpossiblyconsumerPersonalspeakingvalidateachieved.jpg" />machines</h2> + keywordsfriendlybrotherscombinedoriginalcomposedexpectedadequatepakistanfollow" valuable</label>relativebringingincreasegovernorplugins/List of Header">" name=" ("graduate</head> +commercemalaysiadirectormaintain;height:schedulechangingback to catholicpatternscolor: #greatestsuppliesreliable</ul> + <select citizensclothingwatching<li id="specificcarryingsentence<center>contrastthinkingcatch(e)southernMichael merchantcarouselpadding:interior.split("lizationOctober 
){returnimproved--> + +coveragechairman.png" />subjectsRichard whateverprobablyrecoverybaseballjudgmentconnect..css" /> websitereporteddefault"/></a> +electricscotlandcreationquantity. ISBN 0did not instance-search-" lang="speakersComputercontainsarchivesministerreactiondiscountItalianocriteriastrongly: 'http:'script'coveringofferingappearedBritish identifyFacebooknumerousvehiclesconcernsAmericanhandlingdiv id="William provider_contentaccuracysection andersonflexibleCategorylawrence<script>layout="approved maximumheader"></table>Serviceshamiltoncurrent canadianchannels/themes//articleoptionalportugalvalue=""intervalwirelessentitledagenciesSearch" measuredthousandspending…new Date" size="pageNamemiddle" " /></a>hidden">sequencepersonaloverflowopinionsillinoislinks"> + <title>versionssaturdayterminalitempropengineersectionsdesignerproposal="false"Españolreleasessubmit" er"additionsymptomsorientedresourceright"><pleasurestationshistory.leaving border=contentscenter">. + +Some directedsuitablebulgaria.show();designedGeneral conceptsExampleswilliamsOriginal"><span>search">operatorrequestsa "allowingDocumentrevision. + +The yourselfContact michiganEnglish columbiapriorityprintingdrinkingfacilityreturnedContent officersRussian generate-8859-1"indicatefamiliar qualitymargin:0 contentviewportcontacts-title">portable.length eligibleinvolvesatlanticonload="default.suppliedpaymentsglossary + +After guidance</td><tdencodingmiddle">came to displaysscottishjonathanmajoritywidgets.clinicalthailandteachers<head> + affectedsupportspointer;toString</small>oklahomawill be investor0" alt="holidaysResourcelicensed (which . After considervisitingexplorerprimary search" android"quickly meetingsestimate;return ;color:# height=approval, " checked.min.js"magnetic></a></hforecast. 
While thursdaydvertiseéhasClassevaluateorderingexistingpatients Online coloradoOptions"campbell<!-- end</span><<br /> +_popups|sciences," quality Windows assignedheight: <b classle" value=" Companyexamples<iframe believespresentsmarshallpart of properly). + +The taxonomymuch of </span> +" data-srtuguĂŞsscrollTo project<head> +attorneyemphasissponsorsfancyboxworld's wildlifechecked=sessionsprogrammpx;font- Projectjournalsbelievedvacationthompsonlightingand the special border=0checking</tbody><button Completeclearfix +<head> +article <sectionfindingsrole in popular Octoberwebsite exposureused to changesoperatedclickingenteringcommandsinformed numbers </div>creatingonSubmitmarylandcollegesanalyticlistingscontact.loggedInadvisorysiblingscontent"s")s. This packagescheckboxsuggestspregnanttomorrowspacing=icon.pngjapanesecodebasebutton">gamblingsuch as , while </span> missourisportingtop:1px .</span>tensionswidth="2lazyloadnovemberused in height="cript"> + </<tr><td height:2/productcountry include footer" <!-- title"></jquery.</form> 
+(简体)(çąé«”)hrvatskiitalianoromânÄtĂĽrkçeاردŮtambiĂ©nnoticiasmensajespersonasderechosnacionalserviciocontactousuariosprogramagobiernoempresasanunciosvalenciacolombiadespuĂ©sdeportesproyectoproductopĂşbliconosotroshistoriapresentemillonesmediantepreguntaanteriorrecursosproblemasantiagonuestrosopiniĂłnimprimirmientrasamĂ©ricavendedorsociedadrespectorealizarregistropalabrasinterĂ©sentoncesespecialmiembrosrealidadcĂłrdobazaragozapáginassocialesbloqueargestiĂłnalquilersistemascienciascompletoversiĂłncompletaestudiospĂşblicaobjetivoalicantebuscadorcantidadentradasaccionesarchivossuperiormayorĂ­aalemaniafunciĂłnĂşltimoshaciendoaquellosediciĂłnfernandoambientefacebooknuestrasclientesprocesosbastantepresentareportarcongresopublicarcomerciocontratojĂłvenesdistritotĂ©cnicaconjuntoenergĂ­atrabajarasturiasrecienteutilizarboletĂ­nsalvadorcorrectatrabajosprimerosnegocioslibertaddetallespantallaprĂłximoalmerĂ­aanimalesquiĂ©nescorazĂłnsecciĂłnbuscandoopcionesexteriorconceptotodavĂ­agalerĂ­aescribirmedicinalicenciaconsultaaspectoscrĂ­ticadĂłlaresjusticiadeberánperĂ­odonecesitamantenerpequeñorecibidatribunaltenerifecanciĂłncanariasdescargadiversosmallorcarequieretĂ©cnicodeberĂ­aviviendafinanzasadelantefuncionaconsejosdifĂ­cilciudadesantiguasavanzadatĂ©rminounidadessánchezcampañasoftonicrevistascontienesectoresmomentosfacultadcrĂ©ditodiversassupuestofactoressegundospequeñaгодаеŃлиеŃтьбылобытьэтомЕŃлитогоменявŃехэтойдажебылигодŃденьэтотбылаŃебяодинŃебенадоŃайтфотонегоŃвоиŃвойигрытожевŃемŃвоюлиŃьэтихпокаднейдомамиралиботемŃхотядвŃŃ…ŃетилюдиделомиретебяŃвоевидечегоэтимŃчеттемыценыŃталведьтемеводытебевыŃенамитипатомŃправлицаоднагодызнаюмогŃĐ´Ń€ŃгвŃейидеткиноодноделаделеŃрокиюнявеŃŃŚĐ•ŃтьразанаŃиاللهالتيجميعخاصةالذيعليهجديدالآنالردتحŮمصŮŘ­Ř©ŮانتاللييŮŮنشبŮŘ©ŮيهابناتحŮاءأŮثرخلالالحبدليلدرŮساضغطتŮŮنهناŮساحةناديالطبعليŮŘ´ŮرايمŮنمنهاشرŮةرئيسنشيطماذاالŮنشبابتعبررحمةŮاŮةيقŮلمرŮزŮلمةأحمدقلبييعنيصŮرةطريقشارŮجŮالأخرىمعناابحثعرŮضبشŮلمسجلبنانخالدŮتابŮليةبدŮنأيضايŮجدŮريقŮتبتأŮضلمطبخاŮثربارŮاŮضلاحلىنŮسه
أيامردŮدأنهاديناالانمعرضتعلمداخلممŮن���������������������� +  + ˙˙˙˙��������˙˙˙˙������������������˙˙������˙˙����������������resourcescountriesquestionsequipmentcommunityavailablehighlightDTD/xhtmlmarketingknowledgesomethingcontainerdirectionsubscribeadvertisecharacter" value="</select>Australia" class="situationauthorityfollowingprimarilyoperationchallengedevelopedanonymousfunction functionscompaniesstructureagreement" title="potentialeducationargumentssecondarycopyrightlanguagesexclusivecondition</form> +statementattentionBiography} else { +solutionswhen the Analyticstemplatesdangeroussatellitedocumentspublisherimportantprototypeinfluence»</effectivegenerallytransformbeautifultransportorganizedpublishedprominentuntil thethumbnailNational .focus();over the migrationannouncedfooter"> +exceptionless thanexpensiveformationframeworkterritoryndicationcurrentlyclassNamecriticismtraditionelsewhereAlexanderappointedmaterialsbroadcastmentionedaffiliate</option>treatmentdifferent/default.Presidentonclick="biographyotherwisepermanentFrançaisHollywoodexpansionstandards</style> +reductionDecember preferredCambridgeopponentsBusiness confusion> +<title>presentedexplaineddoes not worldwideinterfacepositionsnewspaper</table> +mountainslike the essentialfinancialselectionaction="/abandonedEducationparseInt(stabilityunable to +relationsNote thatefficientperformedtwo yearsSince thethereforewrapper">alternateincreasedBattle ofperceivedtrying tonecessaryportrayedelectionsElizabethdiscoveryinsurances.length;legendaryGeographycandidatecorporatesometimesservices.inheritedCommunityreligiouslocationsCommitteebuildingsthe worldno longerbeginningreferencecannot befrequencytypicallyinto the relative;recordingpresidentinitiallytechniquethe otherit can beexistenceunderlinethis timetelephoneitemscopepracticesadvantage);return For otherprovidingdemocracyboth the extensivesufferingsupportedcomputers functionpracticalsaid thatit may beEnglish
+suspectedmargin: 0spiritual + +microsoftgraduallydiscussedhe becameexecutivejquery.jshouseholdconfirmedpurchasedliterallydestroyedup to thevariationremainingit is notcenturiesJapanese among thecompletedalgorithminterestsrebellionundefinedencourageresizableinvolvingsensitiveuniversalprovision(althoughfeaturingconducted), which continued-header">February numerous overflow:componentfragmentsexcellentcolspan="technicalnear the Advanced source ofexpressedHong Kong Facebookmultiple mechanismelevationoffensive + sponsoreddocument.or "there arethose whomovementsprocessesdifficultsubmittedrecommendconvincedpromoting" width=".replace(classicalcoalitionhis firstdecisionsassistantindicatedevolution-wrapper"enough toalong thedelivered--> + + +
Archbishop class="nobeing usedapproachesprivilegesnoscript> +results inmay be theEaster eggmechanismsreasonablePopulationCollectionselected">noscript> /index.phparrival of-jssdk'));managed toincompletecasualtiescompletionChristiansSeptember arithmeticproceduresmight haveProductionit appearsPhilosophyfriendshipleading togiving thetoward theguaranteeddocumentedcolor:#000video gamecommissionreflectingchange theassociatedsans-serifonkeypress; padding:He was theunderlyingtypically , and the srcElementsuccessivesince the should be networkingaccountinguse of thelower thanshows that + complaintscontinuousquantitiesastronomerhe did notdue to itsapplied toan averageefforts tothe futureattempt toTherefore,capabilityRepublicanwas formedElectronickilometerschallengespublishingthe formerindigenousdirectionssubsidiaryconspiracydetails ofand in theaffordablesubstancesreason forconventionitemtype="absolutelysupposedlyremained aattractivetravellingseparatelyfocuses onelementaryapplicablefound thatstylesheetmanuscriptstands for no-repeat(sometimesCommercialin Americaundertakenquarter ofan examplepersonallyindex.php? +percentagebest-knowncreating a" dir="ltrLieutenant +
is said tostructuralreferendummost oftena separate-> +
implementedcan be seenthere was ademonstratecontainer">connectionsthe Britishwas written!important;px; margin-followed byability to complicatedduring the immigrationalso called

as follows:merged withthrough thecommercial pointed outopportunityview of therequirementdivision ofprogramminghe receivedsetInterval">maintainingChristopherMuch of thewritings of" height="2size of theversion of mixture of between theExamples ofeducationalcompetitive onsubmit="director ofdistinctive/DTD XHTML relating totendency toprovince ofwhich woulddespite thescientific legislature.innerHTML allegationsAgriculturewas used inapproach tointelligentyears later,sans-serifdeterminingPerformanceappearances, which is foundationsabbreviatedhigher thans from the individual composed ofsupposed toclaims thatattributionfont-size:1elements ofHistorical his brotherat the timeanniversarygoverned byrelated to ultimately innovationsit is stillcan only bedefinitionstoGMTStringA number ofimg class="Eventually,was changedoccurred inneighboringdistinguishwhen he wasintroducingterrestrialMany of theargues thatan Americanconquest ofwidespread were killedscreen and In order toexpected todescendantsare locatedlegislativegenerations backgroundmost peopleyears afterthere is nothe highestfrequently they do notargued thatshowed thatpredominanttheologicalby the timeconsideringshort-livedcan be usedvery littleone of the had alreadyinterpretedcommunicatefeatures ofgovernment,entered the" height="3Independentpopulationslarge-scale. 
Although used in thedestructionpossibilitystarting intwo or moreexpressionssubordinatelarger thanhistory and +Continentaleliminatingwill not bepractice ofin front ofsite of theensure thatto create amississippipotentiallyoutstandingbetter thanwhat is nowsituated inmeta name="TraditionalsuggestionsTranslationthe form ofatmosphericideologicalenterprisescalculatingeast of theremnants ofpluginspage/index.php?remained intransformedHe was alsowas alreadystatisticalin favor ofMinistry ofmovement offormulationis required +question ofwas electedto become abecause of some peopleinspired bysuccessful a time whenmore commonamongst thean officialwidth:100%;technology,was adoptedto keep thesettlementslive birthsindex.html"Connecticutassigned to&times;account foralign=rightthe companyalways beenreturned toinvolvementBecause thethis period" name="q" confined toa result ofvalue="" />is actuallyEnvironment + +Conversely,> +
this is notthe presentif they areand finallya matter of +
+ +faster thanmajority ofafter whichcomparativeto maintainimprove theawarded theer" class="frameborderrestorationin the sameanalysis oftheir firstDuring the continentalsequence offunction(){font-size: work on the +adopted theproperty ofdirected byeffectivelywas broughtchildren ofProgramminglonger thanmanuscriptswar againstby means ofand most ofsimilar to proprietaryoriginatingprestigiousgrammaticalexperience.to make theIt was alsois found incompetitorsin the U.S.replace thebrought thecalculationfall of thethe generalpracticallyin honor ofreleased inresidentialand some ofking of thereaction to1st Earl ofculture andprincipally + they can beback to thesome of hisexposure toare similarform of theaddFavoritecitizenshippart in thepeople within practiceto continue&minus;approved by the first allowed theand for thefunctioningplaying thesolution toheight="0" in his bookmore than afollows thecreated thepresence in nationalistthe idea ofa characterwere forced class="btndays of thefeatured inshowing theinterest inin place ofturn of thethe head ofLord of thepoliticallyhas its ownEducationalapproval ofsome of theeach other,behavior ofand becauseand anotherappeared onrecorded inblack"may includethe world'scan lead torefers to aborder="0" government winning theresulted in while the Washington,the subjectcity in the>

+ reflect theto completebecame moreradioactiverejected bywithout anyhis father,which couldcopy of theto indicatea politicalaccounts ofconstitutesworked witherof his lifeaccompaniedclientWidthprevent theLegislativedifferentlytogether inhas severalfor anothertext of thefounded thee with the is used forchanged theusually theplace wherewhereas the> The currentthe site ofsubstantialexperience,in the Westthey shouldslovenčinacomentariosuniversidadcondicionesactividadesexperienciatecnologíaproducciónpuntuaciónaplicacióncontraseñacategoríasregistrarseprofesionaltratamientoregístratesecretaríaprincipalesprotecciónimportantesimportanciaposibilidadinteresantecrecimientonecesidadessuscribirseasociacióndisponiblesevaluaciónestudiantesresponsableresoluciónguadalajararegistradosoportunidadcomercialesfotografíaautoridadesingenieríatelevisióncompetenciaoperacionesestablecidosimplementeactualmentenavegaciónconformidadline-height:font-family:" : "http://applicationslink" href="specifically// +/index.html"window.open( !important;application/independence//www.googleorganizationautocompleterequirementsconservative
most notably/>
notification'undefined')Furthermore,believe thatinnerHTML = prior to thedramaticallyreferring tonegotiationsheadquartersSouth AfricaunsuccessfulPennsylvaniaAs a result, +
English (US)appendChild(transmissions. However, intelligence" tabindex="float:right;Commonwealthranging fromin which theat least onereproductionencyclopedia;font-size:1jurisdictionat that time">compensationchampionshipmedia="all" violation ofreference toreturn true;Strict//EN" transactionsinterventionverificationInformation difficultiesChampionshipcapabilities} + +Christianityfor example,Professionalrestrictionssuggest thatwas released(such as theremoveClass(unemploymentthe Americanstructure of/index.html published inspan class=""> + +f (document.border: 1px {font-size:1treatment of0" height="1modificationIndependencedivided intogreater thanachievementsestablishingJavaScript" neverthelesssignificanceBroadcasting> container"> +such as the influence ofa particularsrc='http://navigation" half of the substantial  advantage ofdiscovery offundamental metropolitanthe opposite" xml:lang="deliberatelyalign=centerevolution ofpreservationimprovementsbeginning inJesus ChristPublicationsdisagreementtext-align:r, function()similaritiesbody>is currentlyalphabeticalis sometimestype="image/many of the flow:hidden;available indescribe theexistence ofall over thethe Internet