Commit 30215244 by Bernhard Kerbl

Initial commit

parents
cmake_minimum_required(VERSION 3.20)
project(DiffRast LANGUAGES CUDA CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
add_library(CudaRasterizer
cuda_rasterizer/backward.h
cuda_rasterizer/backward.cu
cuda_rasterizer/forward.h
cuda_rasterizer/forward.cu
cuda_rasterizer/auxiliary.h
cuda_rasterizer/rasterizer_impl.cu
cuda_rasterizer/rasterizer_impl.h
cuda_rasterizer/rasterizer.h
)
set_target_properties(CudaRasterizer PROPERTIES CUDA_ARCHITECTURES "75;86")
target_include_directories(CudaRasterizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cuda_rasterizer)
target_include_directories(CudaRasterizer PRIVATE third_party/glm ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
#ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED
#define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED
#include "config.h"
#define BLOCK_SIZE (BLOCK_X * BLOCK_Y)
#define NUM_WARPS (BLOCK_SIZE/32)
// Spherical harmonics coefficients
__device__ constexpr float SH_C0 = 0.28209479177387814f;
__device__ constexpr float SH_C1 = 0.4886025119029199f;
__device__ constexpr float SH_C2[] = {
1.0925484305920792f,
-1.0925484305920792f,
0.31539156525252005f,
-1.0925484305920792f,
0.5462742152960396f
};
__device__ constexpr float SH_C3[] = {
-0.5900435899266435f,
2.890611442640554f,
-0.4570457994644658f,
0.3731763325901154f,
-0.4570457994644658f,
1.445305721320277f,
-0.5900435899266435f
};
__forceinline __host__ __device__ float ndc2Pix(float v, int S)
{
return ((v + 1.0) * S - 1.0) * 0.5;
}
__forceinline __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
{
rect_min = {
min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))),
min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y)))
};
rect_max = {
min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))),
min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y)))
};
}
__forceinline __device__ float3 transformPoint4x3(const float3& p, const float* matrix)
{
float3 transformed = {
matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12],
matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13],
matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14],
};
return transformed;
}
__forceinline __device__ float4 transformPoint4x4(const float3& p, const float* matrix)
{
float4 transformed = {
matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12],
matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13],
matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14],
matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15]
};
return transformed;
}
__forceinline __device__ float3 transformVec4x3(const float3& p, const float* matrix)
{
float3 transformed = {
matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z,
matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z,
matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z,
};
return transformed;
}
__forceinline __device__ float3 transformVec4x3Transpose(const float3& p, const float* matrix)
{
float3 transformed = {
matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z,
matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z,
matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z,
};
return transformed;
}
__forceinline __device__ float dnormvdz(float3 v, float3 dv)
{
float sum2 = v.x * v.x + v.y * v.y + v.z * v.z;
float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
float dnormvdz = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32;
return dnormvdz;
}
__forceinline __device__ float3 dnormvdv(float3 v, float3 dv)
{
float sum2 = v.x * v.x + v.y * v.y + v.z * v.z;
float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
float3 dnormvdv;
dnormvdv.x = ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) * invsum32;
dnormvdv.y = (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) * invsum32;
dnormvdv.z = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32;
return dnormvdv;
}
__forceinline __device__ float4 dnormvdv(float4 v, float4 dv)
{
float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
float4 vdv = { v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w };
float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w;
float4 dnormvdv;
dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32;
dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32;
dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32;
dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32;
return dnormvdv;
}
__forceinline __device__ float sigmoid(float x)
{
return 1.0f / (1.0f + expf(-x));
}
__forceinline __device__ bool in_frustum(int idx,
const float* orig_points,
const float* viewmatrix,
const float* projmatrix,
bool prefiltered,
float3& p_view)
{
float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
// Bring points to screen space
float4 p_hom = transformPoint4x4(p_orig, projmatrix);
float p_w = 1.0f / (p_hom.w + 0.0000001f);
float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
p_view = transformPoint4x3(p_orig, viewmatrix);
if (p_view.z <= 0.2f || ((p_proj.x < -1.3 || p_proj.x > 1.3 || p_proj.y < -1.3 || p_proj.y > 1.3)))
{
if (prefiltered)
{
printf("Point is filtered although prefiltered is set. This shouldn't happen!");
__trap();
}
return false;
}
return true;
}
#endif
\ No newline at end of file
#include "backward.h"
#include "auxiliary.h"
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
// Backward pass for conversion of spherical harmonics to RGB for
// each Gaussian.
__device__ void computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, const bool* clamped, const glm::vec3* dL_dcolor, glm::vec3* dL_dmeans, glm::vec3* dL_dshs)
{
// Compute intermediate values, as it is done during forward
glm::vec3 pos = means[idx];
glm::vec3 dir_orig = pos - campos;
glm::vec3 dir = dir_orig / glm::length(dir_orig);
glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
// Use PyTorch rule for clamping: if clamping was applied,
// gradient becomes 0.
glm::vec3 dL_dRGB = dL_dcolor[idx];
dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1;
dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1;
dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1;
glm::vec3 dRGBdx(0, 0, 0);
glm::vec3 dRGBdy(0, 0, 0);
glm::vec3 dRGBdz(0, 0, 0);
float x = dir.x;
float y = dir.y;
float z = dir.z;
// Target location for this Gaussian to write SH gradients to
glm::vec3* dL_dsh = dL_dshs + idx * max_coeffs;
// No tricks here, just high school-level calculus.
float dRGBdsh0 = SH_C0;
dL_dsh[0] = dRGBdsh0 * dL_dRGB;
if (deg > 0)
{
float dRGBdsh1 = -SH_C1 * y;
float dRGBdsh2 = SH_C1 * z;
float dRGBdsh3 = -SH_C1 * x;
dL_dsh[1] = dRGBdsh1 * dL_dRGB;
dL_dsh[2] = dRGBdsh2 * dL_dRGB;
dL_dsh[3] = dRGBdsh3 * dL_dRGB;
dRGBdx = -SH_C1 * sh[3];
dRGBdy = -SH_C1 * sh[1];
dRGBdz = SH_C1 * sh[2];
if (deg > 1)
{
float xx = x * x, yy = y * y, zz = z * z;
float xy = x * y, yz = y * z, xz = x * z;
float dRGBdsh4 = SH_C2[0] * xy;
float dRGBdsh5 = SH_C2[1] * yz;
float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy);
float dRGBdsh7 = SH_C2[3] * xz;
float dRGBdsh8 = SH_C2[4] * (xx - yy);
dL_dsh[4] = dRGBdsh4 * dL_dRGB;
dL_dsh[5] = dRGBdsh5 * dL_dRGB;
dL_dsh[6] = dRGBdsh6 * dL_dRGB;
dL_dsh[7] = dRGBdsh7 * dL_dRGB;
dL_dsh[8] = dRGBdsh8 * dL_dRGB;
dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] + SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8];
dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] + SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8];
dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] + SH_C2[3] * x * sh[7];
if (deg > 2)
{
float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy);
float dRGBdsh10 = SH_C3[1] * xy * z;
float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy);
float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy);
float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy);
float dRGBdsh14 = SH_C3[5] * z * (xx - yy);
float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy);
dL_dsh[9] = dRGBdsh9 * dL_dRGB;
dL_dsh[10] = dRGBdsh10 * dL_dRGB;
dL_dsh[11] = dRGBdsh11 * dL_dRGB;
dL_dsh[12] = dRGBdsh12 * dL_dRGB;
dL_dsh[13] = dRGBdsh13 * dL_dRGB;
dL_dsh[14] = dRGBdsh14 * dL_dRGB;
dL_dsh[15] = dRGBdsh15 * dL_dRGB;
dRGBdx += (
SH_C3[0] * sh[9] * 3.f * 2.f * xy +
SH_C3[1] * sh[10] * yz +
SH_C3[2] * sh[11] * -2.f * xy +
SH_C3[3] * sh[12] * -3.f * 2.f * xz +
SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) +
SH_C3[5] * sh[14] * 2.f * xz +
SH_C3[6] * sh[15] * 3.f * (xx - yy));
dRGBdy += (
SH_C3[0] * sh[9] * 3.f * (xx - yy) +
SH_C3[1] * sh[10] * xz +
SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) +
SH_C3[3] * sh[12] * -3.f * 2.f * yz +
SH_C3[4] * sh[13] * -2.f * xy +
SH_C3[5] * sh[14] * -2.f * yz +
SH_C3[6] * sh[15] * -3.f * 2.f * xy);
dRGBdz += (
SH_C3[1] * sh[10] * xy +
SH_C3[2] * sh[11] * 4.f * 2.f * yz +
SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) +
SH_C3[4] * sh[13] * 4.f * 2.f * xz +
SH_C3[5] * sh[14] * (xx - yy));
}
}
}
// The view direction is an input to the computation. View direction
// is influenced by the Gaussian's mean, so SHs gradients
// must propagate back into 3D position.
glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB), glm::dot(dRGBdz, dL_dRGB));
// Account for normalization of direction
float3 dL_dmean = dnormvdv(float3{ dir_orig.x, dir_orig.y, dir_orig.z }, float3{ dL_ddir.x, dL_ddir.y, dL_ddir.z });
// Gradients of loss w.r.t. Gaussian means, but only the portion
// that is caused because the mean affects the view-dependent color.
// Additional mean gradient is accumulated in below methods.
dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z);
}
// Backward version of INVERSE 2D covariance matrix computation
// (due to length launched as separate kernel before other
// backward steps contained in preprocess)
__global__ void computeCov2DCUDA(int P,
const float3* means,
const int* radii,
const float* cov3Ds,
float h_x,
float h_y,
const float* view_matrix,
const float* dL_dconics,
float3* dL_dmeans,
float* dL_dcov)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P || !(radii[idx] > 0))
return;
// Reading location of 3D covariance for this Gaussian
const float* cov3D = cov3Ds + 6 * idx;
// Fetch gradients, recompute 2D covariance and relevant
// intermediate forward results needed in the backward.
float3 mean = means[idx];
float3 dL_dconic = { dL_dconics[4 * idx], dL_dconics[4 * idx + 1], dL_dconics[4 * idx + 3] };
float3 t = transformPoint4x3(mean, view_matrix);
float t_inv_norm = 1.f / sqrt(t.x * t.x + t.y * t.y + t.z * t.z);
glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z),
0.0f, h_y / t.z, -(h_y * t.y) / (t.z * t.z),
t.x * t_inv_norm, t.y * t_inv_norm, t.z * t_inv_norm);
glm::mat3 W = glm::mat3(
view_matrix[0], view_matrix[4], view_matrix[8],
view_matrix[1], view_matrix[5], view_matrix[9],
view_matrix[2], view_matrix[6], view_matrix[10]);
glm::mat3 Vrk = glm::mat3(
cov3D[0], cov3D[1], cov3D[2],
cov3D[1], cov3D[3], cov3D[4],
cov3D[2], cov3D[4], cov3D[5]);
glm::mat3 T = W * J;
glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T;
// Use helper variables for 2D covariance entries. More compact.
float a = cov2D[0][0] += 0.3f;
float b = cov2D[0][1];
float c = cov2D[1][1] += 0.3f;
float denom = a * c - b * b;
float dL_da = 0, dL_db = 0, dL_dc = 0;
float denom2inv = 1.0f / ((denom * denom) + 0.0000001f);
if (denom2inv != 0)
{
// Gradients of loss w.r.t. entries of 2D covariance matrix,
// given gradients of loss w.r.t. conic matrix (inverse covariance matrix).
// e.g., dL / da = dL / d_conic_a * d_conic_a / d_a
dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y + (denom - a * c) * dL_dconic.z);
dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y + (denom - a * c) * dL_dconic.x);
dL_db = denom2inv * 2 * (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y + a * b * dL_dconic.z);
// Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry,
// given gradients w.r.t. 2D covariance matrix (diagonal).
// cov2D = transpose(T) * transpose(Vrk) * T;
dL_dcov[6 * idx + 0] = (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db + T[1][0] * T[1][0] * dL_dc);
dL_dcov[6 * idx + 3] = (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc);
dL_dcov[6 * idx + 5] = (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db + T[1][2] * T[1][2] * dL_dc);
// Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry,
// given gradients w.r.t. 2D covariance matrix (off-diagonal).
// Off-diagonal elements appear twice --> double the gradient.
// cov2D = transpose(T) * transpose(Vrk) * T;
dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da + (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][1] * dL_dc;
dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da + (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][2] * dL_dc;
dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da + (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db + 2 * T[1][1] * T[1][2] * dL_dc;
}
else
{
for (int i = 0; i < 6; i++)
dL_dcov[6 * idx + i] = 0;
}
// Gradients of loss w.r.t. upper 2x3 portion of intermediate matrix T
// cov2D = transpose(T) * transpose(Vrk) * T;
float dL_dT00 = 2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_da +
(T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db;
float dL_dT01 = 2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_da +
(T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db;
float dL_dT02 = 2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_da +
(T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db;
float dL_dT10 = 2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_dc +
(T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db;
float dL_dT11 = 2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_dc +
(T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db;
float dL_dT12 = 2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_dc +
(T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db;
// Gradients of loss w.r.t. upper 3x2 non-zero entries of Jacobian matrix
// T = W * J
float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02;
float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02;
float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12;
float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12;
float tz = 1.f / t.z;
float tz2 = tz * tz;
float tz3 = tz2 * tz;
// Gradients of loss w.r.t. transformed Gaussian mean t
float dL_dtx = -h_x * tz2 * dL_dJ02;
float dL_dty = -h_y * tz2 * dL_dJ12;
float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 + (2 * h_x * t.x) * tz3 * dL_dJ02 + (2 * h_y * t.y) * tz3 * dL_dJ12;
// Account for transformation of mean to t
// t = transformPoint4x3(mean, view_matrix);
float3 dL_dmean = transformVec4x3Transpose({ dL_dtx, dL_dty, dL_dtz }, view_matrix);
// Gradients of loss w.r.t. Gaussian means, but only the portion
// that is caused because the mean affects the covariance matrix.
// Additional mean gradient is accumulated in BACKWARD::preprocess.
dL_dmeans[idx] = dL_dmean;
}
// Backward pass for the conversion of scale and rotation to a
// 3D covariance matrix for each Gaussian.
__device__ void computeCov3D(int idx, const glm::vec3 scale, float mod, const glm::vec4 rot, const float* dL_dcov3Ds, glm::vec3* dL_dscales, glm::vec4* dL_drots)
{
// Recompute (intermediate) results for the 3D covariance computation.
glm::vec4 q = rot / glm::length(rot);
float r = q.x;
float x = q.y;
float y = q.z;
float z = q.w;
glm::mat3 R = glm::mat3(
1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
);
glm::mat3 S = glm::mat3(1.0f);
glm::vec3 s = mod * exp(scale);
S[0][0] = s.x;
S[1][1] = s.y;
S[2][2] = s.z;
glm::mat3 M = S * R;
const float* dL_dcov3D = dL_dcov3Ds + 6 * idx;
glm::vec3 dunc(dL_dcov3D[0], dL_dcov3D[3], dL_dcov3D[5]);
glm::vec3 ounc = 0.5f * glm::vec3(dL_dcov3D[1], dL_dcov3D[2], dL_dcov3D[4]);
// Convert per-element covariance loss gradients to matrix form
glm::mat3 dL_dSigma = glm::mat3(
dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2],
0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4],
0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5]
);
// Compute loss gradient w.r.t. matrix M
// dSigma_dM = 2 * M
glm::mat3 dL_dM = 2.0f * M * dL_dSigma;
glm::mat3 Rt = glm::transpose(R);
glm::mat3 dL_dMt = glm::transpose(dL_dM);
dL_dMt[0] *= s.x;
dL_dMt[1] *= s.y;
dL_dMt[2] *= s.z;
// Gradients of loss w.r.t. scale
glm::vec3* dL_dscale = dL_dscales + idx;
dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]);
dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]);
dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]);
// Gradients of loss w.r.t. normalized quaternion
glm::vec4 dL_dq;
dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * x * (dL_dMt[1][2] - dL_dMt[2][1]);
dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) - 4 * x * (dL_dMt[2][2] + dL_dMt[1][1]);
dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * y * (dL_dMt[2][2] + dL_dMt[0][0]);
dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * z * (dL_dMt[1][1] + dL_dMt[0][0]);
// Gradients of loss w.r.t. unnormalized quaternion
float4* dL_drot = (float4*)(dL_drots + idx);
*dL_drot = dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w }, float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w });
}
// Backward pass of the preprocessing steps, except
// for the covariance computation and inversion
// (those are handled by a previous kernel call)
template<int C>
__global__ void preprocessCUDA(
int P, int D, int M,
const float3* means,
const int* radii,
const float* shs,
const bool* clamped,
const glm::vec3* scales,
const glm::vec4* rotations,
const float scale_modifier,
const float* proj,
const glm::vec3* campos,
const float3* dL_dmean2D,
glm::vec3* dL_dmeans,
float* dL_dcolor,
float* dL_dcov3D,
float* dL_dsh,
glm::vec3* dL_dscale,
glm::vec4* dL_drot)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P || !(radii[idx] > 0))
return;
float3 m = means[idx];
// Taking care of gradients from the screenspace points
float4 m_hom = transformPoint4x4(m, proj);
float m_w = 1.0f / (m_hom.w + 0.0000001f);
// Compute loss gradient w.r.t. 3D means due to gradients of 2D means
// from rendering procedure
glm::vec3 dL_dmean;
float mul1 = (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w;
float mul2 = (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w;
dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * dL_dmean2D[idx].x + (proj[1] * m_w - proj[3] * mul2) * dL_dmean2D[idx].y;
dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * dL_dmean2D[idx].x + (proj[5] * m_w - proj[7] * mul2) * dL_dmean2D[idx].y;
dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * dL_dmean2D[idx].x + (proj[9] * m_w - proj[11] * mul2) * dL_dmean2D[idx].y;
// That's the second part of the mean gradient. Previous computation
// of cov2D and following SH conversion also affects it.
dL_dmeans[idx] += dL_dmean;
// Compute gradient updates due to computing colors from SHs
if (shs)
computeColorFromSH(idx, D, M, (glm::vec3*)means, *campos, shs, clamped, (glm::vec3*)dL_dcolor, (glm::vec3*)dL_dmeans, (glm::vec3*)dL_dsh);
// Compute gradient updates due to computing covariance from scale/rotation
if (scales)
computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot);
}
// Backward version of the rendering procedure.
template <uint32_t C>
__global__ void renderCUDA(
const uint2* __restrict__ ranges,
const uint32_t* __restrict__ point_list,
int W, int H,
const float* __restrict__ bg_color,
const float2* __restrict__ points_xy_image,
const float4* __restrict__ conic_opacity,
const float* __restrict__ colors,
const float* __restrict__ final_Ts,
const uint32_t* __restrict__ n_contrib,
const float* __restrict__ dL_dpixels,
float3* __restrict__ dL_dmean2D,
float4* __restrict__ dL_dconic2D,
float* __restrict__ dL_dopacity,
float* __restrict__ dL_dcolors)
{
// We rasterize again. Compute necessary block info.
auto block = cg::this_thread_block();
const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
const uint32_t pix_id = W * pix.y + pix.x;
const float2 pixf = { (float)pix.x, (float)pix.y };
const bool inside = pix.x < W&& pix.y < H;
const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
bool done = !inside;
int toDo = range.y - range.x;
__shared__ int collected_id[BLOCK_SIZE];
__shared__ float2 collected_xy[BLOCK_SIZE];
__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
__shared__ float collected_colors[C * BLOCK_SIZE];
// In the forward, we stored the final value for T, the
// product of all (1 - alpha) factors.
const float T_final = inside ? final_Ts[pix_id] : 0;
float T = T_final;
// We start from the back. The ID of the last contributing
// Gaussian is known from each pixel from the forward.
uint32_t contributor = toDo;
const int last_contributor = inside ? n_contrib[pix_id] : 0;
float accum_rec[C] = { 0 };
float dL_dpixel[C];
if (inside)
for (int i = 0; i < C; i++)
dL_dpixel[i] = dL_dpixels[i * H * W + pix_id];
float last_alpha = 0;
float last_color[C] = { 0 };
// Gradient of pixel coordinate w.r.t. normalized
// screen-space viewport corrdinates (-1 to 1)
const float ddelx_dx = 0.5 * W;
const float ddely_dy = 0.5 * H;
// Traverse all Gaussians
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// Load auxiliary data into shared memory, start in the BACK
// and load them in revers order.
block.sync();
const int progress = i * BLOCK_SIZE + block.thread_rank();
if (range.x + progress < range.y)
{
const int coll_id = point_list[range.y - progress - 1];
collected_id[block.thread_rank()] = coll_id;
collected_xy[block.thread_rank()] = points_xy_image[coll_id];
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
for (int i = 0; i < C; i++)
collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i];
}
block.sync();
// Iterate over Gaussians
for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
{
// Keep track of current Gaussian ID. Skip, if this one
// is behind the last contributor for this pixel.
contributor--;
if (contributor >= last_contributor)
continue;
// Compute blending values, as before.
const float2 xy = collected_xy[j];
const float2 d = { xy.x - pixf.x, xy.y - pixf.y };
const float4 con_o = collected_conic_opacity[j];
const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
if (power > 0.0f)
continue;
const float G = exp(power);
const float alpha = min(0.99f, con_o.w * G);
if (alpha < 1.0f / 255.0f)
continue;
T = T / (1.f - alpha);
const float dchannel_dcolor = alpha * T;
// Propagate gradients to per-Gaussian colors and keep
// gradients w.r.t. alpha (blending factor for a Gaussian/pixel
// pair).
float dL_dalpha = 0.0f;
const int global_id = collected_id[j];
for (int ch = 0; ch < C; ch++)
{
const float c = collected_colors[ch * BLOCK_SIZE + j];
// Update last color (to be used in the next iteration)
accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch];
last_color[ch] = c;
const float dL_dchannel = dL_dpixel[ch];
dL_dalpha += (c - accum_rec[ch]) * dL_dchannel;
// Update the gradients w.r.t. color of the Gaussian.
// Atomic, since this pixel is just one of potentially
// many that were affected by this Gaussian.
atomicAdd(&(dL_dcolors[global_id * C + ch]), dchannel_dcolor * dL_dchannel);
}
dL_dalpha *= T;
// Update last alpha (to be used in the next iteration)
last_alpha = alpha;
// Account for fact that alpha also influences how much of
// the background color is added if nothing left to blend
float bg_dot_dpixel = 0;
for (int i = 0; i < C; i++)
bg_dot_dpixel += bg_color[i] * dL_dpixel[i];
dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel;
// Helpful reusable temporary variables
const float dL_dG = con_o.w * dL_dalpha;
const float gdx = G * d.x;
const float gdy = G * d.y;
const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y;
const float dG_ddely = -gdy * con_o.z - gdx * con_o.y;
// Update gradients w.r.t. 2D mean position of the Gaussian
atomicAdd(&dL_dmean2D[global_id].x, dL_dG * dG_ddelx * ddelx_dx);
atomicAdd(&dL_dmean2D[global_id].y, dL_dG * dG_ddely * ddely_dy);
// Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric)
atomicAdd(&dL_dconic2D[global_id].x, -0.5f * gdx * d.x * dL_dG);
atomicAdd(&dL_dconic2D[global_id].y, -0.5f * gdx * d.y * dL_dG);
atomicAdd(&dL_dconic2D[global_id].w, -0.5f * gdy * d.y * dL_dG);
// Update gradients w.r.t. opacity of the Gaussian
atomicAdd(&(dL_dopacity[global_id]), G * dL_dalpha);
}
}
}
void BACKWARD::preprocess(
int P, int D, int M,
const float3* means3D,
const int* radii,
const float* shs,
const bool* clamped,
const glm::vec3* scales,
const glm::vec4* rotations,
const float scale_modifier,
const float* cov3Ds,
const float* viewmatrix,
const float* projmatrix,
const float focal_x, float focal_y,
const glm::vec3* campos,
const float3* dL_dmean2D,
const float* dL_dconic,
glm::vec3* dL_dmean3D,
float* dL_dcolor,
float* dL_dcov3D,
float* dL_dsh,
glm::vec3* dL_dscale,
glm::vec4* dL_drot)
{
// Propagate gradients for the path of 2D conic matrix computation.
// Somewhat long, thus it is its own kernel rather than being part of
// "preprocess". When done, loss gradient w.r.t. 3D means has been
// modified and gradient w.r.t. 3D covariance matrix has been computed.
computeCov2DCUDA << <(P + 255) / 256, 256 >> > (
P,
means3D,
radii,
cov3Ds,
focal_x,
focal_y,
viewmatrix,
dL_dconic,
(float3*)dL_dmean3D,
dL_dcov3D);
// Propagate gradients for remaining steps: finish 3D mean gradients,
// propagate color gradients to SH (if desireD), propagate 3D covariance
// matrix gradients to scale and rotation.
preprocessCUDA<NUM_CHANNELS> << < (P + 255) / 256, 256 >> > (
P, D, M,
(float3*)means3D,
radii,
shs,
clamped,
(glm::vec3*)scales,
(glm::vec4*)rotations,
scale_modifier,
projmatrix,
campos,
(float3*)dL_dmean2D,
(glm::vec3*)dL_dmean3D,
dL_dcolor,
dL_dcov3D,
dL_dsh,
dL_dscale,
dL_drot);
}
void BACKWARD::render(
const dim3 grid, const dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H,
const float* bg_color,
const float2* means2D,
const float4* conic_opacity,
const float* colors,
const float* final_Ts,
const uint32_t* n_contrib,
const float* dL_dpixels,
float3* dL_dmean2D,
float4* dL_dconic2D,
float* dL_dopacity,
float* dL_dcolors)
{
renderCUDA<NUM_CHANNELS> << <grid, block >> >(
ranges,
point_list,
W, H,
bg_color,
means2D,
conic_opacity,
colors,
final_Ts,
n_contrib,
dL_dpixels,
dL_dmean2D,
dL_dconic2D,
dL_dopacity,
dL_dcolors
);
}
\ No newline at end of file
#ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED
#define CUDA_RASTERIZER_BACKWARD_H_INCLUDED
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define GLM_FORCE_CUDA
#include <glm/glm.hpp>
namespace BACKWARD
{
void render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H,
const float* bg_color,
const float2* means2D,
const float4* conic_opacity,
const float* colors,
const float* final_Ts,
const uint32_t* n_contrib,
const float* dL_dpixels,
float3* dL_dmean2D,
float4* dL_dconic2D,
float* dL_dopacity,
float* dL_dcolors);
void preprocess(
int P, int D, int M,
const float3* means,
const int* radii,
const float* shs,
const bool* clamped,
const glm::vec3* scales,
const glm::vec4* rotations,
const float scale_modifier,
const float* cov3Ds,
const float* view,
const float* proj,
const float focal_x, float focal_y,
const glm::vec3* campos,
const float3* dL_dmean2D,
const float* dL_dconics,
glm::vec3* dL_dmeans,
float* dL_dcolor,
float* dL_dcov3D,
float* dL_dsh,
glm::vec3* dL_dscale,
glm::vec4* dL_drot);
}
#endif
\ No newline at end of file
#ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED
#define CUDA_RASTERIZER_CONFIG_H_INCLUDED
#define NUM_CHANNELS 3 // Default 3, RGB
#define BLOCK_X 16
#define BLOCK_Y 16
#endif
\ No newline at end of file
#include "forward.h"
#include "auxiliary.h"
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
// Forward method for converting the input spherical harmonics
// coefficients of each Gaussian to a simple RGB color.
__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
{
// The implementation is loosely based on code for
// "Differentiable Point-Based Radiance Fields for
// Efficient View Synthesis" by Zhang et al. (2022)
glm::vec3 pos = means[idx];
glm::vec3 dir = pos - campos;
dir = dir / glm::length(dir);
glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
glm::vec3 result = SH_C0 * sh[0];
if (deg > 0)
{
float x = dir.x;
float y = dir.y;
float z = dir.z;
result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3];
if (deg > 1)
{
float xx = x * x, yy = y * y, zz = z * z;
float xy = x * y, yz = y * z, xz = x * z;
result = result +
SH_C2[0] * xy * sh[4] +
SH_C2[1] * yz * sh[5] +
SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
SH_C2[3] * xz * sh[7] +
SH_C2[4] * (xx - yy) * sh[8];
if (deg > 2)
{
result = result +
SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
SH_C3[1] * xy * z * sh[10] +
SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
SH_C3[5] * z * (xx - yy) * sh[14] +
SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
}
}
}
result += 0.5f;
// RGB colors are clamped to positive values. If values are
// clamped, we need to keep track of this for the backward pass.
clamped[3 * idx + 0] = (result.x < 0);
clamped[3 * idx + 1] = (result.y < 0);
clamped[3 * idx + 2] = (result.z < 0);
return glm::max(result, 0.0f);
}
// Forward version of 2D covariance matrix computation
__device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, const float* cov3D, const float* viewmatrix)
{
// The following models the steps outlined by equations 29
// and 31 in "EWA Splatting" (Zwicker et al., 2002).
// Additionally considers aspect / scaling of viewport.
// Transposes used to account for row-/column-major conventions.
float3 t = transformPoint4x3(mean, viewmatrix);
float t_inv_norm = 1.f / sqrt(t.x * t.x + t.y * t.y + t.z * t.z);
glm::mat3 J = glm::mat3(
focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z),
0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z),
t.x * t_inv_norm, t.y * t_inv_norm, t.z * t_inv_norm);
glm::mat3 W = glm::mat3(
viewmatrix[0], viewmatrix[4], viewmatrix[8],
viewmatrix[1], viewmatrix[5], viewmatrix[9],
viewmatrix[2], viewmatrix[6], viewmatrix[10]);
glm::mat3 T = W * J;
glm::mat3 Vrk = glm::mat3(
cov3D[0], cov3D[1], cov3D[2],
cov3D[1], cov3D[3], cov3D[4],
cov3D[2], cov3D[4], cov3D[5]);
glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T;
// Apply low-pass filter: every Gaussian should be at least
// one pixel wide/high. Discard 3rd row and column.
cov[0][0] += 0.3f;
cov[1][1] += 0.3f;
return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) };
}
// Forward method for converting scale and rotation properties of each
// Gaussian to a 3D covariance matrix in world space. Also takes care
// of quaternion normalization and scale activation via exp.
__device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D)
{
// Create scaling matrix
glm::mat3 S = glm::mat3(1.0f);
S[0][0] = mod * exp(scale.x);
S[1][1] = mod * exp(scale.y);
S[2][2] = mod * exp(scale.z);
// Normalize quaternion to get valid rotation
glm::vec4 q = rot / glm::length(rot);
float r = q.x;
float x = q.y;
float y = q.z;
float z = q.w;
// Compute rotation matrix from quaternion
glm::mat3 R = glm::mat3(
1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
);
glm::mat3 M = S * R;
// Compute 3D world covariance matrix Sigma
glm::mat3 Sigma = glm::transpose(M) * M;
// Covariance is symmetric, only store upper right
cov3D[0] = Sigma[0][0];
cov3D[1] = Sigma[0][1];
cov3D[2] = Sigma[0][2];
cov3D[3] = Sigma[1][1];
cov3D[4] = Sigma[1][2];
cov3D[5] = Sigma[2][2];
}
// Perform initial steps for each Gaussian prior to rasterization.
template<int C>
__global__ void preprocessCUDA(int P, int D, int M,
const float* orig_points,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const int W, int H,
const float tan_fovx, float tan_fovy,
const float focal_x, float focal_y,
int* radii,
float2* points_xy_image,
float* depths,
float* cov3Ds,
float* rgb,
float4* conic_opacity,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
// Initialize radius and touched tiles to 0. If this isn't changed,
// this Gaussian will not be processed further.
radii[idx] = 0;
tiles_touched[idx] = 0;
// Perform near and frustum culling with guardband, quit if outside.
float3 p_view;
if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view))
return;
// Transform point by projecting
float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
float4 p_hom = transformPoint4x4(p_orig, projmatrix);
float p_w = 1.0f / (p_hom.w + 0.0000001f);
float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
// If 3D covariance matrix is precomputed, use it, otherwise compute
// from scaling and rotation parameters.
const float* cov3D;
if (cov3D_precomp != nullptr)
{
cov3D = cov3D_precomp + idx * 6;
}
else
{
computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
cov3D = cov3Ds + idx * 6;
}
// Compute max extent of Gaussian for fine-grained fustum culling
float max_dist2 = 9.f * max(cov3D[0], max(cov3D[3], cov3D[5]));
// Compute 2D screen-space covariance matrix
float3 cov = computeCov2D(p_orig, focal_x, focal_y, cov3D, viewmatrix);
// Invert covariance (EWA algorithm)
float det = (cov.x * cov.z - cov.y * cov.y);
if (det == 0.0f)
return;
float det_inv = 1.f / det;
float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv };
// Fine-grained frustum culling against ellipsoid
float z_at_point = p_view.z + sqrt(max_dist2);
float x_to_border = z_at_point * tan_fovx;
float y_to_border = z_at_point * tan_fovy;
float D2_point = p_view.x * p_view.x + p_view.y * p_view.y;
if (D2_point - (x_to_border * x_to_border + y_to_border * y_to_border) > max_dist2)
return;
// Compute extent in screen space (by finding eigenvalues of
// 2D covariance matrix). Use extent to compute a bounding rectangle
// of screen-space tiles that this Gaussian overlaps with. Quit if
// rectangle covers 0 tiles.
float mid = 0.5f * (cov.x + cov.z);
float lambda1 = mid + sqrt(max(0.1f, mid * mid - det));
float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2)));
float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) };
uint2 rect_min, rect_max;
getRect(point_image, my_radius, rect_min, rect_max, grid);
if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
return;
// If colors have been precomputed, use them, otherwise convert
// spherical harmonics coefficients to RGB color.
if (colors_precomp == nullptr)
{
glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
rgb[idx * C + 0] = result.x;
rgb[idx * C + 1] = result.y;
rgb[idx * C + 2] = result.z;
}
// Store some useful helper data for the next steps.
depths[idx] = p_view.z;
radii[idx] = my_radius;
points_xy_image[idx] = point_image;
// Inverse 2D covariance and opacity neatly pack into one float4
conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] };
tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
}
// Main rasterization method. Collaboratively works on one tile per
// block, each thread treats one pixel. Alternates between fetching
// and rasterizing data.
template <uint32_t CHANNELS>
__global__ void renderCUDA(
const uint2* __restrict__ ranges,
const uint32_t* __restrict__ point_list,
int W, int H,
const float2* __restrict__ points_xy_image,
const float* __restrict__ features,
const float4* __restrict__ conic_opacity,
float* __restrict__ final_T,
uint32_t* __restrict__ n_contrib,
const float* __restrict__ bg_color,
float* __restrict__ out_color)
{
// Identify current tile and associated min/max pixel range.
auto block = cg::this_thread_block();
uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
uint32_t pix_id = W * pix.y + pix.x;
float2 pixf = { (float)pix.x, (float)pix.y };
// Check if this thread is associated with a valid pixel or outside.
bool inside = pix.x < W&& pix.y < H;
// Done threads can help with fetching, but don't rasterize
bool done = !inside;
// Load start/end range of IDs to process in bit sorted list.
uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
int toDo = range.y - range.x;
// Allocate storage for batches of collectively fetched data.
__shared__ int collected_id[BLOCK_SIZE];
__shared__ float2 collected_xy[BLOCK_SIZE];
__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
// Initialize helper variables
float T = 1.0f;
uint32_t contributor = 0;
uint32_t last_contributor = 0;
float C[CHANNELS] = { 0 };
// Iterate over batches until all done or range is complete
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// End if entire block votes that it is done rasterizing
int num_done = __syncthreads_count(done);
if (num_done == BLOCK_SIZE)
break;
// Collectively fetch per-Gaussian data from global to shared
int progress = i * BLOCK_SIZE + block.thread_rank();
if (range.x + progress < range.y)
{
int coll_id = point_list[range.x + progress];
collected_id[block.thread_rank()] = coll_id;
collected_xy[block.thread_rank()] = points_xy_image[coll_id];
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
}
block.sync();
// Iterate over current batch
for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
{
// Keep track of current position in range
contributor++;
// Resample using conic matrix (cf. "Surface
// Splatting" by Zwicker et al., 2001)
float2 xy = collected_xy[j];
float2 d = { xy.x - pixf.x, xy.y - pixf.y };
float4 con_o = collected_conic_opacity[j];
float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
if (power > 0.0f)
continue;
// Eq. (2) from 3D Gaussian splatting paper.
// Obtain alpha by multiplying with Gaussian opacity
// and its exponential falloff from mean.
// Avoid numerical instabilities (see paper appendix).
float alpha = min(0.99f, con_o.w * exp(power));
if (alpha < 1.0f / 255.0f)
continue;
float test_T = T * (1 - alpha);
if (test_T < 0.0001f)
{
done = true;
continue;
}
// Eq. (3) from 3D Gaussian splatting paper.
for (int ch = 0; ch < CHANNELS; ch++)
C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T;
T = test_T;
// Keep track of last range entry to update this
// pixel.
last_contributor = contributor;
}
}
// All threads that treat valid pixel write out their final
// rendering data to the frame and auxiliary buffers.
if (inside)
{
final_T[pix_id] = T;
n_contrib[pix_id] = last_contributor;
for (int ch = 0; ch < CHANNELS; ch++)
out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch];
}
}
void FORWARD::render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H,
const float2* means2D,
const float* colors,
const float4* conic_opacity,
float* final_T,
uint32_t* n_contrib,
const float* bg_color,
float* out_color)
{
renderCUDA<NUM_CHANNELS> << <grid, block >> > (
ranges,
point_list,
W, H,
means2D,
colors,
conic_opacity,
final_T,
n_contrib,
bg_color,
out_color);
}
void FORWARD::preprocess(int P, int D, int M,
const float* means3D,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const int W, int H,
const float tan_fovx, float tan_fovy,
const float focal_x, float focal_y,
int* radii,
float2* means2D,
float* depths,
float* cov3Ds,
float* rgb,
float4* conic_opacity,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered)
{
preprocessCUDA<NUM_CHANNELS> << <(P + 255) / 256, 256 >> > (
P, D, M,
means3D,
scales,
scale_modifier,
rotations,
opacities,
shs,
clamped,
cov3D_precomp,
colors_precomp,
viewmatrix,
projmatrix,
cam_pos,
W, H,
tan_fovx, tan_fovy,
focal_x, focal_y,
radii,
means2D,
depths,
cov3Ds,
rgb,
conic_opacity,
grid,
tiles_touched,
prefiltered
);
}
\ No newline at end of file
#ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED
#define CUDA_RASTERIZER_FORWARD_H_INCLUDED
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define GLM_FORCE_CUDA
#include <glm/glm.hpp>
namespace FORWARD
{
// Perform initial steps for each Gaussian prior to rasterization.
void preprocess(int P, int D, int M,
const float* orig_points,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const int W, int H,
const float tan_fovx, float tan_fovy,
const float focal_x, float focal_y,
int* radii,
float2* points_xy_image,
float* depths,
float* cov3Ds,
float* colors,
float4* conic_opacity,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered);
// Main rasterization method.
void render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H,
const float2* points_xy_image,
const float* features,
const float4* conic_opacity,
float* final_T,
uint32_t* n_contrib,
const float* bg_color,
float* out_color);
}
#endif
\ No newline at end of file
#ifndef CUDA_RASTERIZER_H_INCLUDED
#define CUDA_RASTERIZER_H_INCLUDED
#include <vector>
namespace CudaRasterizer
{
class Rasterizer
{
public:
virtual void markVisible(
int P,
float* means3D,
float* viewmatrix,
float* projmatrix,
bool* present) = 0;
virtual void forward(
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* shs,
const float* colors_precomp,
const float* opacities,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* cam_pos,
const float tan_fovx, float tan_fovy,
const bool prefiltered,
float* out_color,
int* radii = nullptr) = 0;
virtual void backward(
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* shs,
const float* colors_precomp,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* campos,
const float tan_fovx, float tan_fovy,
const int* radii,
const float* dL_dpix,
float* dL_dmean2D,
float* dL_dconic,
float* dL_dopacity,
float* dL_dcolor,
float* dL_dmean3D,
float* dL_dcov3D,
float* dL_dsh,
float* dL_dscale,
float* dL_drot) = 0;
virtual ~Rasterizer() {};
static Rasterizer* make(int resizeMultipliyer = 2);
};
};
#endif
\ No newline at end of file
#include "rasterizer_impl.h"
#include <iostream>
#include <fstream>
#include <algorithm>
#include <numeric>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cub/cub.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <thrust/sequence.h>
#define GLM_FORCE_CUDA
#include <glm/glm.hpp>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
#include "auxiliary.h"
#include "forward.h"
#include "backward.h"
// Helper function to find the next-highest bit of the MSB
// on the CPU.
uint32_t getHigherMsb(uint32_t n)
{
uint32_t msb = sizeof(n) * 4;
uint32_t step = msb;
while (step > 1)
{
step /= 2;
if (n >> msb)
msb += step;
else
msb -= step;
}
if (n >> msb)
msb++;
return msb;
}
// Wrapper method to call auxiliary coarse frustum containment test.
// Mark all Gaussians that pass it.
__global__ void checkFrustum(int P,
const float* orig_points,
const float* viewmatrix,
const float* projmatrix,
bool* present)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
float3 p_view;
present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view);
}
// Generates one key/value pair for all Gaussian / tile overlaps.
// Run once per Gaussian (1:N mapping).
__global__ void duplicateWithKeys(
int P,
const float2* points_xy,
const float* depths,
const uint32_t* offsets,
uint64_t* gaussian_keys_unsorted,
uint32_t* gaussian_values_unsorted,
int* radii,
dim3 grid)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
// Generate no key/value pair for invisible Gaussians
if (radii[idx] > 0)
{
// Find this Gaussian's offset in buffer for writing keys/values.
uint32_t off = (idx == 0) ? 0 : offsets[idx - 1];
uint2 rect_min, rect_max;
getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid);
// For each tile that the bounding rect overlaps, emit a
// key/value pair. The key is | tile ID | depth |,
// and the value is the ID of the Gaussian. Sorting the values
// with this key yields Gaussian IDs in a list, such that they
// are first sorted by tile and then by depth.
for (int y = rect_min.y; y < rect_max.y; y++)
{
for (int x = rect_min.x; x < rect_max.x; x++)
{
uint64_t key = y * grid.x + x;
key <<= 32;
key |= *((uint32_t*)&depths[idx]);
gaussian_keys_unsorted[off] = key;
gaussian_values_unsorted[off] = idx;
off++;
}
}
}
}
// Check keys to see if it is at the start/end of one tile's range in
// the full sorted list. If yes, write start/end of this tile.
// Run once per instanced (duplicated) Gaussian ID.
__global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= L)
return;
// Read tile ID from key. Update start/end of tile range if at limit.
uint64_t key = point_list_keys[idx];
uint32_t currtile = key >> 32;
if (idx == 0)
ranges[currtile].x = 0;
else
{
uint32_t prevtile = point_list_keys[idx - 1] >> 32;
if (currtile != prevtile)
{
ranges[prevtile].y = idx;
ranges[currtile].x = idx;
}
if (idx == L - 1)
ranges[currtile].y = L;
}
}
CudaRasterizer::RasterizerImpl::RasterizerImpl(int resizeMultiplier)
: resizeMultiplier(resizeMultiplier)
{}
// Instantiate rasterizer
CudaRasterizer::Rasterizer* CudaRasterizer::Rasterizer::make(int resizeMultiplier)
{
printf("Version 17\n");
return new CudaRasterizer::RasterizerImpl(resizeMultiplier);
}
// Mark Gaussians as visible/invisible, based on view frustum testing
void CudaRasterizer::RasterizerImpl::markVisible(
int P,
float* means3D,
float* viewmatrix,
float* projmatrix,
bool* present)
{
checkFrustum << <(P + 255) / 256, 256 >> > (
P,
means3D,
viewmatrix, projmatrix,
present);
}
// Forward rendering procedure for differentiable rasterization
// of Gaussians.
void CudaRasterizer::RasterizerImpl::forward(
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* shs,
const float* colors_precomp,
const float* opacities,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* cam_pos,
const float tan_fovx, float tan_fovy,
const bool prefiltered,
float* out_color,
int* radii)
{
const float focal_y = height / (2.0f * tan_fovy);
const float focal_x = width / (2.0f * tan_fovx);
// Dynamically resize auxiliary buffers during training
if (P > maxP)
{
maxP = resizeMultiplier * P;
cov3D.resize(maxP * 6);
rgb.resize(maxP * 3);
tiles_touched.resize(maxP);
point_offsets.resize(maxP);
clamped.resize(3 * maxP);
depths.resize(maxP);
means2D.resize(maxP);
conic_opacity.resize(maxP);
cub::DeviceScan::InclusiveSum(nullptr, scan_size, tiles_touched.data().get(), tiles_touched.data().get(), maxP);
scanning_space.resize(scan_size);
}
if (radii == nullptr)
{
internal_radii.resize(maxP);
radii = internal_radii.data().get();
}
dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
dim3 block(BLOCK_X, BLOCK_Y, 1);
// Dynamically resize image-based auxiliary buffers during training
if (width * height > maxPixels)
{
maxPixels = width * height;
accum_alpha.resize(maxPixels);
n_contrib.resize(maxPixels);
ranges.resize(tile_grid.x * tile_grid.y);
}
if (NUM_CHANNELS != 3 && colors_precomp == nullptr)
{
throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!");
}
// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
FORWARD::preprocess(
P, D, M,
means3D,
(glm::vec3*)scales,
scale_modifier,
(glm::vec4*)rotations,
opacities,
shs,
clamped.data().get(),
cov3D_precomp,
colors_precomp,
viewmatrix, projmatrix,
(glm::vec3*)cam_pos,
width, height,
tan_fovx, tan_fovy,
focal_x, focal_y,
radii,
means2D.data().get(),
depths.data().get(),
cov3D.data().get(),
rgb.data().get(),
conic_opacity.data().get(),
tile_grid,
tiles_touched.data().get(),
prefiltered
);
// Compute prefix sum over full list of touched tile counts by Gaussians
// E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8]
cub::DeviceScan::InclusiveSum(scanning_space.data().get(), scan_size,
tiles_touched.data().get(), point_offsets.data().get(), P);
// Retrieve total number of Gaussian instances to launch and resize aux buffers
int num_needed;
cudaMemcpy(&num_needed, point_offsets.data().get() + P - 1, sizeof(int), cudaMemcpyDeviceToHost);
if (num_needed > point_list_keys_unsorted.size())
{
point_list_keys_unsorted.resize(2 * num_needed);
point_list_keys.resize(2 * num_needed);
point_list_unsorted.resize(2 * num_needed);
point_list.resize(2 * num_needed);
cub::DeviceRadixSort::SortPairs(
nullptr, sorting_size,
point_list_keys_unsorted.data().get(), point_list_keys.data().get(),
point_list_unsorted.data().get(), point_list.data().get(),
2 * num_needed);
list_sorting_space.resize(sorting_size);
}
// For each instance to be rendered, produce adequate [ tile | depth ] key
// and corresponding dublicated Gaussian indices to be sorted
duplicateWithKeys << <(P + 255) / 256, 256 >> > (
P,
means2D.data().get(),
depths.data().get(),
point_offsets.data().get(),
point_list_keys_unsorted.data().get(),
point_list_unsorted.data().get(),
radii,
tile_grid
);
int bit = getHigherMsb(tile_grid.x * tile_grid.y);
// Sort complete list of (duplicated) Gaussian indices by keys
cub::DeviceRadixSort::SortPairs(
list_sorting_space.data().get(),
sorting_size,
point_list_keys_unsorted.data().get(), point_list_keys.data().get(),
point_list_unsorted.data().get(), point_list.data().get(),
num_needed, 0, 32 + bit);
cudaMemset(ranges.data().get(), 0, tile_grid.x * tile_grid.y * sizeof(uint2));
// Identify start and end of per-tile workloads in sorted list
identifyTileRanges << <(num_needed + 255) / 256, 256 >> > (
num_needed,
point_list_keys.data().get(),
ranges.data().get()
);
// Let each tile blend its range of Gaussians independently in parallel
const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : rgb.data().get();
FORWARD::render(
tile_grid, block,
ranges.data().get(),
point_list.data().get(),
width, height,
means2D.data().get(),
feature_ptr,
conic_opacity.data().get(),
accum_alpha.data().get(),
n_contrib.data().get(),
background,
out_color);
}
// Produce necessary gradients for optimization, corresponding
// to forward render pass
void CudaRasterizer::RasterizerImpl::backward(
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* shs,
const float* colors_precomp,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* campos,
const float tan_fovx, float tan_fovy,
const int* radii,
const float* dL_dpix,
float* dL_dmean2D,
float* dL_dconic,
float* dL_dopacity,
float* dL_dcolor,
float* dL_dmean3D,
float* dL_dcov3D,
float* dL_dsh,
float* dL_dscale,
float* dL_drot)
{
if (radii == nullptr)
{
radii = internal_radii.data().get();
}
const float focal_y = height / (2.0f * tan_fovy);
const float focal_x = width / (2.0f * tan_fovx);
const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
const dim3 block(BLOCK_X, BLOCK_Y, 1);
// Compute loss gradients w.r.t. 2D mean position, conic matrix,
// opacity and RGB of Gaussians from per-pixel loss gradients.
// If we were given precomputed colors and not SHs, use them.
const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : rgb.data().get();
BACKWARD::render(
tile_grid,
block,
ranges.data().get(),
point_list.data().get(),
width, height,
background,
means2D.data().get(),
conic_opacity.data().get(),
color_ptr,
accum_alpha.data().get(),
n_contrib.data().get(),
dL_dpix,
(float3*)dL_dmean2D,
(float4*)dL_dconic,
dL_dopacity,
dL_dcolor);
// Take care of the rest of preprocessing. Was the precomputed covariance
// given to us or a scales/rot pair? If precomputed, pass that. If not,
// use the one we computed ourselves.
const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : cov3D.data().get();
BACKWARD::preprocess(P, D, M,
(float3*)means3D,
radii,
shs,
clamped.data().get(),
(glm::vec3*)scales,
(glm::vec4*)rotations,
scale_modifier,
cov3D_ptr,
viewmatrix,
projmatrix,
focal_x, focal_y,
(glm::vec3*)campos,
(float3*)dL_dmean2D,
dL_dconic,
(glm::vec3*)dL_dmean3D,
dL_dcolor,
dL_dcov3D,
dL_dsh,
(glm::vec3*)dL_dscale,
(glm::vec4*)dL_drot);
}
CudaRasterizer::RasterizerImpl::~RasterizerImpl()
{
}
\ No newline at end of file
#pragma once
#include <iostream>
#include <vector>
#include "rasterizer.h"
#include <cuda_runtime_api.h>
#include <thrust/device_vector.h>
namespace CudaRasterizer
{
class RasterizerImpl : public Rasterizer
{
private:
int maxP = 0;
int maxPixels = 0;
int resizeMultiplier = 2;
// Initial aux structs
size_t sorting_size;
size_t list_sorting_size;
size_t scan_size;
thrust::device_vector<float> depths;
thrust::device_vector<uint32_t> tiles_touched;
thrust::device_vector<uint32_t> point_offsets;
thrust::device_vector<uint64_t> point_list_keys_unsorted;
thrust::device_vector<uint64_t> point_list_keys;
thrust::device_vector<uint32_t> point_list_unsorted;
thrust::device_vector<uint32_t> point_list;
thrust::device_vector<char> scanning_space;
thrust::device_vector<char> list_sorting_space;
thrust::device_vector<bool> clamped;
thrust::device_vector<int> internal_radii;
// Internal state kept across forward / backward
thrust::device_vector<uint2> ranges;
thrust::device_vector<uint32_t> n_contrib;
thrust::device_vector<float> accum_alpha;
thrust::device_vector<float2> means2D;
thrust::device_vector<float> cov3D;
thrust::device_vector<float4> conic_opacity;
thrust::device_vector<float> rgb;
public:
virtual void markVisible(
int P,
float* means3D,
float* viewmatrix,
float* projmatrix,
bool* present) override;
virtual void forward(
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* shs,
const float* colors_precomp,
const float* opacities,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* cam_pos,
const float tan_fovx, float tan_fovy,
const bool prefiltered,
float* out_color,
int* radii) override;
virtual void backward(
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* shs,
const float* colors_precomp,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* campos,
const float tan_fovx, float tan_fovy,
const int* radii,
const float* dL_dpix,
float* dL_dmean2D,
float* dL_dconic,
float* dL_dopacity,
float* dL_dcolor,
float* dL_dmean3D,
float* dL_dcov3D,
float* dL_dsh,
float* dL_dscale,
float* dL_drot) override;
RasterizerImpl(int resizeMultiplier);
virtual ~RasterizerImpl() override;
};
};
\ No newline at end of file
#include <torch/extension.h>
#include "rasterize_points.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("rasterize_gaussians", &RasterizeGaussiansCUDA);
m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA);
m.def("mark_visible", &markVisible);
}
\ No newline at end of file
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <math.h>
#include <torch/extension.h>
#include <cstdio>
#include <sstream>
#include <iostream>
#include <tuple>
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <memory>
#include "cuda_rasterizer/config.h"
#include "cuda_rasterizer/rasterizer.h"
#include <fstream>
#include <string>
static std::unique_ptr<CudaRasterizer::Rasterizer> cudaRenderer = nullptr;
std::tuple<torch::Tensor, torch::Tensor>
RasterizeGaussiansCUDA(
const torch::Tensor& background,
const torch::Tensor& means3D,
const torch::Tensor& colors,
const torch::Tensor& opacity,
const torch::Tensor& scales,
const torch::Tensor& rotations,
const float scale_modifier,
const torch::Tensor& cov3D_precomp,
const torch::Tensor& viewmatrix,
const torch::Tensor& projmatrix,
const float tan_fovx,
const float tan_fovy,
const int image_height,
const int image_width,
const torch::Tensor& sh,
const int degree,
const torch::Tensor& campos,
const bool prefiltered)
{
if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
AT_ERROR("means3D must have dimensions (num_points, 3)");
}
if (cudaRenderer == nullptr)
{
cudaRenderer = std::unique_ptr<CudaRasterizer::Rasterizer>(CudaRasterizer::Rasterizer::make());
}
const int P = means3D.size(0);
const int N = 1; // batch size hard-coded
const int H = image_height;
const int W = image_width;
auto int_opts = means3D.options().dtype(torch::kInt32);
auto float_opts = means3D.options().dtype(torch::kFloat32);
torch::Tensor out_color = torch::full({N, NUM_CHANNELS, H, W}, 0.0, float_opts);
torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32));
if(P != 0)
{
int M = 0;
if(sh.size(0) != 0)
{
M = sh.size(1);
}
cudaRenderer->forward(P, degree, M,
background.contiguous().data<float>(),
W, H,
means3D.contiguous().data<float>(),
sh.contiguous().data_ptr<float>(),
colors.contiguous().data<float>(),
opacity.contiguous().data<float>(),
scales.contiguous().data_ptr<float>(),
scale_modifier,
rotations.contiguous().data_ptr<float>(),
cov3D_precomp.contiguous().data<float>(),
viewmatrix.contiguous().data<float>(),
projmatrix.contiguous().data<float>(),
campos.contiguous().data<float>(),
tan_fovx,
tan_fovy,
prefiltered,
out_color.contiguous().data<float>(),
radii.contiguous().data<int>());
}
return std::make_tuple(out_color, radii);
}
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
RasterizeGaussiansBackwardCUDA(
const torch::Tensor& background,
const torch::Tensor& means3D,
const torch::Tensor& radii,
const torch::Tensor& colors,
const torch::Tensor& scales,
const torch::Tensor& rotations,
const float scale_modifier,
const torch::Tensor& cov3D_precomp,
const torch::Tensor& viewmatrix,
const torch::Tensor& projmatrix,
const float tan_fovx,
const float tan_fovy,
const torch::Tensor& dL_dout_color,
const torch::Tensor& sh,
const int degree,
const torch::Tensor& campos)
{
const int P = means3D.size(0);
const int H = dL_dout_color.size(2);
const int W = dL_dout_color.size(3);
int M = 0;
if(sh.size(0) != 0)
{
M = sh.size(1);
}
torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options());
torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options());
torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options());
torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options());
torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options());
torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options());
torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options());
torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options());
torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options());
if(P != 0)
{
cudaRenderer->backward(P, degree, M,
background.contiguous().data<float>(),
W, H,
means3D.contiguous().data<float>(),
sh.contiguous().data<float>(),
colors.contiguous().data<float>(),
scales.data_ptr<float>(),
scale_modifier,
rotations.data_ptr<float>(),
cov3D_precomp.contiguous().data<float>(),
viewmatrix.contiguous().data<float>(),
projmatrix.contiguous().data<float>(),
campos.contiguous().data<float>(),
tan_fovx,
tan_fovy,
radii.contiguous().data<int>(),
dL_dout_color.contiguous().data<float>(),
dL_dmeans2D.contiguous().data<float>(),
dL_dconic.contiguous().data<float>(),
dL_dopacity.contiguous().data<float>(),
dL_dcolors.contiguous().data<float>(),
dL_dmeans3D.contiguous().data<float>(),
dL_dcov3D.contiguous().data<float>(),
dL_dsh.contiguous().data<float>(),
dL_dscales.contiguous().data<float>(),
dL_drotations.contiguous().data<float>());
}
return std::make_tuple(dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh, dL_dscales, dL_drotations);
}
torch::Tensor markVisible(
torch::Tensor& means3D,
torch::Tensor& viewmatrix,
torch::Tensor& projmatrix)
{
if (cudaRenderer == nullptr)
{
cudaRenderer = std::unique_ptr<CudaRasterizer::Rasterizer>(CudaRasterizer::Rasterizer::make());
}
const int P = means3D.size(0);
torch::Tensor present = torch::full({P}, false, means3D.options().dtype(at::kBool));
if(P != 0)
{
cudaRenderer->markVisible(P,
means3D.contiguous().data<float>(),
viewmatrix.contiguous().data<float>(),
projmatrix.contiguous().data<float>(),
present.contiguous().data<bool>());
}
return present;
}
\ No newline at end of file
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
#include <cstdio>
#include <tuple>
#include <string>
std::tuple<torch::Tensor, torch::Tensor>
RasterizeGaussiansCUDA(
const torch::Tensor& background,
const torch::Tensor& means3D,
const torch::Tensor& colors,
const torch::Tensor& opacity,
const torch::Tensor& scales,
const torch::Tensor& rotations,
const float scale_modifier,
const torch::Tensor& cov3D_precomp,
const torch::Tensor& viewmatrix,
const torch::Tensor& projmatrix,
const float tan_fovx,
const float tan_fovy,
const int image_height,
const int image_width,
const torch::Tensor& sh,
const int degree,
const torch::Tensor& campos,
const bool prefiltered);
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
RasterizeGaussiansBackwardCUDA(
const torch::Tensor& background,
const torch::Tensor& means3D,
const torch::Tensor& radii,
const torch::Tensor& colors,
const torch::Tensor& scales,
const torch::Tensor& rotations,
const float scale_modifier,
const torch::Tensor& cov3D_precomp,
const torch::Tensor& viewmatrix,
const torch::Tensor& projmatrix,
const float tan_fovx,
const float tan_fovy,
const torch::Tensor& dL_dout_color,
const torch::Tensor& sh,
const int degree,
const torch::Tensor& campos);
torch::Tensor markVisible(
torch::Tensor& means3D,
torch::Tensor& viewmatrix,
torch::Tensor& projmatrix);
\ No newline at end of file
from typing import NamedTuple
import torch.nn as nn
import torch
import utils.profiling_utils as profiling_utils
from diff_rasterizationCUDA import _C
def rasterize_gaussians(
means3D,
means2D,
sh,
colors_precomp,
opacities,
scales,
rotations,
cov3Ds_precomp,
raster_settings,
):
return _RasterizeGaussians.apply(
means3D,
means2D,
sh,
colors_precomp,
opacities,
scales,
rotations,
cov3Ds_precomp,
raster_settings,
)
class _RasterizeGaussians(torch.autograd.Function):
@staticmethod
def forward(
ctx,
means3D,
means2D,
sh,
colors_precomp,
opacities,
scales,
rotations,
cov3Ds_precomp,
raster_settings,
):
# Restructure arguments the way that the C++ lib expects them
args = (
raster_settings.bg,
means3D,
colors_precomp,
opacities,
scales,
rotations,
raster_settings.scale_modifier,
cov3Ds_precomp,
raster_settings.viewmatrix,
raster_settings.projmatrix,
raster_settings.tanfovx,
raster_settings.tanfovy,
raster_settings.image_height,
raster_settings.image_width,
sh,
raster_settings.sh_degree,
raster_settings.campos,
raster_settings.prefiltered,
)
# Invoke C++/CUDA rasterizer
color, radii = _C.rasterize_gaussians(*args)
# Keep relevant tensors for backward
ctx.raster_settings = raster_settings
ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh)
return color, radii
@staticmethod
def backward(ctx, grad_out_color, _):
# Restore necessary values from context
raster_settings = ctx.raster_settings
colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh = ctx.saved_tensors
# Restructure args as C++ method expects them
args = (raster_settings.bg,
means3D,
radii,
colors_precomp,
scales,
rotations,
raster_settings.scale_modifier,
cov3Ds_precomp,
raster_settings.viewmatrix,
raster_settings.projmatrix,
raster_settings.tanfovx,
raster_settings.tanfovy,
grad_out_color,
sh,
raster_settings.sh_degree,
raster_settings.campos)
back_rng = profiling_utils.start("rasterize", "yellow", "render" )
# Compute gradients for relevant tensors by invoking backward method
grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args)
profiling_utils.stop(back_rng)
grads = (
grad_means3D,
grad_means2D,
grad_sh,
grad_colors_precomp,
grad_opacities,
grad_scales,
grad_rotations,
grad_cov3Ds_precomp,
None,
)
return grads
class GaussianRasterizationSettings(NamedTuple):
image_height: int
image_width: int
tanfovx : float
tanfovy : float
bg : torch.Tensor
scale_modifier : float
viewmatrix : torch.Tensor
projmatrix : torch.Tensor
sh_degree : int
campos : torch.Tensor
prefiltered : bool
class GaussianRasterizer(nn.Module):
def __init__(self, raster_settings):
super().__init__()
self.raster_settings = raster_settings
def markVisible(self, positions):
# Mark visible points (based on frustum culling for camera) with a boolean
with torch.no_grad():
raster_settings = self.raster_settings
visible = _C.mark_visible(
positions,
raster_settings.viewmatrix,
raster_settings.projmatrix)
return visible
def forward(self, means3D, means2D, opacities, shs = None, colors_precomp = None, scales = None, rotations = None, cov3D_precomp = None):
raster_settings = self.raster_settings
if (shs is None and colors_precomp is None) or (shs is not None and colors_precomp is not None):
raise Exception('Please provide excatly one of either SHs or precomputed colors!')
if ((scales is None or rotations is None) and cov3D_precomp is None) or ((scales is not None or rotations is not None) and cov3D_precomp is not None):
raise Exception('Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!')
if shs is None:
shs = torch.Tensor([])
if colors_precomp is None:
colors_precomp = torch.Tensor([])
if scales is None:
scales = torch.Tensor([])
if rotations is None:
rotations = torch.Tensor([])
if cov3D_precomp is None:
cov3D_precomp = torch.Tensor([])
# Invoke C++/CUDA rasterization routine
return rasterize_gaussians(
means3D,
means2D,
shs,
colors_precomp,
opacities,
scales,
rotations,
cov3D_precomp,
raster_settings,
)
from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension, BuildExtension
import os
os.path.dirname(os.path.abspath(__file__))
setup(
name="diff_rasterizationCUDA",
ext_modules=[
CUDAExtension(
name="diff_rasterizationCUDA._C",
sources=[
"rasterize_points.cu",
"cuda_rasterizer/rasterizer_impl.cu",
"cuda_rasterizer/forward.cu",
"cuda_rasterizer/backward.cu",
"ext.cpp"],
extra_compile_args={"nvcc": ["-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), "third_party/glm/")],
"cxx": ["/wd4624"]})
],
cmdclass={
'build_ext': BuildExtension
}
)
/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
no warranty implied; use at your own risk
Before #including,
#define STB_IMAGE_WRITE_IMPLEMENTATION
in the file that you want to have the implementation.
Will probably not work correctly with strict-aliasing optimizations.
ABOUT:
This header file is a library for writing images to C stdio or a callback.
The PNG output is not optimal; it is 20-50% larger than the file
written by a decent optimizing implementation; though providing a custom
zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
This library is designed for source code compactness and simplicity,
not optimal image file size or run-time performance.
BUILDING:
You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
malloc,realloc,free.
You can #define STBIW_MEMMOVE() to replace memmove()
You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
for PNG compression (instead of the builtin one), it must have the following signature:
unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
The returned data will be freed with STBIW_FREE() (free() by default),
so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
UNICODE:
If compiling for Windows and you wish to use Unicode filenames, compile
with
#define STBIW_WINDOWS_UTF8
and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
Windows wchar_t filenames to utf8.
USAGE:
There are five functions, one for each image file format:
int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
There are also five equivalent functions that use an arbitrary write function. You are
expected to open/close your file-equivalent before and after calling these:
int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
where the callback is:
void stbi_write_func(void *context, void *data, int size);
You can configure it with these global variables:
int stbi_write_tga_with_rle; // defaults to true; set to 0 to disable RLE
int stbi_write_png_compression_level; // defaults to 8; set to higher for more compression
int stbi_write_force_png_filter; // defaults to -1; set to 0..5 to force a filter mode
You can define STBI_WRITE_NO_STDIO to disable the file variant of these
functions, so the library will not use stdio.h at all. However, this will
also disable HDR writing, because it requires stdio for formatted output.
Each function returns 0 on failure and non-0 on success.
The functions create an image file defined by the parameters. The image
is a rectangle of pixels stored from left-to-right, top-to-bottom.
Each pixel contains 'comp' channels of data stored interleaved with 8-bits
per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
The *data pointer points to the first byte of the top-left-most pixel.
For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
a row of pixels to the first byte of the next row of pixels.
PNG creates output files with the same number of components as the input.
The BMP format expands Y to RGB in the file format and does not
output alpha.
PNG supports writing rectangles of data even when the bytes storing rows of
data are not consecutive in memory (e.g. sub-rectangles of a larger image),
by supplying the stride between the beginning of adjacent rows. The other
formats do not. (Thus you cannot write a native-format BMP through the BMP
writer, both because it is in BGR order and because it may have padding
at the end of the line.)
PNG allows you to set the deflate compression level by setting the global
variable 'stbi_write_png_compression_level' (it defaults to 8).
HDR expects linear float data. Since the format is always 32-bit rgb(e)
data, alpha (if provided) is discarded, and for monochrome data it is
replicated across all three channels.
TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
data, set the global variable 'stbi_write_tga_with_rle' to 0.
JPEG does ignore alpha channels in input data; quality is between 1 and 100.
Higher quality looks better but results in a bigger image.
JPEG baseline (no JPEG progressive).
CREDITS:
Sean Barrett - PNG/BMP/TGA
Baldur Karlsson - HDR
Jean-Sebastien Guay - TGA monochrome
Tim Kelsey - misc enhancements
Alan Hickman - TGA RLE
Emmanuel Julien - initial file IO callback implementation
Jon Olick - original jo_jpeg.cpp code
Daniel Gibson - integrate JPEG, allow external zlib
Aarni Koskela - allow choosing PNG filter
bugfixes:
github:Chribba
Guillaume Chereau
github:jry2
github:romigrou
Sergio Gonzalez
Jonas Karlsson
Filip Wasil
Thatcher Ulrich
github:poppolopoppo
Patrick Boettcher
github:xeekworx
Cap Petschulat
Simon Rodriguez
Ivan Tikhonov
github:ignotion
Adam Schackart
Andrew Kensler
LICENSE
See end of file for license information.
*/
#ifndef INCLUDE_STB_IMAGE_WRITE_H
#define INCLUDE_STB_IMAGE_WRITE_H
#include <stdlib.h>
// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
#ifndef STBIWDEF
#ifdef STB_IMAGE_WRITE_STATIC
#define STBIWDEF static
#else
#ifdef __cplusplus
#define STBIWDEF extern "C"
#else
#define STBIWDEF extern
#endif
#endif
#endif
#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
STBIWDEF int stbi_write_tga_with_rle;
STBIWDEF int stbi_write_png_compression_level;
STBIWDEF int stbi_write_force_png_filter;
#endif
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
#ifdef STBIW_WINDOWS_UTF8
STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
#endif
#endif
typedef void stbi_write_func(void *context, void *data, int size);
STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
#endif//INCLUDE_STB_IMAGE_WRITE_H
#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
#ifdef _WIN32
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#ifndef _CRT_NONSTDC_NO_DEPRECATE
#define _CRT_NONSTDC_NO_DEPRECATE
#endif
#endif
#ifndef STBI_WRITE_NO_STDIO
#include <stdio.h>
#endif // STBI_WRITE_NO_STDIO
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
// ok
#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
// ok
#else
#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
#endif
#ifndef STBIW_MALLOC
#define STBIW_MALLOC(sz) malloc(sz)
#define STBIW_REALLOC(p,newsz) realloc(p,newsz)
#define STBIW_FREE(p) free(p)
#endif
#ifndef STBIW_REALLOC_SIZED
#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
#endif
#ifndef STBIW_MEMMOVE
#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
#endif
#ifndef STBIW_ASSERT
#include <assert.h>
#define STBIW_ASSERT(x) assert(x)
#endif
#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
#ifdef STB_IMAGE_WRITE_STATIC
static int stbi_write_png_compression_level = 8;
static int stbi_write_tga_with_rle = 1;
static int stbi_write_force_png_filter = -1;
#else
int stbi_write_png_compression_level = 8;
int stbi_write_tga_with_rle = 1;
int stbi_write_force_png_filter = -1;
#endif
static int stbi__flip_vertically_on_write = 0;
STBIWDEF void stbi_flip_vertically_on_write(int flag)
{
stbi__flip_vertically_on_write = flag;
}
typedef struct
{
stbi_write_func *func;
void *context;
unsigned char buffer[64];
int buf_used;
} stbi__write_context;
// initialize a callback-based context
static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
{
s->func = c;
s->context = context;
}
#ifndef STBI_WRITE_NO_STDIO
static void stbi__stdio_write(void *context, void *data, int size)
{
fwrite(data,1,size,(FILE*) context);
}
#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
#ifdef __cplusplus
#define STBIW_EXTERN extern "C"
#else
#define STBIW_EXTERN extern
#endif
STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
{
return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
}
#endif
static FILE *stbiw__fopen(char const *filename, char const *mode)
{
FILE *f;
#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
wchar_t wMode[64];
wchar_t wFilename[1024];
if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
return 0;
if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
return 0;
#if defined(_MSC_VER) && _MSC_VER >= 1400
if (0 != _wfopen_s(&f, wFilename, wMode))
f = 0;
#else
f = _wfopen(wFilename, wMode);
#endif
#elif defined(_MSC_VER) && _MSC_VER >= 1400
if (0 != fopen_s(&f, filename, mode))
f=0;
#else
f = fopen(filename, mode);
#endif
return f;
}
static int stbi__start_write_file(stbi__write_context *s, const char *filename)
{
FILE *f = stbiw__fopen(filename, "wb");
stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
return f != NULL;
}
static void stbi__end_write_file(stbi__write_context *s)
{
fclose((FILE *)s->context);
}
#endif // !STBI_WRITE_NO_STDIO
typedef unsigned int stbiw_uint32;
typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
{
while (*fmt) {
switch (*fmt++) {
case ' ': break;
case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
s->func(s->context,&x,1);
break; }
case '2': { int x = va_arg(v,int);
unsigned char b[2];
b[0] = STBIW_UCHAR(x);
b[1] = STBIW_UCHAR(x>>8);
s->func(s->context,b,2);
break; }
case '4': { stbiw_uint32 x = va_arg(v,int);
unsigned char b[4];
b[0]=STBIW_UCHAR(x);
b[1]=STBIW_UCHAR(x>>8);
b[2]=STBIW_UCHAR(x>>16);
b[3]=STBIW_UCHAR(x>>24);
s->func(s->context,b,4);
break; }
default:
STBIW_ASSERT(0);
return;
}
}
}
static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
{
va_list v;
va_start(v, fmt);
stbiw__writefv(s, fmt, v);
va_end(v);
}
static void stbiw__write_flush(stbi__write_context *s)
{
if (s->buf_used) {
s->func(s->context, &s->buffer, s->buf_used);
s->buf_used = 0;
}
}
static void stbiw__putc(stbi__write_context *s, unsigned char c)
{
s->func(s->context, &c, 1);
}
static void stbiw__write1(stbi__write_context *s, unsigned char a)
{
if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
stbiw__write_flush(s);
s->buffer[s->buf_used++] = a;
}
static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
{
int n;
if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
stbiw__write_flush(s);
n = s->buf_used;
s->buf_used = n+3;
s->buffer[n+0] = a;
s->buffer[n+1] = b;
s->buffer[n+2] = c;
}
static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
{
unsigned char bg[3] = { 255, 0, 255}, px[3];
int k;
if (write_alpha < 0)
stbiw__write1(s, d[comp - 1]);
switch (comp) {
case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
case 1:
if (expand_mono)
stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
else
stbiw__write1(s, d[0]); // monochrome TGA
break;
case 4:
if (!write_alpha) {
// composite against pink background
for (k = 0; k < 3; ++k)
px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
break;
}
/* FALLTHROUGH */
case 3:
stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
break;
}
if (write_alpha > 0)
stbiw__write1(s, d[comp - 1]);
}
static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
{
stbiw_uint32 zero = 0;
int i,j, j_end;
if (y <= 0)
return;
if (stbi__flip_vertically_on_write)
vdir *= -1;
if (vdir < 0) {
j_end = -1; j = y-1;
} else {
j_end = y; j = 0;
}
for (; j != j_end; j += vdir) {
for (i=0; i < x; ++i) {
unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
}
stbiw__write_flush(s);
s->func(s->context, &zero, scanline_pad);
}
}
static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
{
if (y < 0 || x < 0) {
return 0;
} else {
va_list v;
va_start(v, fmt);
stbiw__writefv(s, fmt, v);
va_end(v);
stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
return 1;
}
}
static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
{
if (comp != 4) {
// write RGB bitmap
int pad = (-x*3) & 3;
return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
"11 4 22 4" "4 44 22 444444",
'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header
40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header
} else {
// RGBA bitmaps need a v4 header
// use BI_BITFIELDS mode with 32bpp and alpha mask
// (straight BI_RGB with alpha mask doesn't work in most readers)
return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
"11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
}
}
STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
{
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_bmp_core(&s, x, y, comp, data);
}
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
{
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_bmp_core(&s, x, y, comp, data);
stbi__end_write_file(&s);
return r;
} else
return 0;
}
#endif //!STBI_WRITE_NO_STDIO
static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
{
int has_alpha = (comp == 2 || comp == 4);
int colorbytes = has_alpha ? comp-1 : comp;
int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
if (y < 0 || x < 0)
return 0;
if (!stbi_write_tga_with_rle) {
return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
"111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
} else {
int i,j,k;
int jend, jdir;
stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
if (stbi__flip_vertically_on_write) {
j = 0;
jend = y;
jdir = 1;
} else {
j = y-1;
jend = -1;
jdir = -1;
}
for (; j != jend; j += jdir) {
unsigned char *row = (unsigned char *) data + j * x * comp;
int len;
for (i = 0; i < x; i += len) {
unsigned char *begin = row + i * comp;
int diff = 1;
len = 1;
if (i < x - 1) {
++len;
diff = memcmp(begin, row + (i + 1) * comp, comp);
if (diff) {
const unsigned char *prev = begin;
for (k = i + 2; k < x && len < 128; ++k) {
if (memcmp(prev, row + k * comp, comp)) {
prev += comp;
++len;
} else {
--len;
break;
}
}
} else {
for (k = i + 2; k < x && len < 128; ++k) {
if (!memcmp(begin, row + k * comp, comp)) {
++len;
} else {
break;
}
}
}
}
if (diff) {
unsigned char header = STBIW_UCHAR(len - 1);
stbiw__write1(s, header);
for (k = 0; k < len; ++k) {
stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
}
} else {
unsigned char header = STBIW_UCHAR(len - 129);
stbiw__write1(s, header);
stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
}
}
}
stbiw__write_flush(s);
}
return 1;
}
STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
{
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_tga_core(&s, x, y, comp, (void *) data);
}
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
{
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
stbi__end_write_file(&s);
return r;
} else
return 0;
}
#endif
// *************************************************************************************************
// Radiance RGBE HDR writer
// by Baldur Karlsson
#define stbiw__max(a, b) ((a) > (b) ? (a) : (b))
#ifndef STBI_WRITE_NO_STDIO
static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
{
int exponent;
float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
if (maxcomp < 1e-32f) {
rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
} else {
float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
rgbe[0] = (unsigned char)(linear[0] * normalize);
rgbe[1] = (unsigned char)(linear[1] * normalize);
rgbe[2] = (unsigned char)(linear[2] * normalize);
rgbe[3] = (unsigned char)(exponent + 128);
}
}
static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
{
unsigned char lengthbyte = STBIW_UCHAR(length+128);
STBIW_ASSERT(length+128 <= 255);
s->func(s->context, &lengthbyte, 1);
s->func(s->context, &databyte, 1);
}
static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
{
unsigned char lengthbyte = STBIW_UCHAR(length);
STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
s->func(s->context, &lengthbyte, 1);
s->func(s->context, data, length);
}
static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
{
unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
unsigned char rgbe[4];
float linear[3];
int x;
scanlineheader[2] = (width&0xff00)>>8;
scanlineheader[3] = (width&0x00ff);
/* skip RLE for images too small or large */
if (width < 8 || width >= 32768) {
for (x=0; x < width; x++) {
switch (ncomp) {
case 4: /* fallthrough */
case 3: linear[2] = scanline[x*ncomp + 2];
linear[1] = scanline[x*ncomp + 1];
linear[0] = scanline[x*ncomp + 0];
break;
default:
linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
break;
}
stbiw__linear_to_rgbe(rgbe, linear);
s->func(s->context, rgbe, 4);
}
} else {
int c,r;
/* encode into scratch buffer */
for (x=0; x < width; x++) {
switch(ncomp) {
case 4: /* fallthrough */
case 3: linear[2] = scanline[x*ncomp + 2];
linear[1] = scanline[x*ncomp + 1];
linear[0] = scanline[x*ncomp + 0];
break;
default:
linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
break;
}
stbiw__linear_to_rgbe(rgbe, linear);
scratch[x + width*0] = rgbe[0];
scratch[x + width*1] = rgbe[1];
scratch[x + width*2] = rgbe[2];
scratch[x + width*3] = rgbe[3];
}
s->func(s->context, scanlineheader, 4);
/* RLE each component separately */
for (c=0; c < 4; c++) {
unsigned char *comp = &scratch[width*c];
x = 0;
while (x < width) {
// find first run
r = x;
while (r+2 < width) {
if (comp[r] == comp[r+1] && comp[r] == comp[r+2])
break;
++r;
}
if (r+2 >= width)
r = width;
// dump up to first run
while (x < r) {
int len = r-x;
if (len > 128) len = 128;
stbiw__write_dump_data(s, len, &comp[x]);
x += len;
}
// if there's a run, output it
if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
// find next byte after run
while (r < width && comp[r] == comp[x])
++r;
// output run up to r
while (x < r) {
int len = r-x;
if (len > 127) len = 127;
stbiw__write_run_data(s, len, comp[x]);
x += len;
}
}
}
}
}
}
static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
{
if (y <= 0 || x <= 0 || data == NULL)
return 0;
else {
// Each component is stored separately. Allocate scratch space for full output scanline.
unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
int i, len;
char buffer[128];
char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
s->func(s->context, header, sizeof(header)-1);
#ifdef __STDC_LIB_EXT1__
len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x);
#else
len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x);
#endif
s->func(s->context, buffer, len);
for(i=0; i < y; i++)
stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
STBIW_FREE(scratch);
return 1;
}
}
STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
{
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
}
STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
{
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
stbi__end_write_file(&s);
return r;
} else
return 0;
}
#endif // STBI_WRITE_NO_STDIO
//////////////////////////////////////////////////////////////////////////////
//
// PNG writer
//
#ifndef STBIW_ZLIB_COMPRESS
// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
#define stbiw__sbraw(a) ((int *) (void *) (a) - 2)
#define stbiw__sbm(a) stbiw__sbraw(a)[0]
#define stbiw__sbn(a) stbiw__sbraw(a)[1]
#define stbiw__sbneedgrow(a,n) ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
#define stbiw__sbgrow(a,n) stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
#define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0)
#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
{
int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
STBIW_ASSERT(p);
if (p) {
if (!*arr) ((int *) p)[1] = 0;
*arr = (void *) ((int *) p + 2);
stbiw__sbm(*arr) = m;
}
return *arr;
}
static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
{
while (*bitcount >= 8) {
stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
*bitbuffer >>= 8;
*bitcount -= 8;
}
return data;
}
static int stbiw__zlib_bitrev(int code, int codebits)
{
int res=0;
while (codebits--) {
res = (res << 1) | (code & 1);
code >>= 1;
}
return res;
}
static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
{
int i;
for (i=0; i < limit && i < 258; ++i)
if (a[i] != b[i]) break;
return i;
}
static unsigned int stbiw__zhash(unsigned char *data)
{
stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
hash ^= hash << 3;
hash += hash >> 5;
hash ^= hash << 4;
hash += hash >> 17;
hash ^= hash << 25;
hash += hash >> 6;
return hash;
}
#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
#define stbiw__zlib_add(code,codebits) \
(bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
#define stbiw__zlib_huffa(b,c) stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
// default huffman tables
#define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8)
#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9)
#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256,7)
#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280,8)
#define stbiw__zlib_huff(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
#define stbiw__ZHASH 16384
#endif // STBIW_ZLIB_COMPRESS
STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
{
#ifdef STBIW_ZLIB_COMPRESS
// user provided a zlib compress implementation, use that
return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
#else // use builtin
static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
static unsigned char lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 };
static unsigned short distc[] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
static unsigned char disteb[] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
unsigned int bitbuf=0;
int i,j, bitcount=0;
unsigned char *out = NULL;
unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
if (hash_table == NULL)
return NULL;
if (quality < 5) quality = 5;
stbiw__sbpush(out, 0x78); // DEFLATE 32K window
stbiw__sbpush(out, 0x5e); // FLEVEL = 1
stbiw__zlib_add(1,1); // BFINAL = 1
stbiw__zlib_add(1,2); // BTYPE = 1 -- fixed huffman
for (i=0; i < stbiw__ZHASH; ++i)
hash_table[i] = NULL;
i=0;
while (i < data_len-3) {
// hash next 3 bytes of data to be compressed
int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
unsigned char *bestloc = 0;
unsigned char **hlist = hash_table[h];
int n = stbiw__sbcount(hlist);
for (j=0; j < n; ++j) {
if (hlist[j]-data > i-32768) { // if entry lies within window
int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
if (d >= best) { best=d; bestloc=hlist[j]; }
}
}
// when hash table entry is too long, delete half the entries
if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
stbiw__sbn(hash_table[h]) = quality;
}
stbiw__sbpush(hash_table[h],data+i);
if (bestloc) {
// "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
hlist = hash_table[h];
n = stbiw__sbcount(hlist);
for (j=0; j < n; ++j) {
if (hlist[j]-data > i-32767) {
int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
if (e > best) { // if next match is better, bail on current match
bestloc = NULL;
break;
}
}
}
}
if (bestloc) {
int d = (int) (data+i - bestloc); // distance back
STBIW_ASSERT(d <= 32767 && best <= 258);
for (j=0; best > lengthc[j+1]-1; ++j);
stbiw__zlib_huff(j+257);
if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
for (j=0; d > distc[j+1]-1; ++j);
stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
i += best;
} else {
stbiw__zlib_huffb(data[i]);
++i;
}
}
// write out final bytes
for (;i < data_len; ++i)
stbiw__zlib_huffb(data[i]);
stbiw__zlib_huff(256); // end of block
// pad with 0 bits to byte boundary
while (bitcount)
stbiw__zlib_add(0,1);
for (i=0; i < stbiw__ZHASH; ++i)
(void) stbiw__sbfree(hash_table[i]);
STBIW_FREE(hash_table);
// store uncompressed instead if compression was worse
if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
stbiw__sbn(out) = 2; // truncate to DEFLATE 32K window and FLEVEL = 1
for (j = 0; j < data_len;) {
int blocklen = data_len - j;
if (blocklen > 32767) blocklen = 32767;
stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
memcpy(out+stbiw__sbn(out), data+j, blocklen);
stbiw__sbn(out) += blocklen;
j += blocklen;
}
}
{
// compute adler32 on input
unsigned int s1=1, s2=0;
int blocklen = (int) (data_len % 5552);
j=0;
while (j < data_len) {
for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
s1 %= 65521; s2 %= 65521;
j += blocklen;
blocklen = 5552;
}
stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
stbiw__sbpush(out, STBIW_UCHAR(s2));
stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
stbiw__sbpush(out, STBIW_UCHAR(s1));
}
*out_len = stbiw__sbn(out);
// make returned pointer freeable
STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
return (unsigned char *) stbiw__sbraw(out);
#endif // STBIW_ZLIB_COMPRESS
}
static unsigned int stbiw__crc32(unsigned char *buffer, int len)
{
#ifdef STBIW_CRC32
return STBIW_CRC32(buffer, len);
#else
static unsigned int crc_table[256] =
{
0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
};
unsigned int crc = ~0u;
int i;
for (i=0; i < len; ++i)
crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
return ~crc;
#endif
}
#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
static void stbiw__wpcrc(unsigned char **data, int len)
{
unsigned int crc = stbiw__crc32(*data - len - 4, len+4);
stbiw__wp32(*data, crc);
}
static unsigned char stbiw__paeth(int a, int b, int c)
{
int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
if (pb <= pc) return STBIW_UCHAR(b);
return STBIW_UCHAR(c);
}
// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
{
static int mapping[] = { 0,1,2,3,4 };
static int firstmap[] = { 0,1,0,5,6 };
int *mymap = (y != 0) ? mapping : firstmap;
int i;
int type = mymap[filter_type];
unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
if (type==0) {
memcpy(line_buffer, z, width*n);
return;
}
// first loop isn't optimized since it's just one pixel
for (i = 0; i < n; ++i) {
switch (type) {
case 1: line_buffer[i] = z[i]; break;
case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
case 5: line_buffer[i] = z[i]; break;
case 6: line_buffer[i] = z[i]; break;
}
}
switch (type) {
case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
}
}
STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
{
int force_filter = stbi_write_force_png_filter;
int ctype[5] = { -1, 0, 4, 2, 6 };
unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
unsigned char *out,*o, *filt, *zlib;
signed char *line_buffer;
int j,zlen;
if (stride_bytes == 0)
stride_bytes = x * n;
if (force_filter >= 5) {
force_filter = -1;
}
filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
for (j=0; j < y; ++j) {
int filter_type;
if (force_filter > -1) {
filter_type = force_filter;
stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
} else { // Estimate the best filter by running through all of them:
int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
for (filter_type = 0; filter_type < 5; filter_type++) {
stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
// Estimate the entropy of the line using this filter; the less, the better.
est = 0;
for (i = 0; i < x*n; ++i) {
est += abs((signed char) line_buffer[i]);
}
if (est < best_filter_val) {
best_filter_val = est;
best_filter = filter_type;
}
}
if (filter_type != best_filter) { // If the last iteration already got us the best filter, don't redo it
stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
filter_type = best_filter;
}
}
// when we get here, filter_type contains the filter type, and line_buffer contains the data
filt[j*(x*n+1)] = (unsigned char) filter_type;
STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
}
STBIW_FREE(line_buffer);
zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
STBIW_FREE(filt);
if (!zlib) return 0;
// each tag requires 12 bytes of overhead
out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
if (!out) return 0;
*out_len = 8 + 12+13 + 12+zlen + 12;
o=out;
STBIW_MEMMOVE(o,sig,8); o+= 8;
stbiw__wp32(o, 13); // header length
stbiw__wptag(o, "IHDR");
stbiw__wp32(o, x);
stbiw__wp32(o, y);
*o++ = 8;
*o++ = STBIW_UCHAR(ctype[n]);
*o++ = 0;
*o++ = 0;
*o++ = 0;
stbiw__wpcrc(&o,13);
stbiw__wp32(o, zlen);
stbiw__wptag(o, "IDAT");
STBIW_MEMMOVE(o, zlib, zlen);
o += zlen;
STBIW_FREE(zlib);
stbiw__wpcrc(&o, zlen);
stbiw__wp32(o,0);
stbiw__wptag(o, "IEND");
stbiw__wpcrc(&o,0);
STBIW_ASSERT(o == out + *out_len);
return out;
}
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
{
FILE *f;
int len;
unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
if (png == NULL) return 0;
f = stbiw__fopen(filename, "wb");
if (!f) { STBIW_FREE(png); return 0; }
fwrite(png, 1, len, f);
fclose(f);
STBIW_FREE(png);
return 1;
}
#endif
STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
{
int len;
unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
if (png == NULL) return 0;
func(context, png, len);
STBIW_FREE(png);
return 1;
}
/* ***************************************************************************
*
* JPEG writer
*
* This is based on Jon Olick's jo_jpeg.cpp:
* public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
*/
static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
int bitBuf = *bitBufP, bitCnt = *bitCntP;
bitCnt += bs[1];
bitBuf |= bs[0] << (24 - bitCnt);
while(bitCnt >= 8) {
unsigned char c = (bitBuf >> 16) & 255;
stbiw__putc(s, c);
if(c == 255) {
stbiw__putc(s, 0);
}
bitBuf <<= 8;
bitCnt -= 8;
}
*bitBufP = bitBuf;
*bitCntP = bitCnt;
}
static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
float z1, z2, z3, z4, z5, z11, z13;
float tmp0 = d0 + d7;
float tmp7 = d0 - d7;
float tmp1 = d1 + d6;
float tmp6 = d1 - d6;
float tmp2 = d2 + d5;
float tmp5 = d2 - d5;
float tmp3 = d3 + d4;
float tmp4 = d3 - d4;
// Even part
float tmp10 = tmp0 + tmp3; // phase 2
float tmp13 = tmp0 - tmp3;
float tmp11 = tmp1 + tmp2;
float tmp12 = tmp1 - tmp2;
d0 = tmp10 + tmp11; // phase 3
d4 = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f; // c4
d2 = tmp13 + z1; // phase 5
d6 = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5; // phase 2
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
// The rotator is modified from fig 4-8 to avoid extra negations.
z5 = (tmp10 - tmp12) * 0.382683433f; // c6
z2 = tmp10 * 0.541196100f + z5; // c2-c6
z4 = tmp12 * 1.306562965f + z5; // c2+c6
z3 = tmp11 * 0.707106781f; // c4
z11 = tmp7 + z3; // phase 5
z13 = tmp7 - z3;
*d5p = z13 + z2; // phase 6
*d3p = z13 - z2;
*d1p = z11 + z4;
*d7p = z11 - z4;
*d0p = d0; *d2p = d2; *d4p = d4; *d6p = d6;
}
static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
int tmp1 = val < 0 ? -val : val;
val = val < 0 ? val-1 : val;
bits[1] = 1;
while(tmp1 >>= 1) {
++bits[1];
}
bits[0] = val & ((1<<bits[1])-1);
}
static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
int dataOff, i, j, n, diff, end0pos, x, y;
int DU[64];
// DCT rows
for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
}
// DCT columns
for(dataOff=0; dataOff<8; ++dataOff) {
stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
&CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
}
// Quantize/descale/zigzag the coefficients
for(y = 0, j=0; y < 8; ++y) {
for(x = 0; x < 8; ++x,++j) {
float v;
i = y*du_stride+x;
v = CDU[i]*fdtbl[j];
// DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
// ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
}
}
// Encode DC
diff = DU[0] - DC;
if (diff == 0) {
stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
} else {
unsigned short bits[2];
stbiw__jpg_calcBits(diff, bits);
stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
}
// Encode ACs
end0pos = 63;
for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
}
// end0pos = first element in reverse order !=0
if(end0pos == 0) {
stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
return DU[0];
}
for(i = 1; i <= end0pos; ++i) {
int startpos = i;
int nrzeroes;
unsigned short bits[2];
for (; DU[i]==0 && i<=end0pos; ++i) {
}
nrzeroes = i-startpos;
if ( nrzeroes >= 16 ) {
int lng = nrzeroes>>4;
int nrmarker;
for (nrmarker=1; nrmarker <= lng; ++nrmarker)
stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
nrzeroes &= 15;
}
stbiw__jpg_calcBits(DU[i], bits);
stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
}
if(end0pos != 63) {
stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
}
return DU[0];
}
static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
// Constants that don't pollute global namespace
static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
static const unsigned char std_ac_luminance_values[] = {
0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
};
static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
static const unsigned char std_ac_chrominance_values[] = {
0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
};
// Huffman tables
static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
static const unsigned short YAC_HT[256][2] = {
{10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
{2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
};
static const unsigned short UVAC_HT[256][2] = {
{0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
{16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
{1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
};
static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
int row, col, i, k, subsample;
float fdtbl_Y[64], fdtbl_UV[64];
unsigned char YTable[64], UVTable[64];
if(!data || !width || !height || comp > 4 || comp < 1) {
return 0;
}
quality = quality ? quality : 90;
subsample = quality <= 90 ? 1 : 0;
quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
for(i = 0; i < 64; ++i) {
int uvti, yti = (YQT[i]*quality+50)/100;
YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
uvti = (UVQT[i]*quality+50)/100;
UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
}
for(row = 0, k = 0; row < 8; ++row) {
for(col = 0; col < 8; ++col, ++k) {
fdtbl_Y[k] = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
}
}
// Write Headers
{
static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
s->func(s->context, (void*)head0, sizeof(head0));
s->func(s->context, (void*)YTable, sizeof(YTable));
stbiw__putc(s, 1);
s->func(s->context, UVTable, sizeof(UVTable));
s->func(s->context, (void*)head1, sizeof(head1));
s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
stbiw__putc(s, 0x10); // HTYACinfo
s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
stbiw__putc(s, 1); // HTUDCinfo
s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
stbiw__putc(s, 0x11); // HTUACinfo
s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
s->func(s->context, (void*)head2, sizeof(head2));
}
// Encode 8x8 macroblocks
{
static const unsigned short fillBits[] = {0x7F, 7};
int DCY=0, DCU=0, DCV=0;
int bitBuf=0, bitCnt=0;
// comp == 2 is grey+alpha (alpha is ignored)
int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
const unsigned char *dataR = (const unsigned char *)data;
const unsigned char *dataG = dataR + ofsG;
const unsigned char *dataB = dataR + ofsB;
int x, y, pos;
if(subsample) {
for(y = 0; y < height; y += 16) {
for(x = 0; x < width; x += 16) {
float Y[256], U[256], V[256];
for(row = y, pos = 0; row < y+16; ++row) {
// row >= height => use last input row
int clamped_row = (row < height) ? row : height - 1;
int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
for(col = x; col < x+16; ++col, ++pos) {
// if col >= width => use pixel from last input column
int p = base_p + ((col < width) ? col : (width-1))*comp;
float r = dataR[p], g = dataG[p], b = dataB[p];
Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
}
}
DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
// subsample U,V
{
float subU[64], subV[64];
int yy, xx;
for(yy = 0, pos = 0; yy < 8; ++yy) {
for(xx = 0; xx < 8; ++xx, ++pos) {
int j = yy*32+xx*2;
subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
}
}
DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
}
}
}
} else {
for(y = 0; y < height; y += 8) {
for(x = 0; x < width; x += 8) {
float Y[64], U[64], V[64];
for(row = y, pos = 0; row < y+8; ++row) {
// row >= height => use last input row
int clamped_row = (row < height) ? row : height - 1;
int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
for(col = x; col < x+8; ++col, ++pos) {
// if col >= width => use pixel from last input column
int p = base_p + ((col < width) ? col : (width-1))*comp;
float r = dataR[p], g = dataG[p], b = dataB[p];
Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
}
}
DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y, DCY, YDC_HT, YAC_HT);
DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
}
}
}
// Do the bit alignment of the EOI marker
stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
}
// EOI
stbiw__putc(s, 0xFF);
stbiw__putc(s, 0xD9);
return 1;
}
STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
{
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
}
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
{
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
stbi__end_write_file(&s);
return r;
} else
return 0;
}
#endif
#endif // STB_IMAGE_WRITE_IMPLEMENTATION
/* Revision history
1.16 (2021-07-11)
make Deflate code emit uncompressed blocks when it would otherwise expand
support writing BMPs with alpha channel
1.15 (2020-07-13) unknown
1.14 (2020-02-02) updated JPEG writer to downsample chroma channels
1.13
1.12
1.11 (2019-08-11)
1.10 (2019-02-07)
support utf8 filenames in Windows; fix warnings and platform ifdefs
1.09 (2018-02-11)
fix typo in zlib quality API, improve STB_I_W_STATIC in C++
1.08 (2018-01-29)
add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
1.07 (2017-07-24)
doc fix
1.06 (2017-07-23)
writing JPEG (using Jon Olick's code)
1.05 ???
1.04 (2017-03-03)
monochrome BMP expansion
1.03 ???
1.02 (2016-04-02)
avoid allocating large structures on the stack
1.01 (2016-01-16)
STBIW_REALLOC_SIZED: support allocators with no realloc support
avoid race-condition in crc initialization
minor compile issues
1.00 (2015-09-14)
installable file IO function
0.99 (2015-09-13)
warning fixes; TGA rle support
0.98 (2015-04-08)
added STBIW_MALLOC, STBIW_ASSERT etc
0.97 (2015-01-18)
fixed HDR asserts, rewrote HDR rle logic
0.96 (2015-01-17)
add HDR output
fix monochrome BMP
0.95 (2014-08-17)
add monochrome TGA output
0.94 (2014-05-31)
rename private functions to avoid conflicts with stb_image.h
0.93 (2014-05-27)
warning fixes
0.92 (2010-08-01)
casts to unsigned char to fix warnings
0.91 (2010-07-17)
first public release
0.90 first internal release
*/
/*
------------------------------------------------------------------------------
This software is available under 2 licenses -- choose whichever you prefer.
------------------------------------------------------------------------------
ALTERNATIVE A - MIT License
Copyright (c) 2017 Sean Barrett
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
------------------------------------------------------------------------------
ALTERNATIVE B - Public Domain (www.unlicense.org)
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
software, either in source code form or as a compiled binary, for any purpose,
commercial or non-commercial, and by any means.
In jurisdictions that recognize copyright laws, the author or authors of this
software dedicate any and all copyright interest in the software to the public
domain. We make this dedication for the benefit of the public at large and to
the detriment of our heirs and successors. We intend this dedication to be an
overt act of relinquishment in perpetuity of all present and future rights to
this software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
------------------------------------------------------------------------------
*/
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment