From 9982ae46758858bd51c4eba603adbc9a125f7b24 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Tue, 14 Sep 2021 11:58:21 +0200 Subject: [PATCH 01/27] Convert images on the GPU on Linux for NVidia cards --- .gitmodules | 3 + CMakeLists.txt | 3 + sunshine/main.cpp | 2 + sunshine/platform/linux/cuda.cpp | 289 +++++++++++++++++++++++++++ sunshine/platform/linux/cuda.h | 12 ++ sunshine/platform/linux/graphics.cpp | 27 ++- sunshine/platform/linux/graphics.h | 8 +- sunshine/platform/linux/vaapi.cpp | 6 +- sunshine/platform/linux/x11grab.cpp | 26 ++- sunshine/platform/linux/x11grab.h | 12 ++ sunshine/utility.h | 7 +- sunshine/video.cpp | 4 +- third-party/nv-codec-headers | 1 + 13 files changed, 372 insertions(+), 28 deletions(-) create mode 100644 sunshine/platform/linux/cuda.cpp create mode 100644 sunshine/platform/linux/cuda.h create mode 160000 third-party/nv-codec-headers diff --git a/.gitmodules b/.gitmodules index 153d2de8..39650e86 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third-party/miniupnp"] path = third-party/miniupnp url = https://github.com/miniupnp/miniupnp +[submodule "third-party/nv-codec-headers"] + path = third-party/nv-codec-headers + url = https://github.com/FFmpeg/nv-codec-headers diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ff42e7a..1b04f96f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,6 +188,8 @@ else() sunshine/platform/linux/publish.cpp sunshine/platform/linux/vaapi.h sunshine/platform/linux/vaapi.cpp + sunshine/platform/linux/cuda.cpp + sunshine/platform/linux/cuda.h sunshine/platform/linux/graphics.h sunshine/platform/linux/graphics.cpp sunshine/platform/linux/misc.h @@ -212,6 +214,7 @@ else() include_directories( /usr/include/libevdev-1.0 + third-party/nv-codec-headers/include third-party/glad/include) if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH) diff --git a/sunshine/main.cpp b/sunshine/main.cpp index b2bed72f..e1f07bf5 100644 --- a/sunshine/main.cpp +++ b/sunshine/main.cpp @@ -26,6 +26,8 @@ #include "upnp.h" #include "video.h" +#include "platform/linux/cuda.h" + #include "platform/common.h" extern "C" { #include diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp new file mode 100644 index 00000000..5c79acde --- /dev/null +++ b/sunshine/platform/linux/cuda.cpp @@ -0,0 +1,289 @@ +#include "cuda.h" +#include "graphics.h" +#include "sunshine/main.h" +#include "sunshine/utility.h" +#include "wayland.h" +#include "x11grab.h" +#include + +extern "C" { +#include +#include +#include +} + +#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv +#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x) + +#define CU_CHECK(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1 + +#define CU_CHECK_IGNORE(x, y) \ + check((x), SUNSHINE_STRINGVIEW(y ": ")) + +using namespace std::literals; +namespace cuda { +void cff(CudaFunctions *cf) { + cuda_free_functions(&cf); +} + +using cdf_t = util::safe_ptr; + +static cdf_t cdf; + +inline static int check(CUresult result, const std::string_view &sv) { + if(result != CUDA_SUCCESS) { + const char *name; + const char *description; + + cdf->cuGetErrorName(result, &name); + cdf->cuGetErrorString(result, &description); + + BOOST_LOG(error) << sv << name << ':' << description; + return -1; + } + + return 0; +} + +class ctx_t { +public: + ctx_t(CUcontext ctx) { + CU_CHECK_IGNORE(cdf->cuCtxPushCurrent(ctx), "Couldn't push cuda context"); + } + + ~ctx_t() { + CUcontext dummy; + + CU_CHECK_IGNORE(cdf->cuCtxPopCurrent(&dummy), "Couldn't pop cuda context"); + } +}; + +void free_res(CUgraphicsResource res) { + cdf->cuGraphicsUnregisterResource(res); +} + +using res_internal_t = util::safe_ptr; + +template +class res_t { +public: + res_t() : resources {}, mapped { false } {} + res_t(res_t &&other) noexcept : resources { other.resources }, array_p { other.array_p }, ctx { other.ctx }, stream { other.stream } { + other.resources = std::array {}; + } + + res_t &operator=(res_t &&other) { + for(auto x = 0; x < N; ++x) { + std::swap(resources[x], other.resources[x]); + std::swap(array_p[x], other.array_p[x]); + } + + std::swap(ctx, other.ctx); + std::swap(stream, other.stream); + std::swap(mapped, other.mapped); + + return *this; + } + + res_t(CUcontext ctx, CUstream stream) : resources {}, ctx { ctx }, stream { stream }, mapped { false } {} + + int bind(gl::tex_t &tex) { + ctx_t ctx { this->ctx }; + + CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[0], tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y image"); + CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[1], tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register uv image"); + + return 0; + } + + int map() { + ctx_t ctx { this->ctx }; + + CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream), "Coudn't map cuda resources"); + + mapped = true; + + CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[0], resources[0], 0, 0), "Couldn't get mapped subresource [0]"); + CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[1], resources[1], 0, 0), "Couldn't get mapped subresource [1]"); + + return 0; + } + + void unmap() { + // Either all or none are mapped + if(mapped) { + ctx_t ctx { this->ctx }; + + CU_CHECK_IGNORE(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream), "Couldn't unmap cuda resources"); + + mapped = false; + } + } + + inline CUarray &operator[](std::size_t index) { + return array_p[index]; + } + + ~res_t() { + unmap(); + } + + std::array resources; + std::array array_p; + + CUcontext ctx; + CUstream stream; + + bool mapped; +}; + +int init() { + auto status = cuda_load_functions(&cdf, nullptr); + if(status) { + BOOST_LOG(error) << "Couldn't load cuda: "sv << status; + + return -1; + } + + CU_CHECK(cdf->cuInit(0), "Couldn't initialize cuda"); + + return 0; +} + +class cuda_t : public platf::hwdevice_t { +public: + int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) { + if(!cdf) { + BOOST_LOG(warning) << "cuda not initialized"sv; + return -1; + } + + this->data = (void *)0x1; + + display = egl::make_display(xdisplay); + if(!display) { + return -1; + } + + auto ctx_opt = egl::make_ctx(display.get()); + if(!ctx_opt) { + return -1; + } + + ctx = std::move(*ctx_opt); + + width = in_width; + height = in_height; + + return 0; + } + + int set_frame(AVFrame *frame) override { + auto cuda_ctx = (AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx; + + tex = gl::tex_t::make(2); + fb = gl::frame_buf_t::make(2); + + gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]); + gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RED, frame->width, frame->height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); + gl::ctx.BindTexture(GL_TEXTURE_2D, tex[1]); + gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RG, frame->width / 2, frame->height / 2, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr); + gl::ctx.BindTexture(GL_TEXTURE_2D, 0); + + fb.bind(std::begin(tex), std::end(tex)); + + res = res_t<2> { cuda_ctx->cuda_ctx, cuda_ctx->stream }; + + if(res.bind(tex)) { + return -1; + } + + this->hwframe.reset(frame); + this->frame = frame; + + if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) { + BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv; + + return -1; + } + + auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height); + if(!sws_opt) { + return -1; + } + + this->sws = std::move(*sws_opt); + + return 0; + } + + int convert(platf::img_t &img) override { + sws.load_ram(img); + + if(sws.convert(fb)) { + return -1; + } + + if(res.map()) { + return -1; + } + + // Push and pop cuda context + ctx_t ctx { res.ctx }; + for(auto x = 0; x < 2; ++x) { + CUDA_MEMCPY2D desc {}; + + auto shift = x; + + desc.srcPitch = frame->width; + desc.dstPitch = frame->linesize[x]; + desc.Height = frame->height >> shift; + desc.WidthInBytes = std::min(desc.srcPitch, desc.dstPitch); + + desc.srcMemoryType = CU_MEMORYTYPE_ARRAY; + desc.dstMemoryType = CU_MEMORYTYPE_DEVICE; + + desc.srcArray = res[x]; + desc.dstDevice = (CUdeviceptr)frame->data[x]; + + CU_CHECK(cdf->cuMemcpy2DAsync(&desc, res.stream), "Couldn't copy from OpenGL to cuda"); + } + + res.unmap(); + + return 0; + } + + void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override { + sws.set_colorspace(colorspace, color_range); + } + + frame_t hwframe; + + egl::display_t display; + egl::ctx_t ctx; + + egl::sws_t sws; + + gl::tex_t tex; + gl::frame_buf_t fb; + + res_t<2> res; + + int width, height; +}; + +std::shared_ptr make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) { + if(init()) { + return nullptr; + } + + auto cuda = std::make_shared(); + if(cuda->init(width, height, xdisplay)) { + return nullptr; + } + + return cuda; +} +} // namespace cuda diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h new file mode 100644 index 00000000..a56c961b --- /dev/null +++ b/sunshine/platform/linux/cuda.h @@ -0,0 +1,12 @@ +#ifndef SUNSHINE_PLATFORM_CUDA_H +#define SUNSHINE_PLATFORM_CUDA_H + +#include "sunshine/platform/common.h" +#include "x11grab.h" + +namespace cuda { +std::shared_ptr make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay); +int init(); +} // namespace cuda + +#endif \ No newline at end of file diff --git a/sunshine/platform/linux/graphics.cpp b/sunshine/platform/linux/graphics.cpp index bbf75324..3e0594b8 100644 --- a/sunshine/platform/linux/graphics.cpp +++ b/sunshine/platform/linux/graphics.cpp @@ -313,19 +313,30 @@ bool fail() { return eglGetError() != EGL_SUCCESS; } -display_t make_display(util::Either native_display) { +display_t make_display(std::variant native_display) { constexpr auto EGL_PLATFORM_GBM_MESA = 0x31D7; constexpr auto EGL_PLATFORM_WAYLAND_KHR = 0x31D8; + constexpr auto EGL_PLATFORM_X11_KHR = 0x31D5; int egl_platform; void *native_display_p; - if(native_display.has_left()) { + + switch(native_display.index()) { + case 0: egl_platform = EGL_PLATFORM_GBM_MESA; - native_display_p = native_display.left(); - } - else { + native_display_p = std::get<0>(native_display); + break; + case 1: egl_platform = EGL_PLATFORM_WAYLAND_KHR; - native_display_p = native_display.right(); + native_display_p = std::get<1>(native_display); + break; + case 2: + egl_platform = EGL_PLATFORM_X11_KHR; + native_display_p = std::get<2>(native_display); + break; + default: + BOOST_LOG(error) << "egl::make_display(): Index ["sv << native_display.index() << "] not implemented"sv; + return nullptr; } // native_display.left() equals native_display.right() @@ -803,7 +814,7 @@ void sws_t::load_vram(img_descriptor_t &img, int offset_x, int offset_y, int tex } } -int sws_t::convert(nv12_t &nv12) { +int sws_t::convert(gl::frame_buf_t &fb) { gl::ctx.BindTexture(GL_TEXTURE_2D, loaded_texture); GLenum attachments[] { @@ -812,7 +823,7 @@ int sws_t::convert(nv12_t &nv12) { }; for(int x = 0; x < sizeof(attachments) / sizeof(decltype(attachments[0])); ++x) { - gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, nv12->buf[x]); + gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, fb[x]); gl::ctx.DrawBuffers(1, &attachments[x]); #ifndef NDEBUG diff --git a/sunshine/platform/linux/graphics.h b/sunshine/platform/linux/graphics.h index 02de4d93..5599d9a4 100644 --- a/sunshine/platform/linux/graphics.h +++ b/sunshine/platform/linux/graphics.h @@ -19,6 +19,9 @@ extern "C" int close(int __fd); +// X11 Display +extern "C" struct _XDisplay; + struct AVFrame; void free_frame(AVFrame *frame); @@ -227,7 +230,7 @@ struct surface_descriptor_t { std::uint32_t offsets[4]; }; -display_t make_display(util::Either native_display); +display_t make_display(std::variant native_display); std::optional make_ctx(display_t::pointer display); std::optional import_source( @@ -276,7 +279,8 @@ public: static std::optional make(int in_width, int in_height, int out_width, int out_heigth, gl::tex_t &&tex); static std::optional make(int in_width, int in_height, int out_width, int out_heigth); - int convert(nv12_t &nv12); + // Convert the loaded image into the first two framebuffers + int convert(gl::frame_buf_t &fb); void load_ram(platf::img_t &img); void load_vram(img_descriptor_t &img, int offset_x, int offset_y, int texture); diff --git a/sunshine/platform/linux/vaapi.cpp b/sunshine/platform/linux/vaapi.cpp index 33646333..37a64cb8 100644 --- a/sunshine/platform/linux/vaapi.cpp +++ b/sunshine/platform/linux/vaapi.cpp @@ -3,8 +3,6 @@ #include -#include - extern "C" { #include } @@ -404,7 +402,7 @@ public: int convert(platf::img_t &img) override { sws.load_ram(img); - sws.convert(nv12); + sws.convert(nv12->buf); return 0; } }; @@ -430,7 +428,7 @@ public: sws.load_vram(descriptor, offset_x, offset_y, rgb->tex[0]); - sws.convert(nv12); + sws.convert(nv12->buf); return 0; } diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp index 33797f24..24a87900 100644 --- a/sunshine/platform/linux/x11grab.cpp +++ b/sunshine/platform/linux/x11grab.cpp @@ -24,6 +24,7 @@ #include "misc.h" #include "vaapi.h" #include "x11grab.h" +#include "cuda.h" using namespace std::literals; @@ -259,9 +260,8 @@ void freeX(XFixesCursorImage *); using xcb_connect_t = util::dyn_safe_ptr; using xcb_img_t = util::c_ptr; -using xdisplay_t = util::dyn_safe_ptr_v2; -using ximg_t = util::safe_ptr; -using xcursor_t = util::safe_ptr; +using ximg_t = util::safe_ptr; +using xcursor_t = util::safe_ptr; using crtc_info_t = util::dyn_safe_ptr<_XRRCrtcInfo, &x11::rr::FreeCrtcInfo>; using output_info_t = util::dyn_safe_ptr<_XRROutputInfo, &x11::rr::FreeOutputInfo>; @@ -366,7 +366,7 @@ static void blend_cursor(Display *display, img_t &img, int offsetX, int offsetY) struct x11_attr_t : public display_t { std::chrono::nanoseconds delay; - xdisplay_t xdisplay; + x11::xdisplay_t xdisplay; Window xwindow; XWindowAttributes xattr; @@ -516,6 +516,10 @@ struct x11_attr_t : public display_t { return va::make_hwdevice(width, height, false); } + if(mem_type == mem_type_e::cuda) { + return cuda::make_hwdevice(width, height, xdisplay.get()); + } + return std::make_shared(); } @@ -526,7 +530,7 @@ struct x11_attr_t : public display_t { }; struct shm_attr_t : public x11_attr_t { - xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay + x11::xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay xcb_connect_t xcb; xcb_screen_t *display; std::uint32_t seg; @@ -713,7 +717,7 @@ std::vector x11_display_names() { BOOST_LOG(info) << "Detecting connected monitors"sv; - xdisplay_t xdisplay { x11::OpenDisplay(nullptr) }; + x11::xdisplay_t xdisplay { x11::OpenDisplay(nullptr) }; if(!xdisplay) { return {}; } @@ -807,8 +811,16 @@ void cursor_t::blend(img_t &img, int offsetX, int offsetY) { blend_cursor((xdisplay_t::pointer)ctx.get(), img, offsetX, offsetY); } +xdisplay_t make_display() { + return OpenDisplay(nullptr); +} + +void freeDisplay(_XDisplay *xdisplay) { + CloseDisplay(xdisplay); +} + void freeCursorCtx(cursor_ctx_t::pointer ctx) { - x11::CloseDisplay((xdisplay_t::pointer)ctx); + CloseDisplay((xdisplay_t::pointer)ctx); } } // namespace x11 } // namespace platf diff --git a/sunshine/platform/linux/x11grab.h b/sunshine/platform/linux/x11grab.h index 1440ae76..3d2868c8 100644 --- a/sunshine/platform/linux/x11grab.h +++ b/sunshine/platform/linux/x11grab.h @@ -6,6 +6,9 @@ #include "sunshine/platform/common.h" #include "sunshine/utility.h" +// X11 Display +extern "C" struct _XDisplay; + namespace egl { class cursor_t; } @@ -15,8 +18,10 @@ namespace platf::x11 { #ifdef SUNSHINE_BUILD_X11 struct cursor_ctx_raw_t; void freeCursorCtx(cursor_ctx_raw_t *ctx); +void freeDisplay(_XDisplay *xdisplay); using cursor_ctx_t = util::safe_ptr; +using xdisplay_t = util::safe_ptr<_XDisplay, freeDisplay>; class cursor_t { public: @@ -34,7 +39,12 @@ public: cursor_ctx_t ctx; }; + +xdisplay_t make_display(); #else +// It's never something different from nullptr +util::safe_ptr<_XDisplay, std::default_delete<_XDisplay>>; + class cursor_t { public: static std::optional make() { return std::nullopt; } @@ -42,6 +52,8 @@ public: void capture(egl::cursor_t &) {} void blend(img_t &, int, int) {} }; + +xdisplay_t make_display() { return nullptr; } #endif } // namespace platf::x11 diff --git a/sunshine/utility.h b/sunshine/utility.h index 90435696..0e585e21 100644 --- a/sunshine/utility.h +++ b/sunshine/utility.h @@ -64,8 +64,7 @@ struct argument_type { typedef U type; }; #define KITTY_DEFAULT_CONSTR_MOVE(x) \ x(x &&) noexcept = default; \ - x &operator=(x &&) noexcept = default; \ - x() = default; + x &operator=(x &&) noexcept = default; #define KITTY_DEFAULT_CONSTR_MOVE_THROW(x) \ x(x &&) = default; \ @@ -415,9 +414,9 @@ inline std::int64_t from_view(const std::string_view &number) { } template -class Either : public std::variant { +class Either : public std::variant { public: - using std::variant::variant; + using std::variant::variant; constexpr bool has_left() const { return std::holds_alternative(*this); diff --git a/sunshine/video.cpp b/sunshine/video.cpp index 261d86bc..7205aa07 100644 --- a/sunshine/video.cpp +++ b/sunshine/video.cpp @@ -409,13 +409,11 @@ static encoder_t nvenc { #ifdef _WIN32 AV_HWDEVICE_TYPE_D3D11VA, AV_PIX_FMT_D3D11, - AV_PIX_FMT_NV12, AV_PIX_FMT_P010, #else AV_HWDEVICE_TYPE_CUDA, AV_PIX_FMT_CUDA, - // Fully planar YUV formats are more efficient for sws_scale() - AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, #endif + AV_PIX_FMT_NV12, AV_PIX_FMT_P010, { { { "forced-idr"s, 1 }, diff --git a/third-party/nv-codec-headers b/third-party/nv-codec-headers new file mode 160000 index 00000000..b641a195 --- /dev/null +++ b/third-party/nv-codec-headers @@ -0,0 +1 @@ +Subproject commit b641a195edbe3ac9788e681e22c2e2fad8aacddb From f5db0e438b7a20a1b0fbf81964720ded088c43c5 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Tue, 14 Sep 2021 15:07:34 +0200 Subject: [PATCH 02/27] The background is black instead of green --- sunshine/platform/linux/cuda.cpp | 5 ++--- sunshine/platform/linux/graphics.cpp | 14 ++++++++++++++ sunshine/platform/linux/graphics.h | 3 +++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index 5c79acde..cdcaba57 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -213,9 +213,8 @@ public: return -1; } - this->sws = std::move(*sws_opt); - - return 0; + sws = std::move(*sws_opt); + return sws.blank(fb, 0, 0, frame->width, frame->height); } int convert(platf::img_t &img) override { diff --git a/sunshine/platform/linux/graphics.cpp b/sunshine/platform/linux/graphics.cpp index 3e0594b8..8d31e37b 100644 --- a/sunshine/platform/linux/graphics.cpp +++ b/sunshine/platform/linux/graphics.cpp @@ -739,6 +739,20 @@ std::optional sws_t::make(int in_width, int in_height, int out_width, int return std::move(sws); } +int sws_t::blank(gl::frame_buf_t &fb, int offsetX, int offsetY, int width, int height) { + auto f = [&]() { + std::swap(offsetX, this->offsetX); + std::swap(offsetY, this->offsetY); + std::swap(width, this->out_width); + std::swap(height, this->out_height); + }; + + f(); + auto fg = util::fail_guard(f); + + return convert(fb); +} + std::optional sws_t::make(int in_width, int in_height, int out_width, int out_heigth) { auto tex = gl::tex_t::make(2); gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]); diff --git a/sunshine/platform/linux/graphics.h b/sunshine/platform/linux/graphics.h index 5599d9a4..2cc24b01 100644 --- a/sunshine/platform/linux/graphics.h +++ b/sunshine/platform/linux/graphics.h @@ -282,6 +282,9 @@ public: // Convert the loaded image into the first two framebuffers int convert(gl::frame_buf_t &fb); + // Make an area of the image black + int blank(gl::frame_buf_t &fb, int offsetX, int offsetY, int width, int height); + void load_ram(platf::img_t &img); void load_vram(img_descriptor_t &img, int offset_x, int offset_y, int texture); From c94d922282e2e6c4044b5c456ffcb9224b80db81 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Tue, 14 Sep 2021 19:16:29 +0200 Subject: [PATCH 03/27] Fix windows build --- sunshine/main.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sunshine/main.cpp b/sunshine/main.cpp index e1f07bf5..b2bed72f 100644 --- a/sunshine/main.cpp +++ b/sunshine/main.cpp @@ -26,8 +26,6 @@ #include "upnp.h" #include "video.h" -#include "platform/linux/cuda.h" - #include "platform/common.h" extern "C" { #include From e3f642ac25ac4ad6f47135b2334380ff1caedfc8 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Wed, 15 Sep 2021 12:10:12 +0200 Subject: [PATCH 04/27] Reduce cpu usage with x11grab --- sunshine/platform/linux/x11grab.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp index 24a87900..c8bc70f0 100644 --- a/sunshine/platform/linux/x11grab.cpp +++ b/sunshine/platform/linux/x11grab.cpp @@ -458,6 +458,7 @@ struct x11_attr_t : public display_t { std::this_thread::sleep_for((next_frame - now) / 3 * 2); } while(next_frame > now) { + std::this_thread::sleep_for(1ns); now = std::chrono::steady_clock::now(); } next_frame = now + delay; @@ -566,6 +567,7 @@ struct shm_attr_t : public x11_attr_t { std::this_thread::sleep_for((next_frame - now) / 3 * 2); } while(next_frame > now) { + std::this_thread::sleep_for(1ns); now = std::chrono::steady_clock::now(); } next_frame = now + delay; From fed329568ca44993d9cc0a8fe145e73a332fb111 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 19 Sep 2021 20:40:34 +0200 Subject: [PATCH 05/27] Use an actual cuda kernel to convert RGB to NV12 --- CMakeLists.txt | 39 +- sunshine/platform/linux/cuda.cpp | 212 +++- sunshine/platform/linux/cuda.cu | 248 ++++ sunshine/platform/linux/cuda.h | 46 + sunshine/video.cpp | 26 +- third-party/nvfbc/NvFBC.h | 2006 ++++++++++++++++++++++++++++++ third-party/nvfbc/helper_math.h | 1469 ++++++++++++++++++++++ 7 files changed, 4007 insertions(+), 39 deletions(-) create mode 100644 sunshine/platform/linux/cuda.cu create mode 100644 third-party/nvfbc/NvFBC.h create mode 100644 third-party/nvfbc/helper_math.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b04f96f..df0424d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,15 +4,6 @@ project(Sunshine) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) -add_subdirectory(third-party/Simple-Web-Server) - -set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries") -set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc") -set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc") -set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc") -add_subdirectory(third-party/miniupnp/miniupnpc) -include_directories(third-party/miniupnp) - if(WIN32) # Ugly hack to compile with #include add_compile_definitions( @@ -21,9 +12,20 @@ if(WIN32) QOS_NON_ADAPTIVE_FLOW=2) endif() add_subdirectory(third-party/moonlight-common-c/enet) +add_subdirectory(third-party/Simple-Web-Server) +add_subdirectory(third-party/cbs) + +set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries") +set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc") +set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc") +set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc") +add_subdirectory(third-party/miniupnp/miniupnpc) +include_directories(third-party/miniupnp) find_package(Threads REQUIRED) find_package(OpenSSL REQUIRED) +set(Boost_USE_STATIC_LIBS ON) +find_package(Boost COMPONENTS log filesystem REQUIRED) list(APPEND SUNSHINE_COMPILE_OPTIONS -fPIC -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare) @@ -106,6 +108,11 @@ else() option(SUNSHINE_ENABLE_X11 "Enable X11 grab if available" ON) option(SUNSHINE_ENABLE_WAYLAND "Enable building wayland specific code" ON) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 75) + endif() + enable_language(CUDA) + if(${SUNSHINE_ENABLE_X11}) find_package(X11) else() @@ -188,6 +195,7 @@ else() sunshine/platform/linux/publish.cpp sunshine/platform/linux/vaapi.h sunshine/platform/linux/vaapi.cpp + sunshine/platform/linux/cuda.cu sunshine/platform/linux/cuda.cpp sunshine/platform/linux/cuda.h sunshine/platform/linux/graphics.h @@ -203,7 +211,8 @@ else() third-party/glad/include/EGL/eglplatform.h third-party/glad/include/KHR/khrplatform.h third-party/glad/include/glad/gl.h - third-party/glad/include/glad/egl.h) + third-party/glad/include/glad/egl.h + third-party/nvfbc/NvFBC.h) list(APPEND PLATFORM_LIBRARIES dl @@ -215,7 +224,8 @@ else() include_directories( /usr/include/libevdev-1.0 third-party/nv-codec-headers/include - third-party/glad/include) + third-party/glad/include + third-party/nvfbc) if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH) set(SUNSHINE_EXECUTABLE_PATH "sunshine") @@ -224,11 +234,6 @@ else() configure_file(sunshine.service.in sunshine.service @ONLY) endif() -add_subdirectory(third-party/cbs) - -set(Boost_USE_STATIC_LIBS ON) -find_package(Boost COMPONENTS log filesystem REQUIRED) - set(SUNSHINE_TARGET_FILES third-party/moonlight-common-c/reedsolomon/rs.c third-party/moonlight-common-c/reedsolomon/rs.h @@ -290,7 +295,7 @@ include_directories( string(TOUPPER "x${CMAKE_BUILD_TYPE}" BUILD_TYPE) if("${BUILD_TYPE}" STREQUAL "XDEBUG") - list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -pedantic -ggdb3) + list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -ggdb3) if(WIN32) set_source_files_properties(sunshine/nvhttp.cpp PROPERTIES COMPILE_FLAGS -O2) endif() diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index cdcaba57..811293d6 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -1,9 +1,4 @@ -#include "cuda.h" -#include "graphics.h" -#include "sunshine/main.h" -#include "sunshine/utility.h" -#include "wayland.h" -#include "x11grab.h" +#include #include extern "C" { @@ -12,6 +7,13 @@ extern "C" { #include } +#include "cuda.h" +#include "graphics.h" +#include "sunshine/main.h" +#include "sunshine/utility.h" +#include "wayland.h" +#include "x11grab.h" + #define SUNSHINE_STRINGVIEW_HELPER(x) x##sv #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x) @@ -23,6 +25,13 @@ extern "C" { using namespace std::literals; namespace cuda { +constexpr auto cudaDevAttrMaxThreadsPerBlock = (CUdevice_attribute)1; +constexpr auto cudaDevAttrMaxThreadsPerMultiProcessor = (CUdevice_attribute)39; + +void pass_error(const std::string_view &sv, const char *name, const char *description) { + BOOST_LOG(error) << sv << name << ':' << description; +} + void cff(CudaFunctions *cf) { cuda_free_functions(&cf); } @@ -151,7 +160,7 @@ int init() { return 0; } -class cuda_t : public platf::hwdevice_t { +class opengl_t : public platf::hwdevice_t { public: int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) { if(!cdf) { @@ -273,16 +282,203 @@ public: int width, height; }; +class cuda_t : public platf::hwdevice_t { +public: + ~cuda_t() override { + // sws_t needs to be destroyed while the context is active + if(sws) { + ctx_t ctx { cuda_ctx }; + + sws.reset(); + } + } + + int init(int in_width, int in_height) { + if(!cdf) { + BOOST_LOG(warning) << "cuda not initialized"sv; + return -1; + } + + data = (void *)0x1; + + width = in_width; + height = in_height; + + return 0; + } + + int set_frame(AVFrame *frame) override { + this->hwframe.reset(frame); + this->frame = frame; + + if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) { + BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv; + return -1; + } + + if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) { + BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv; + + return -1; + } + + cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx; + + ctx_t ctx { cuda_ctx }; + sws = sws_t::make(width * 4, height, frame->width, frame->height); + + if(!sws) { + return -1; + } + + return 0; + } + + int convert(platf::img_t &img) override { + ctx_t ctx { cuda_ctx }; + + return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]); + } + + void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override { + ctx_t ctx { cuda_ctx }; + sws->set_colorspace(colorspace, color_range); + } + + frame_t hwframe; + + std::unique_ptr sws; + + int width, height; + + CUcontext cuda_ctx; +}; + std::shared_ptr make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) { if(init()) { return nullptr; } auto cuda = std::make_shared(); - if(cuda->init(width, height, xdisplay)) { + if(cuda->init(width, height)) { return nullptr; } return cuda; } } // namespace cuda + +namespace platf::nvfbc { +static PNVFBCCREATEINSTANCE createInstance {}; +static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION }; + +static void *handle { nullptr }; +int init() { + static bool funcs_loaded = false; + + if(funcs_loaded) return 0; + + if(!handle) { + handle = dyn::handle({ "libnvidia-fbc.so.1", "libnvidia-fbc.so" }); + if(!handle) { + return -1; + } + } + + std::vector> funcs { + { (dyn::apiproc *)&createInstance, "NvFBCCreateInstance" }, + }; + + if(dyn::load(handle, funcs)) { + dlclose(handle); + handle = nullptr; + + return -1; + } + + funcs_loaded = true; + return 0; +} + +class handle_t { + KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits::max(), { + if(el == std::numeric_limits::max()) { + return; + } + NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER }; + + auto status = func.nvFBCDestroyHandle(el, ¶ms); + if(status) { + BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el); + } + }); + +public: + static std::optional make() { + NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER }; + session_t session; + + auto status = func.nvFBCCreateHandle(&session.el, ¶ms); + if(status) { + BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el); + session.release(); + + return std::nullopt; + } + + return handle_t { std::move(session) }; + } + + const char *last_error() { + return func.nvFBCGetLastErrorStr(session.el); + } + + std::optional status() { + NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER }; + + auto status = func.nvFBCGetStatus(session.el, ¶ms); + if(status) { + BOOST_LOG(error) << "Failed to create session: "sv << last_error(); + + return std::nullopt; + } + + return params; + } + + session_t session; +}; + +std::vector nvfbc_display_names() { + if(init()) { + return {}; + } + + std::vector display_names; + + auto status = createInstance(&func); + if(status) { + BOOST_LOG(error) << "Unable to create NvFBC instance"sv; + return {}; + } + + auto handle = handle_t::make(); + if(!handle) { + return {}; + } + + auto status_params = handle->status(); + if(!status_params) { + return {}; + } + + if(!status_params->bIsCapturePossible) { + BOOST_LOG(error) << "NVidia driver doesn't support NvFBC screencasting"sv; + } + + BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv; + BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h; + + return display_names; +} +} // namespace platf::nvfbc \ No newline at end of file diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu new file mode 100644 index 00000000..a2ca6508 --- /dev/null +++ b/sunshine/platform/linux/cuda.cu @@ -0,0 +1,248 @@ +// #include +#include +#include +#include +#include +#include + +#include "cuda.h" + +using namespace std::literals; + +#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv +#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x) + +#define CU_CHECK(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1 + +#define CU_CHECK_VOID(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return; + +#define CU_CHECK_PTR(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr; + +#define CU_CHECK_IGNORE(x, y) \ + check((x), SUNSHINE_STRINGVIEW(y ": ")) + +using namespace std::literals; + +//////////////////// Special desclarations +/** + * NVCC segfaults when including + * Therefore, some declarations need to be added explicitely + */ +namespace platf { +struct img_t { +public: + std::uint8_t *data {}; + std::int32_t width {}; + std::int32_t height {}; + std::int32_t pixel_pitch {}; + std::int32_t row_pitch {}; + + virtual ~img_t() = default; +}; +} // namespace platf + +namespace video { +using __float4 = float[4]; +using __float3 = float[3]; +using __float2 = float[2]; + +struct __attribute__((__aligned__(16))) color_t { + float4 color_vec_y; + float4 color_vec_u; + float4 color_vec_v; + float2 range_y; + float2 range_uv; +}; + +struct __attribute__((__aligned__(16))) color_extern_t { + __float4 color_vec_y; + __float4 color_vec_u; + __float4 color_vec_v; + __float2 range_y; + __float2 range_uv; +}; + +extern color_extern_t colors[4]; +} // namespace video + +//////////////////// End special declarations + +namespace cuda { +auto constexpr INVALID_TEXTURE = std::numeric_limits::max(); + +template +inline T div_align(T l, T r) { + return (l + r - 1) / r; +} + +void pass_error(const std::string_view &sv, const char *name, const char *description); +inline static int check(cudaError_t result, const std::string_view &sv) { + if(result) { + auto name = cudaGetErrorName(result); + auto description = cudaGetErrorString(result); + + pass_error(sv, name, description); + return -1; + } + + return 0; +} + +__device__ __constant__ video::color_t color; + + +inline __device__ float3 bgra_to_rgb(uchar4 vec) { + return make_float3((float)vec.z, (float)vec.y, (float)vec.x); +} + +inline __device__ float2 calcUV(float3 pixel) { + float4 vec_u = color.color_vec_u; + float4 vec_v = color.color_vec_v; + + float u = dot(pixel, make_float3(vec_u)) + vec_u.w; + float v = dot(pixel, make_float3(vec_v)) + vec_v.w; + + u = u * color.range_uv.x + color.range_uv.y; + v = (v * color.range_uv.x + color.range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f; + + return make_float2(u, v); +} + +inline __device__ float calcY(float3 pixel) { + float4 vec_y = color.color_vec_y; + + return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color.range_y.x + color.range_y.y; +} + +__global__ void RGBA_to_NV12( + cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV, + std::uint32_t dstPitchY, std::uint32_t dstPitchUV, + std::uint32_t width, std::uint32_t height) { + + int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2; + int idY = (threadIdx.y + blockDim.y * blockIdx.y); + + if(idX >= width) return; + if(idY >= height) return; + + dstY = dstY + idX + idY * dstPitchY; + dstUV = dstUV + idX + (idY / 2 * dstPitchUV); + + float x = (float)idX / (float)width / 4; + float y = (float)idY / (float)height; + + float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); + float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / width, y + 1.0f / height)); + + float2 uv = calcUV((rgb_l + rgb_r) * 0.5f); + + dstUV[0] = uv.x; + dstUV[1] = uv.y; + dstY[0] = calcY(rgb_l); + dstY[1] = calcY(rgb_r); +} + +sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock) + : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock } { + auto format = cudaCreateChannelDesc(); + + CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array"); + + cudaResourceDesc res {}; + res.resType = cudaResourceTypeArray; + res.res.array.array = array; + + cudaTextureDesc desc {}; + + desc.readMode = cudaReadModeElementType; + desc.filterMode = cudaFilterModePoint; + desc.normalizedCoords = true; + + std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp); + + CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture"); +} + +sws_t::~sws_t() { + if(texture != INVALID_TEXTURE) { + CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture"); + + texture = INVALID_TEXTURE; + } + + if(array) { + CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array"); + + array = cudaArray_t {}; + } +} + +std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, int out_height) { + cudaDeviceProp props; + int device; + CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device"); + CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties"); + + auto sws = std::make_unique(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2); + + if(sws->texture == INVALID_TEXTURE) { + return nullptr; + } + + return sws; +} + +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) { + int threadsX = width / 2; + int threadsY = height; + + dim3 block(threadsPerBlock, threadsPerBlock); + dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock)); + + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, width, height); + + return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); +} + +void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) { + color_range = 1; + colorspace = 5; + video::color_extern_t *color_p; + switch(colorspace) { + case 5: // SWS_CS_SMPTE170M + color_p = &video::colors[0]; + break; + case 1: // SWS_CS_ITU709 + color_p = &video::colors[2]; + break; + case 9: // SWS_CS_BT2020 + default: + color_p = &video::colors[0]; + }; + + if(color_range > 1) { + // Full range + ++color_p; + } + + auto color_matrix = *(video::color_t*)color_p; + color_matrix.color_vec_y.w *= 256.0f; + color_matrix.color_vec_u.w *= 256.0f; + color_matrix.color_vec_v.w *= 256.0f; + + color_matrix.range_y.y *= 256.0f; + color_matrix.range_uv.y *= 256.0f; + + static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch"); + + CU_CHECK_IGNORE(cudaMemcpyToSymbol(color, &color_matrix, sizeof(video::color_t)), "Couldn't copy color matrix to cuda"); +} + +int sws_t::load_ram(platf::img_t &img) { + return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array"); +} + +} // namespace cuda \ No newline at end of file diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index a56c961b..41087506 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -1,6 +1,8 @@ #ifndef SUNSHINE_PLATFORM_CUDA_H #define SUNSHINE_PLATFORM_CUDA_H +#ifndef __NVCC__ + #include "sunshine/platform/common.h" #include "x11grab.h" @@ -9,4 +11,48 @@ std::shared_ptr make_hwdevice(int width, int height, platf::x int init(); } // namespace cuda +#else +namespace platf { +class img_t; +} +#endif + +typedef struct cudaArray *cudaArray_t; + +#if !defined(__CUDACC__) +typedef unsigned long long cudaTextureObject_t; +#else /* defined(__CUDACC__) */ +typedef __location__(device_builtin) unsigned long long cudaTextureObject_t; +#endif /* !defined(__CUDACC__) */ + +namespace cuda { +class sws_t { +public: + ~sws_t(); + sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock); + + /** + * in_width, out_width -- The width and height of the captured image in bytes + * out_width, out_height -- the width and height of the NV12 image in pixels + * + * cuda_device -- pointer to the cuda device + */ + static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height); + + // Converts loaded image into a CUDevicePtr + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV); + + void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range); + + int load_ram(platf::img_t &img); + + cudaArray_t array; + cudaTextureObject_t texture; + + int width, height; + + int threadsPerBlock; +}; +} // namespace cuda + #endif \ No newline at end of file diff --git a/sunshine/video.cpp b/sunshine/video.cpp index 7205aa07..3b143cbe 100644 --- a/sunshine/video.cpp +++ b/sunshine/video.cpp @@ -324,7 +324,7 @@ struct encoder_t { class session_t { public: session_t() = default; - session_t(ctx_t &&ctx, util::wrap_ptr &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {} + session_t(ctx_t &&ctx, std::shared_ptr &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {} session_t(session_t &&other) noexcept = default; @@ -342,7 +342,7 @@ public: } ctx_t ctx; - util::wrap_ptr device; + std::shared_ptr device; std::vector replacements; @@ -369,7 +369,6 @@ struct sync_session_t { sync_session_ctx_t *ctx; platf::img_t *img_tmp; - std::shared_ptr hwdevice; session_t session; }; @@ -779,7 +778,7 @@ int encode(int64_t frame_nr, session_t &session, frame_t::pointer frame, safe::m return 0; } -std::optional make_session(const encoder_t &encoder, const config_t &config, int width, int height, platf::hwdevice_t *hwdevice) { +std::optional make_session(const encoder_t &encoder, const config_t &config, int width, int height, std::shared_ptr &&hwdevice) { bool hardware = encoder.dev_type != AV_HWDEVICE_TYPE_NONE; auto &video_format = config.videoFormat == 0 ? encoder.h264 : encoder.hevc; @@ -886,7 +885,7 @@ std::optional make_session(const encoder_t &encoder, const config_t & if(hardware) { ctx->pix_fmt = encoder.dev_pix_fmt; - auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice); + auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice.get()); if(buf_or_error.has_right()) { return std::nullopt; } @@ -965,7 +964,7 @@ std::optional make_session(const encoder_t &encoder, const config_t & frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); } - util::wrap_ptr device; + std::shared_ptr device; if(!hwdevice->data) { auto device_tmp = std::make_unique(); @@ -977,7 +976,7 @@ std::optional make_session(const encoder_t &encoder, const config_t & device = std::move(device_tmp); } else { - device = hwdevice; + device = std::move(hwdevice); } if(device->set_frame(frame.release())) { @@ -1009,12 +1008,12 @@ void encode_run( img_event_t images, config_t config, int width, int height, - platf::hwdevice_t *hwdevice, + std::shared_ptr &&hwdevice, safe::signal_t &reinit_event, const encoder_t &encoder, void *channel_data) { - auto session = make_session(encoder, config, width, height, hwdevice); + auto session = make_session(encoder, config, width, height, std::move(hwdevice)); if(!session) { return; } @@ -1101,12 +1100,11 @@ std::optional make_synced_session(platf::display_t *disp, const // absolute mouse coordinates require that the dimensions of the screen are known ctx.touch_port_events->raise(make_port(disp, ctx.config)); - auto session = make_session(encoder, ctx.config, img.width, img.height, hwdevice.get()); + auto session = make_session(encoder, ctx.config, img.width, img.height, std::move(hwdevice)); if(!session) { return std::nullopt; } - encode_session.hwdevice = std::move(hwdevice); encode_session.session = std::move(*session); return std::move(encode_session); @@ -1208,7 +1206,7 @@ encode_e encode_run_sync( ctx->idr_events->pop(); } - if(pos->hwdevice->convert(*img)) { + if(pos->session.device->convert(*img)) { BOOST_LOG(error) << "Could not convert image"sv; ctx->shutdown_event->raise(true); @@ -1356,7 +1354,7 @@ void capture_async( frame_nr, mail, images, config, display->width, display->height, - hwdevice.get(), + std::move(hwdevice), ref->reinit_event, *ref->encoder_p, channel_data); } @@ -1409,7 +1407,7 @@ int validate_config(std::shared_ptr &disp, const encoder_t &en return -1; } - auto session = make_session(encoder, config, disp->width, disp->height, hwdevice.get()); + auto session = make_session(encoder, config, disp->width, disp->height, std::move(hwdevice)); if(!session) { return -1; } diff --git a/third-party/nvfbc/NvFBC.h b/third-party/nvfbc/NvFBC.h new file mode 100644 index 00000000..8990eeab --- /dev/null +++ b/third-party/nvfbc/NvFBC.h @@ -0,0 +1,2006 @@ +/*! + * \file + * + * This file contains the interface constants, structure definitions and + * function prototypes defining the NvFBC API for Linux. + * + * Copyright (c) 2013-2020, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _NVFBC_H_ +#define _NVFBC_H_ + +#include + +/*! + * \mainpage NVIDIA Framebuffer Capture (NvFBC) for Linux. + * + * NvFBC is a high performance, low latency API to capture the framebuffer of + * an X server screen. + * + * The output from NvFBC captures everything that would be visible if we were + * directly looking at the monitor. This includes window manager decoration, + * mouse cursor, overlay, etc. + * + * It is ideally suited to desktop or fullscreen application capture and + * remoting. + */ + +/*! + * \defgroup FBC_REQ Requirements + * + * The following requirements are provided by the regular NVIDIA Display Driver + * package: + * + * - OpenGL core >= 4.2: + * Required. NvFBC relies on OpenGL to perform frame capture and + * post-processing. + * + * - Vulkan 1.1: + * Required. + * + * - libcuda.so.1 >= 5.5: + * Optional. Used for capture to video memory with CUDA interop. + * + * The following requirements must be installed separately depending on the + * Linux distribution being used: + * + * - XRandR extension >= 1.2: + * Optional. Used for RandR output tracking. + * + * - libX11-xcb.so.1 >= 1.2: + * Required. NvFBC uses a mix of Xlib and XCB. Xlib is needed to use GLX, + * XCB is needed to make NvFBC more resilient against X server terminations + * while a capture session is active. + * + * - libxcb.so.1 >= 1.3: + * Required. See above. + * + * - xorg-server >= 1.3: + * Optional. Required for push model to work properly. + * + * Note that all optional dependencies are dlopen()'d at runtime. Failure to + * load an optional library is not fatal. + */ + +/*! + * \defgroup FBC_CHANGES ChangeLog + * + * NvFBC Linux API version 0.1 + * - Initial BETA release. + * + * NvFBC Linux API version 0.2 + * - Added 'bEnableMSE' field to NVFBC_H264_HW_ENC_CONFIG. + * - Added 'dwMSE' field to NVFBC_TOH264_GRAB_FRAME_PARAMS. + * - Added 'bEnableAQ' field to NVFBC_H264_HW_ENC_CONFIG. + * - Added 'NVFBC_H264_PRESET_LOSSLESS_HP' enum to NVFBC_H264_PRESET. + * - Added 'NVFBC_BUFFER_FORMAT_YUV444P' enum to NVFBC_BUFFER_FORMAT. + * - Added 'eInputBufferFormat' field to NVFBC_H264_HW_ENC_CONFIG. + * - Added '0' and '244' values for NVFBC_H264_HW_ENC_CONFIG::dwProfile. + * + * NvFBC Linux API version 0.3 + * - Improved multi-threaded support by implementing an API locking mechanism. + * - Added 'nvFBCBindContext' API entry point. + * - Added 'nvFBCReleaseContext' API entry point. + * + * NvFBC Linux API version 1.0 + * - Added codec agnostic interface for HW encoding. + * - Deprecated H.264 interface. + * - Added support for H.265/HEVC HW encoding. + * + * NvFBC Linux API version 1.1 + * - Added 'nvFBCToHwGetCaps' API entry point. + * - Added 'dwDiffMapScalingFactor' field to NVFBC_TOSYS_SETUP_PARAMS. + * + * NvFBC Linux API version 1.2 + * - Deprecated ToHwEnc interface. + * - Added ToGL interface that captures frames to an OpenGL texture in video + * memory. + * - Added 'bDisableAutoModesetRecovery' field to + * NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * - Added 'bExternallyManagedContext' field to NVFBC_CREATE_HANDLE_PARAMS. + * + * NvFBC Linux API version 1.3 + * - Added NVFBC_BUFFER_FORMAT_RGBA + * - Added 'dwTimeoutMs' field to NVFBC_TOSYS_GRAB_FRAME_PARAMS, + * NVFBC_TOCUDA_GRAB_FRAME_PARAMS, and NVFBC_TOGL_GRAB_FRAME_PARAMS. + * + * NvFBC Linux API version 1.4 + * - Clarified that NVFBC_BUFFER_FORMAT_{ARGB,RGB,RGBA} are byte-order formats. + * - Renamed NVFBC_BUFFER_FORMAT_YUV420P to NVFBC_BUFFER_FORMAT_NV12. + * - Added new requirements. + * - Made NvFBC more resilient against the X server terminating during an active + * capture session. See new comments for ::NVFBC_ERR_X. + * - Relaxed requirement that 'frameSize' must have a width being a multiple of + * 4 and a height being a multiple of 2. + * - Added 'bRoundFrameSize' field to NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * - Relaxed requirement that the scaling factor for differential maps must be + * a multiple of the size of the frame. + * - Added 'diffMapSize' field to NVFBC_TOSYS_SETUP_PARAMS and + * NVFBC_TOGL_SETUP_PARAMS. + * + * NvFBC Linux API version 1.5 + * - Added NVFBC_BUFFER_FORMAT_BGRA + * + * NvFBC Linux API version 1.6 + * - Added the 'NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY', + * 'NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY', and + * 'NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY' capture flags. + * - Exposed debug and performance logs through the NVFBC_LOG_LEVEL environment + * variable. Setting it to "1" enables performance logs, setting it to "2" + * enables debugging logs, setting it to "3" enables both. + * - Logs are printed to stdout or to the file pointed by the NVFBC_LOG_FILE + * environment variable. + * - Added 'ulTimestampUs' to NVFBC_FRAME_GRAB_INFO. + * - Added 'dwSamplingRateMs' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * - Added 'bPushModel' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * + * NvFBC Linux API version 1.7 + * - Retired the NVFBC_CAPTURE_TO_HW_ENCODER interface. + * This interface has been deprecated since NvFBC 1.2 and has received no + * updates or new features since. We recommend using the NVIDIA Video Codec + * SDK to encode NvFBC frames. + * See: https://developer.nvidia.com/nvidia-video-codec-sdk + * - Added a 'Capture Modes' section to those headers. + * - Added a 'Post Processing' section to those headers. + * - Added an 'Environment Variables' section to those headers. + * - Added 'bInModeset' to NVFBC_GET_STATUS_PARAMS. + * - Added 'bAllowDirectCapture' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * - Added 'bDirectCaptured' to NVFBC_FRAME_GRAB_INFO. + * - Added 'bRequiredPostProcessing' to NVFBC_FRAME_GRAB_INFO. + */ + +/*! + * \defgroup FBC_MODES Capture Modes + * + * When creating a capture session, NvFBC instantiates a capture subsystem + * living in the NVIDIA X driver. + * + * This subsystem listens for damage events coming from applications then + * generates (composites) frames for NvFBC when new content is available. + * + * This capture server can operate on a timer where it periodically checks if + * there are any pending damage events, or it can generate frames as soon as it + * receives a new damage event. + * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::dwSamplingRateMs, + * and NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bPushModel. + * + * NvFBC can also attach itself to a fullscreen unoccluded application and have + * it copy its frames directly into a buffer owned by NvFBC upon present. This + * mode bypasses the X server. + * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture. + * + * NvFBC is designed to capture frames with as few copies as possible. The + * NVIDIA X driver composites frames directly into the NvFBC buffers, and + * direct capture copies frames directly into these buffers as well. + * + * Depending on the configuration of a capture session, an extra copy (rendering + * pass) may be needed. See the 'Post Processing' section. + */ + +/*! + * \defgroup FBC_PP Post Processing + * + * Depending on the configuration of a capture session, NvFBC might require to + * do post processing on frames. + * + * Post processing is required for the following reasons: + * - NvFBC needs to do a pixel format conversion. + * - Diffmaps are requested. + * - Capture to system memory is requested. + * + * NvFBC needs to do a conversion if the requested pixel format does not match + * the native format. The native format is NVFBC_BUFFER_FORMAT_BGRA. + * + * Note: post processing is *not* required for frame scaling and frame cropping. + * + * Skipping post processing can reduce capture latency. An application can know + * whether post processing was required by checking + * NVFBC_FRAME_GRAB_INFO::bRequiredPostProcessing. + */ + +/*! + * \defgroup FBC_ENVVAR Environment Variables + * + * Below are the environment variables supported by NvFBC: + * + * - NVFBC_LOG_LEVEL + * Bitfield where the first bit enables debug logs and the second bit enables + * performance logs. Both can be enabled by setting this envvar to 3. + * + * - NVFBC_LOG_FILE + * Write all NvFBC logs to the given file. + * + * - NVFBC_FORCE_ALLOW_DIRECT_CAPTURE + * Used to override NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture. + * + * - NVFBC_FORCE_POST_PROCESSING + * Used to force the post processing step, even if it could be skipped. + * See the 'Post Processing' section. + */ + +/*! + * \defgroup FBC_STRUCT Structure Definition + * + * @{ + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * Calling convention. + */ +#define NVFBCAPI + +/*! + * NvFBC API major version. + */ +#define NVFBC_VERSION_MAJOR 1 + +/*! + * NvFBC API minor version. + */ +#define NVFBC_VERSION_MINOR 7 + +/*! + * NvFBC API version. + */ +#define NVFBC_VERSION (uint32_t) (NVFBC_VERSION_MINOR | (NVFBC_VERSION_MAJOR << 8)) + +/*! + * Creates a version number for structure parameters. + */ +#define NVFBC_STRUCT_VERSION(typeName, ver) \ + (uint32_t) (sizeof(typeName) | ((ver) << 16) | (NVFBC_VERSION << 24)) + +/*! + * Defines error codes. + * + * \see NvFBCGetLastErrorStr + */ +typedef enum _NVFBCSTATUS +{ + /*! + * This indicates that the API call returned with no errors. + */ + NVFBC_SUCCESS = 0, + /*! + * This indicates that the API version between the client and the library + * is not compatible. + */ + NVFBC_ERR_API_VERSION = 1, + /*! + * An internal error occurred. + */ + NVFBC_ERR_INTERNAL = 2, + /*! + * This indicates that one or more of the parameter passed to the API call + * is invalid. + */ + NVFBC_ERR_INVALID_PARAM = 3, + /*! + * This indicates that one or more of the pointers passed to the API call + * is invalid. + */ + NVFBC_ERR_INVALID_PTR = 4, + /*! + * This indicates that the handle passed to the API call to identify the + * client is invalid. + */ + NVFBC_ERR_INVALID_HANDLE = 5, + /*! + * This indicates that the maximum number of threaded clients of the same + * process has been reached. The limit is 10 threads per process. + * There is no limit on the number of process. + */ + NVFBC_ERR_MAX_CLIENTS = 6, + /*! + * This indicates that the requested feature is not currently supported + * by the library. + */ + NVFBC_ERR_UNSUPPORTED = 7, + /*! + * This indicates that the API call failed because it was unable to allocate + * enough memory to perform the requested operation. + */ + NVFBC_ERR_OUT_OF_MEMORY = 8, + /*! + * This indicates that the API call was not expected. This happens when + * API calls are performed in a wrong order, such as trying to capture + * a frame prior to creating a new capture session; or trying to set up + * a capture to video memory although a capture session to system memory + * was created. + */ + NVFBC_ERR_BAD_REQUEST = 9, + /*! + * This indicates an X error, most likely meaning that the X server has + * been terminated. When this error is returned, the only resort is to + * create another FBC handle using NvFBCCreateHandle(). + * + * The previous handle should still be freed with NvFBCDestroyHandle(), but + * it might leak resources, in particular X, GLX, and GL resources since + * it is no longer possible to communicate with an X server to free them + * through the driver. + * + * The best course of action to eliminate this potential leak is to close + * the OpenGL driver, close the forked process running the capture, or + * restart the application. + */ + NVFBC_ERR_X = 10, + /*! + * This indicates a GLX error. + */ + NVFBC_ERR_GLX = 11, + /*! + * This indicates an OpenGL error. + */ + NVFBC_ERR_GL = 12, + /*! + * This indicates a CUDA error. + */ + NVFBC_ERR_CUDA = 13, + /*! + * This indicates a HW encoder error. + */ + NVFBC_ERR_ENCODER = 14, + /*! + * This indicates an NvFBC context error. + */ + NVFBC_ERR_CONTEXT = 15, + /*! + * This indicates that the application must recreate the capture session. + * + * This error can be returned if a modeset event occurred while capturing + * frames, and NVFBC_CREATE_HANDLE_PARAMS::bDisableAutoModesetRecovery + * was set to NVFBC_TRUE. + */ + NVFBC_ERR_MUST_RECREATE = 16, + /*! + * This indicates a Vulkan error. + */ + NVFBC_ERR_VULKAN = 17, +} NVFBCSTATUS; + +/*! + * Defines boolean values. + */ +typedef enum _NVFBC_BOOL +{ + /*! + * False value. + */ + NVFBC_FALSE = 0, + /*! + * True value. + */ + NVFBC_TRUE, +} NVFBC_BOOL; + +/*! + * Maximum size in bytes of an error string. + */ +#define NVFBC_ERR_STR_LEN 512 + +/*! + * Capture type. + */ +typedef enum _NVFBC_CAPTURE_TYPE +{ + /*! + * Capture frames to a buffer in system memory. + */ + NVFBC_CAPTURE_TO_SYS = 0, + /*! + * Capture frames to a CUDA device in video memory. + * + * Specifying this will dlopen() libcuda.so.1 and fail if not available. + */ + NVFBC_CAPTURE_SHARED_CUDA, + /*! + * Retired. Do not use. + */ + /* NVFBC_CAPTURE_TO_HW_ENCODER, */ + /*! + * Capture frames to an OpenGL buffer in video memory. + */ + NVFBC_CAPTURE_TO_GL = 3, +} NVFBC_CAPTURE_TYPE; + +/*! + * Tracking type. + * + * NvFBC can track a specific region of the framebuffer to capture. + * + * An X screen corresponds to the entire framebuffer. + * + * An RandR CRTC is a component of the GPU that reads pixels from a region of + * the X screen and sends them through a pipeline to an RandR output. + * A physical monitor can be connected to an RandR output. Tracking an RandR + * output captures the region of the X screen that the RandR CRTC is sending to + * the RandR output. + */ +typedef enum +{ + /*! + * By default, NvFBC tries to track a connected primary output. If none is + * found, then it tries to track the first connected output. If none is + * found then it tracks the entire X screen. + * + * If the XRandR extension is not available, this option has the same effect + * as ::NVFBC_TRACKING_SCREEN. + * + * This default behavior might be subject to changes in the future. + */ + NVFBC_TRACKING_DEFAULT = 0, + /*! + * Track an RandR output specified by its ID in the appropriate field. + * + * The list of connected outputs can be queried via NvFBCGetStatus(). + * This list can also be obtained using e.g., xrandr(1). + * + * If the XRandR extension is not available, setting this option returns an + * error. + */ + NVFBC_TRACKING_OUTPUT, + /*! + * Track the entire X screen. + */ + NVFBC_TRACKING_SCREEN, +} NVFBC_TRACKING_TYPE; + +/*! + * Buffer format. + */ +typedef enum _NVFBC_BUFFER_FORMAT +{ + /*! + * Data will be converted to ARGB8888 byte-order format. 32 bpp. + */ + NVFBC_BUFFER_FORMAT_ARGB = 0, + /*! + * Data will be converted to RGB888 byte-order format. 24 bpp. + */ + NVFBC_BUFFER_FORMAT_RGB, + /*! + * Data will be converted to NV12 format using HDTV weights + * according to ITU-R BT.709. 12 bpp. + */ + NVFBC_BUFFER_FORMAT_NV12, + /*! + * Data will be converted to YUV 444 planar format using HDTV weights + * according to ITU-R BT.709. 24 bpp + */ + NVFBC_BUFFER_FORMAT_YUV444P, + /*! + * Data will be converted to RGBA8888 byte-order format. 32 bpp. + */ + NVFBC_BUFFER_FORMAT_RGBA, + /*! + * Native format. No pixel conversion needed. + * BGRA8888 byte-order format. 32 bpp. + */ + NVFBC_BUFFER_FORMAT_BGRA, +} NVFBC_BUFFER_FORMAT; + +#define NVFBC_BUFFER_FORMAT_YUV420P NVFBC_BUFFER_FORMAT_NV12 + +/*! + * Handle used to identify an NvFBC session. + */ +typedef uint64_t NVFBC_SESSION_HANDLE; + +/*! + * Box used to describe an area of the tracked region to capture. + * + * The coordinates are relative to the tracked region. + * + * E.g., if the size of the X screen is 3520x1200 and the tracked RandR output + * scans a region of 1600x1200+1920+0, then setting a capture box of + * 800x600+100+50 effectively captures a region of 800x600+2020+50 relative to + * the X screen. + */ +typedef struct _NVFBC_BOX +{ + /*! + * [in] X offset of the box. + */ + uint32_t x; + /*! + * [in] Y offset of the box. + */ + uint32_t y; + /*! + * [in] Width of the box. + */ + uint32_t w; + /*! + * [in] Height of the box. + */ + uint32_t h; +} NVFBC_BOX; + +/*! + * Size used to describe the size of a frame. + */ +typedef struct _NVFBC_SIZE +{ + /*! + * [in] Width. + */ + uint32_t w; + /*! + * [in] Height. + */ + uint32_t h; +} NVFBC_SIZE; + +/*! + * Describes information about a captured frame. + */ +typedef struct _NVFBC_FRAME_GRAB_INFO +{ + /*! + * [out] Width of the captured frame. + */ + uint32_t dwWidth; + /*! + * [out] Height of the captured frame. + */ + uint32_t dwHeight; + /*! + * [out] Size of the frame in bytes. + */ + uint32_t dwByteSize; + /*! + * [out] Incremental ID of the current frame. + * + * This can be used to identify a frame. + */ + uint32_t dwCurrentFrame; + /*! + * [out] Whether the captured frame is a new frame. + * + * When using non blocking calls it is possible to capture a frame + * that was already captured before if the display server did not + * render a new frame in the meantime. In that case, this flag + * will be set to NVFBC_FALSE. + * + * When using blocking calls each captured frame will have + * this flag set to NVFBC_TRUE since the blocking mechanism waits for + * the display server to render a new frame. + * + * Note that this flag does not guarantee that the content of + * the frame will be different compared to the previous captured frame. + * + * In particular, some compositing managers report the entire + * framebuffer as damaged when an application refreshes its content. + * + * Consider a single X screen spanned across physical displays A and B + * and an NvFBC application tracking display A. Depending on the + * compositing manager, it is possible that an application refreshing + * itself on display B will trigger a frame capture on display A. + * + * Workarounds include: + * - Using separate X screens + * - Disabling the composite extension + * - Using a compositing manager that properly reports what regions + * are damaged + * - Using NvFBC's diffmaps to find out if the frame changed + */ + NVFBC_BOOL bIsNewFrame; + /*! + * [out] Frame timestamp + * + * Time in micro seconds when the display server started rendering the + * frame. + * + * This does not account for when the frame was captured. If capturing an + * old frame (e.g., bIsNewFrame is NVFBC_FALSE) the reported timestamp + * will reflect the time when the old frame was rendered by the display + * server. + */ + uint64_t ulTimestampUs; + /* + * [out] Number of frames generated since the last capture. + * + * This can help applications tell whether they missed frames or there + * were no frames generated by the server since the last capture. + */ + uint32_t dwMissedFrames; + /* + * [out] Whether the captured frame required post processing. + * + * See the 'Post Processing' section. + */ + NVFBC_BOOL bRequiredPostProcessing; + /* + * [out] Whether this frame was obtained via direct capture. + * + * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture. + */ + NVFBC_BOOL bDirectCapture; +} NVFBC_FRAME_GRAB_INFO; + +/*! + * Defines parameters for the CreateHandle() API call. + */ +typedef struct _NVFBC_CREATE_HANDLE_PARAMS +{ + /*! + * [in] Must be set to NVFBC_CREATE_HANDLE_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Application specific private information passed to the NvFBC + * session. + */ + const void *privateData; + /*! + * [in] Size of the application specific private information passed to the + * NvFBC session. + */ + uint32_t privateDataSize; + /*! + * [in] Whether NvFBC should not create and manage its own graphics context + * + * NvFBC internally uses OpenGL to perfom graphics operations on the + * captured frames. By default, NvFBC will create and manage (e.g., make + * current, detect new threads, etc.) its own OpenGL context. + * + * If set to NVFBC_TRUE, NvFBC will use the application's context. It will + * be the application's responsibility to make sure that a context is + * current on the thread calling into the NvFBC API. + */ + NVFBC_BOOL bExternallyManagedContext; + /*! + * [in] GLX context + * + * GLX context that NvFBC should use internally to create pixmaps and + * make them current when creating a new capture session. + * + * Note: NvFBC expects a context created against a GLX_RGBA_TYPE render + * type. + */ + void *glxCtx; + /*! + * [in] GLX framebuffer configuration + * + * Framebuffer configuration that was used to create the GLX context, and + * that will be used to create pixmaps internally. + * + * Note: NvFBC expects a configuration having at least the following + * attributes: + * GLX_DRAWABLE_TYPE, GLX_PIXMAP_BIT + * GLX_BIND_TO_TEXTURE_RGBA_EXT, 1 + * GLX_BIND_TO_TEXTURE_TARGETS_EXT, GLX_TEXTURE_2D_BIT_EXT + */ + void *glxFBConfig; +} NVFBC_CREATE_HANDLE_PARAMS; + +/*! + * NVFBC_CREATE_HANDLE_PARAMS structure version. + */ +#define NVFBC_CREATE_HANDLE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CREATE_HANDLE_PARAMS, 2) + +/*! + * Defines parameters for the ::NvFBCDestroyHandle() API call. + */ +typedef struct _NVFBC_DESTROY_HANDLE_PARAMS +{ + /*! + * [in] Must be set to NVFBC_DESTROY_HANDLE_PARAMS_VER + */ + uint32_t dwVersion; +} NVFBC_DESTROY_HANDLE_PARAMS; + +/*! + * NVFBC_DESTROY_HANDLE_PARAMS structure version. + */ +#define NVFBC_DESTROY_HANDLE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_DESTROY_HANDLE_PARAMS, 1) + +/*! + * Maximum number of connected RandR outputs to an X screen. + */ +#define NVFBC_OUTPUT_MAX 5 + +/*! + * Maximum size in bytes of an RandR output name. + */ +#define NVFBC_OUTPUT_NAME_LEN 128 + +/*! + * Describes an RandR output. + * + * Filling this structure relies on the XRandR extension. This feature cannot + * be used if the extension is missing or its version is below the requirements. + * + * \see Requirements + */ +typedef struct _NVFBC_OUTPUT +{ + /*! + * Identifier of the RandR output. + */ + uint32_t dwId; + /*! + * Name of the RandR output, as reported by tools such as xrandr(1). + * + * Example: "DVI-I-0" + */ + char name[NVFBC_OUTPUT_NAME_LEN]; + /*! + * Region of the X screen tracked by the RandR CRTC driving this RandR + * output. + */ + NVFBC_BOX trackedBox; +} NVFBC_RANDR_OUTPUT_INFO; + +/*! + * Defines parameters for the ::NvFBCGetStatus() API call. + */ +typedef struct _NVFBC_GET_STATUS_PARAMS +{ + /*! + * [in] Must be set to NVFBC_GET_STATUS_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [out] Whether or not framebuffer capture is supported by the graphics + * driver. + */ + NVFBC_BOOL bIsCapturePossible; + /*! + * [out] Whether or not there is already a capture session on this system. + */ + NVFBC_BOOL bCurrentlyCapturing; + /*! + * [out] Whether or not it is possible to create a capture session on this + * system. + */ + NVFBC_BOOL bCanCreateNow; + /*! + * [out] Size of the X screen (framebuffer). + */ + NVFBC_SIZE screenSize; + /*! + * [out] Whether the XRandR extension is available. + * + * If this extension is not available then it is not possible to have + * information about RandR outputs. + */ + NVFBC_BOOL bXRandRAvailable; + /*! + * [out] Array of outputs connected to the X screen. + * + * An application can track a specific output by specifying its ID when + * creating a capture session. + * + * Only if XRandR is available. + */ + NVFBC_RANDR_OUTPUT_INFO outputs[NVFBC_OUTPUT_MAX]; + /*! + * [out] Number of outputs connected to the X screen. + * + * This must be used to parse the array of connected outputs. + * + * Only if XRandR is available. + */ + uint32_t dwOutputNum; + /*! + * [out] Version of the NvFBC library running on this system. + */ + uint32_t dwNvFBCVersion; + /*! + * [out] Whether the X server is currently in modeset. + * + * When the X server is in modeset, it must give up all its video + * memory allocations. It is not possible to create a capture + * session until the modeset is over. + * + * Note that VT-switches are considered modesets. + */ + NVFBC_BOOL bInModeset; +} NVFBC_GET_STATUS_PARAMS; + +/*! + * NVFBC_GET_STATUS_PARAMS structure version. + */ +#define NVFBC_GET_STATUS_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_GET_STATUS_PARAMS, 2) + +/*! + * Defines parameters for the ::NvFBCCreateCaptureSession() API call. + */ +typedef struct _NVFBC_CREATE_CAPTURE_SESSION_PARAMS +{ + /*! + * [in] Must be set to NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Desired capture type. + * + * Note that when specyfing ::NVFBC_CAPTURE_SHARED_CUDA NvFBC will try to + * dlopen() the corresponding libraries. This means that NvFBC can run on + * a system without the CUDA library since it does not link against them. + */ + NVFBC_CAPTURE_TYPE eCaptureType; + /*! + * [in] What region of the framebuffer should be tracked. + */ + NVFBC_TRACKING_TYPE eTrackingType; + /*! + * [in] ID of the output to track if eTrackingType is set to + * ::NVFBC_TRACKING_OUTPUT. + */ + uint32_t dwOutputId; + /*! + * [in] Crop the tracked region. + * + * The coordinates are relative to the tracked region. + * + * It can be set to 0 to capture the entire tracked region. + */ + NVFBC_BOX captureBox; + /*! + * [in] Desired size of the captured frame. + * + * This parameter allow to scale the captured frame. + * + * It can be set to 0 to disable frame resizing. + */ + NVFBC_SIZE frameSize; + /*! + * [in] Whether the mouse cursor should be composited to the frame. + * + * Disabling the cursor will not generate new frames when only the cursor + * is moved. + */ + NVFBC_BOOL bWithCursor; + /*! + * [in] Whether NvFBC should not attempt to recover from modesets. + * + * NvFBC is able to detect when a modeset event occured and can automatically + * re-create a capture session with the same settings as before, then resume + * its frame capture session transparently. + * + * This option allows to disable this behavior. NVFBC_ERR_MUST_RECREATE + * will be returned in that case. + * + * It can be useful in the cases when an application needs to do some work + * between setting up a capture and grabbing the first frame. + * + * For example: an application using the ToGL interface needs to register + * resources with EncodeAPI prior to encoding frames. + * + * Note that during modeset recovery, NvFBC will try to re-create the + * capture session every second until it succeeds. + */ + NVFBC_BOOL bDisableAutoModesetRecovery; + /*! + * [in] Whether NvFBC should round the requested frameSize. + * + * When disabled, NvFBC will not attempt to round the requested resolution. + * + * However, some pixel formats have resolution requirements. E.g., YUV/NV + * formats must have a width being a multiple of 4, and a height being a + * multiple of 2. RGB formats don't have such requirements. + * + * If the resolution doesn't meet the requirements of the format, then NvFBC + * will fail at setup time. + * + * When enabled, NvFBC will round the requested width to the next multiple + * of 4 and the requested height to the next multiple of 2. + * + * In this case, requesting any resolution will always work with every + * format. However, an NvFBC client must be prepared to handle the case + * where the requested resolution is different than the captured resolution. + * + * NVFBC_FRAME_GRAB_INFO::dwWidth and NVFBC_FRAME_GRAB_INFO::dwHeight should + * always be used for getting information about captured frames. + */ + NVFBC_BOOL bRoundFrameSize; + /*! + * [in] Rate in ms at which the display server generates new frames + * + * This controls the frequency at which the display server will generate + * new frames if new content is available. This effectively controls the + * capture rate when using blocking calls. + * + * Note that lower values will increase the CPU and GPU loads. + * + * The default value is 16ms (~ 60 Hz). + */ + uint32_t dwSamplingRateMs; + /*! + * [in] Enable push model for frame capture + * + * When set to NVFBC_TRUE, the display server will generate frames whenever + * it receives a damage event from applications. + * + * Setting this to NVFBC_TRUE will ignore ::dwSamplingRateMs. + * + * Using push model with the NVFBC_*_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY + * capture flag should guarantee the shortest amount of time between an + * application rendering a frame and an NvFBC client capturing it, provided + * that the NvFBC client is able to process the frames quickly enough. + * + * Note that applications running at high frame rates will increase CPU and + * GPU loads. + */ + NVFBC_BOOL bPushModel; + /*! + * [in] Allow direct capture + * + * Direct capture allows NvFBC to attach itself to a fullscreen graphics + * application. Whenever that application presents a frame, it makes a copy + * of it directly into a buffer owned by NvFBC thus bypassing the X server. + * + * When direct capture is *not* enabled, the NVIDIA X driver generates a + * frame for NvFBC when it receives a damage event from an application if push + * model is enabled, or periodically checks if there are any pending damage + * events otherwise (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::dwSamplingRateMs). + * + * Direct capture is possible under the following conditions: + * - Direct capture is allowed + * - Push model is enabled (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bPushModel) + * - The mouse cursor is not composited (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bWithCursor) + * - No viewport transformation is required. This happens when the remote + * desktop is e.g. rotated. + * + * When direct capture is possible, NvFBC will automatically attach itself + * to a fullscreen unoccluded application, if such exists. + * + * Notes: + * - This includes compositing desktops such as GNOME (e.g., gnome-shell + * is the fullscreen unoccluded application). + * - There can be only one fullscreen unoccluded application at a time. + * - The NVIDIA X driver monitors which application qualifies or no + * longer qualifies. + * + * For example, if a fullscreen application is launched in GNOME, NvFBC will + * detach from gnome-shell and attach to that application. + * + * Attaching and detaching happens automatically from the perspective of an + * NvFBC client. When detaching from an application, the X driver will + * transparently resume generating frames for NvFBC. + * + * An application can know whether a given frame was obtained through + * direct capture by checking NVFBC_FRAME_GRAB_INFO::bDirectCapture. + */ + NVFBC_BOOL bAllowDirectCapture; +} NVFBC_CREATE_CAPTURE_SESSION_PARAMS; + +/*! + * NVFBC_CREATE_CAPTURE_SESSION_PARAMS structure version. + */ +#define NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CREATE_CAPTURE_SESSION_PARAMS, 6) + +/*! + * Defines parameters for the ::NvFBCDestroyCaptureSession() API call. + */ +typedef struct _NVFBC_DESTROY_CAPTURE_SESSION_PARAMS +{ + /*! + * [in] Must be set to NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER + */ + uint32_t dwVersion; +} NVFBC_DESTROY_CAPTURE_SESSION_PARAMS; + +/*! + * NVFBC_DESTROY_CAPTURE_SESSION_PARAMS structure version. + */ +#define NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_DESTROY_CAPTURE_SESSION_PARAMS, 1) + +/*! + * Defines parameters for the ::NvFBCBindContext() API call. + */ +typedef struct _NVFBC_BIND_CONTEXT_PARAMS +{ + /*! + * [in] Must be set to NVFBC_BIND_CONTEXT_PARAMS_VER + */ + uint32_t dwVersion; +} NVFBC_BIND_CONTEXT_PARAMS; + +/*! + * NVFBC_BIND_CONTEXT_PARAMS structure version. + */ +#define NVFBC_BIND_CONTEXT_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_BIND_CONTEXT_PARAMS, 1) + +/*! + * Defines parameters for the ::NvFBCReleaseContext() API call. + */ +typedef struct _NVFBC_RELEASE_CONTEXT_PARAMS +{ + /*! + * [in] Must be set to NVFBC_RELEASE_CONTEXT_PARAMS_VER + */ + uint32_t dwVersion; +} NVFBC_RELEASE_CONTEXT_PARAMS; + +/*! + * NVFBC_RELEASE_CONTEXT_PARAMS structure version. + */ +#define NVFBC_RELEASE_CONTEXT_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_RELEASE_CONTEXT_PARAMS, 1) + +/*! + * Defines flags that can be used when capturing to system memory. + */ +typedef enum +{ + /*! + * Default, capturing waits for a new frame or mouse move. + * + * The default behavior of blocking grabs is to wait for a new frame until + * after the call was made. But it's possible that there is a frame already + * ready that the client hasn't seen. + * \see NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY + */ + NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS = 0, + /*! + * Capturing does not wait for a new frame nor a mouse move. + * + * It is therefore possible to capture the same frame multiple times. + * When this occurs, the dwCurrentFrame parameter of the + * NVFBC_FRAME_GRAB_INFO structure is not incremented. + */ + NVFBC_TOSYS_GRAB_FLAGS_NOWAIT = (1 << 0), + /*! + * Forces the destination buffer to be refreshed even if the frame has not + * changed since previous capture. + * + * By default, if the captured frame is identical to the previous one, NvFBC + * will omit one copy and not update the destination buffer. + * + * Setting that flag will prevent this behavior. This can be useful e.g., + * if the application has modified the buffer in the meantime. + */ + NVFBC_TOSYS_GRAB_FLAGS_FORCE_REFRESH = (1 << 1), + /*! + * Similar to NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS, except that the capture will + * not wait if there is already a frame available that the client has + * never seen yet. + */ + NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2), +} NVFBC_TOSYS_GRAB_FLAGS; + +/*! + * Defines parameters for the ::NvFBCToSysSetUp() API call. + */ +typedef struct _NVFBC_TOSYS_SETUP_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOSYS_SETUP_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Desired buffer format. + */ + NVFBC_BUFFER_FORMAT eBufferFormat; + /*! + * [out] Pointer to a pointer to a buffer in system memory. + * + * This buffer contains the pixel value of the requested format. Refer to + * the description of the buffer formats to understand the memory layout. + * + * The application does not need to allocate memory for this buffer. It + * should not free this buffer either. This buffer is automatically + * re-allocated when needed (e.g., when the resolution changes). + * + * This buffer is allocated by the NvFBC library to the proper size. This + * size is returned in the dwByteSize field of the + * ::NVFBC_FRAME_GRAB_INFO structure. + */ + void **ppBuffer; + /*! + * [in] Whether differential maps should be generated. + */ + NVFBC_BOOL bWithDiffMap; + /*! + * [out] Pointer to a pointer to a buffer in system memory. + * + * This buffer contains the differential map of two frames. It must be read + * as an array of unsigned char. Each unsigned char is either 0 or + * non-zero. 0 means that the pixel value at the given location has not + * changed since the previous captured frame. Non-zero means that the pixel + * value has changed. + * + * The application does not need to allocate memory for this buffer. It + * should not free this buffer either. This buffer is automatically + * re-allocated when needed (e.g., when the resolution changes). + * + * This buffer is allocated by the NvFBC library to the proper size. The + * size of the differential map is returned in ::diffMapSize. + * + * This option is not compatible with the ::NVFBC_BUFFER_FORMAT_YUV420P and + * ::NVFBC_BUFFER_FORMAT_YUV444P buffer formats. + */ + void **ppDiffMap; + /*! + * [in] Scaling factor of the differential maps. + * + * For example, a scaling factor of 16 means that one pixel of the diffmap + * will represent 16x16 pixels of the original frames. + * + * If any of these 16x16 pixels is different between the current and the + * previous frame, then the corresponding pixel in the diffmap will be set + * to non-zero. + * + * The default scaling factor is 1. A dwDiffMapScalingFactor of 0 will be + * set to 1. + */ + uint32_t dwDiffMapScalingFactor; + /*! + * [out] Size of the differential map. + * + * Only set if bWithDiffMap is set to NVFBC_TRUE. + */ + NVFBC_SIZE diffMapSize; +} NVFBC_TOSYS_SETUP_PARAMS; + +/*! + * NVFBC_TOSYS_SETUP_PARAMS structure version. + */ +#define NVFBC_TOSYS_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOSYS_SETUP_PARAMS, 3) + +/*! + * Defines parameters for the ::NvFBCToSysGrabFrame() API call. + */ +typedef struct _NVFBC_TOSYS_GRAB_FRAME_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Flags defining the behavior of this frame capture. + */ + uint32_t dwFlags; + /*! + * [out] Information about the captured frame. + * + * Can be NULL. + */ + NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo; + /*! + * [in] Wait timeout in milliseconds. + * + * When capturing frames with the NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS or + * NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags, + * NvFBC will wait for a new frame or mouse move until the below timer + * expires. + * + * When timing out, the last captured frame will be returned. Note that as + * long as the NVFBC_TOSYS_GRAB_FLAGS_FORCE_REFRESH flag is not set, + * returning an old frame will incur no performance penalty. + * + * NvFBC clients can use the return value of the grab frame operation to + * find out whether a new frame was captured, or the timer expired. + * + * Note that the behavior of blocking calls is to wait for a new frame + * *after* the call has been made. When using timeouts, it is possible + * that NvFBC will return a new frame (e.g., it has never been captured + * before) even though no new frame was generated after the grab call. + * + * For the precise definition of what constitutes a new frame, see + * ::bIsNewFrame. + * + * Set to 0 to disable timeouts. + */ + uint32_t dwTimeoutMs; +} NVFBC_TOSYS_GRAB_FRAME_PARAMS; + +/*! + * NVFBC_TOSYS_GRAB_FRAME_PARAMS structure version. + */ +#define NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOSYS_GRAB_FRAME_PARAMS, 2) + +/*! + * Defines flags that can be used when capturing to a CUDA buffer in video memory. + */ +typedef enum +{ + /*! + * Default, capturing waits for a new frame or mouse move. + * + * The default behavior of blocking grabs is to wait for a new frame until + * after the call was made. But it's possible that there is a frame already + * ready that the client hasn't seen. + * \see NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY + */ + NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS = 0, + /*! + * Capturing does not wait for a new frame nor a mouse move. + * + * It is therefore possible to capture the same frame multiple times. + * When this occurs, the dwCurrentFrame parameter of the + * NVFBC_FRAME_GRAB_INFO structure is not incremented. + */ + NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT = (1 << 0), + /*! + * [in] Forces the destination buffer to be refreshed even if the frame + * has not changed since previous capture. + * + * By default, if the captured frame is identical to the previous one, NvFBC + * will omit one copy and not update the destination buffer. + * + * Setting that flag will prevent this behavior. This can be useful e.g., + * if the application has modified the buffer in the meantime. + */ + NVFBC_TOCUDA_GRAB_FLAGS_FORCE_REFRESH = (1 << 1), + /*! + * Similar to NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS, except that the capture will + * not wait if there is already a frame available that the client has + * never seen yet. + */ + NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2), +} NVFBC_TOCUDA_FLAGS; + +/*! + * Defines parameters for the ::NvFBCToCudaSetUp() API call. + */ +typedef struct _NVFBC_TOCUDA_SETUP_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOCUDA_SETUP_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Desired buffer format. + */ + NVFBC_BUFFER_FORMAT eBufferFormat; +} NVFBC_TOCUDA_SETUP_PARAMS; + +/*! + * NVFBC_TOCUDA_SETUP_PARAMS structure version. + */ +#define NVFBC_TOCUDA_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOCUDA_SETUP_PARAMS, 1) + +/*! + * Defines parameters for the ::NvFBCToCudaGrabFrame() API call. + */ +typedef struct _NVFBC_TOCUDA_GRAB_FRAME_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER. + */ + uint32_t dwVersion; + /*! + * [in] Flags defining the behavior of this frame capture. + */ + uint32_t dwFlags; + /*! + * [out] Pointer to a ::CUdeviceptr + * + * The application does not need to allocate memory for this CUDA device. + * + * The application does need to create its own CUDA context to use this + * CUDA device. + * + * This ::CUdeviceptr will be mapped to a segment in video memory containing + * the frame. It is not possible to process a CUDA device while capturing + * a new frame. If the application wants to do so, it must copy the CUDA + * device using ::cuMemcpyDtoD or ::cuMemcpyDtoH beforehand. + */ + void *pCUDADeviceBuffer; + /*! + * [out] Information about the captured frame. + * + * Can be NULL. + */ + NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo; + /*! + * [in] Wait timeout in milliseconds. + * + * When capturing frames with the NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS or + * NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags, + * NvFBC will wait for a new frame or mouse move until the below timer + * expires. + * + * When timing out, the last captured frame will be returned. Note that as + * long as the NVFBC_TOCUDA_GRAB_FLAGS_FORCE_REFRESH flag is not set, + * returning an old frame will incur no performance penalty. + * + * NvFBC clients can use the return value of the grab frame operation to + * find out whether a new frame was captured, or the timer expired. + * + * Note that the behavior of blocking calls is to wait for a new frame + * *after* the call has been made. When using timeouts, it is possible + * that NvFBC will return a new frame (e.g., it has never been captured + * before) even though no new frame was generated after the grab call. + * + * For the precise definition of what constitutes a new frame, see + * ::bIsNewFrame. + * + * Set to 0 to disable timeouts. + */ + uint32_t dwTimeoutMs; +} NVFBC_TOCUDA_GRAB_FRAME_PARAMS; + +/*! + * NVFBC_TOCUDA_GRAB_FRAME_PARAMS structure version. + */ +#define NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOCUDA_GRAB_FRAME_PARAMS, 2) + +/*! + * Defines flags that can be used when capturing to an OpenGL buffer in video memory. + */ +typedef enum +{ + /*! + * Default, capturing waits for a new frame or mouse move. + * + * The default behavior of blocking grabs is to wait for a new frame until + * after the call was made. But it's possible that there is a frame already + * ready that the client hasn't seen. + * \see NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY + */ + NVFBC_TOGL_GRAB_FLAGS_NOFLAGS = 0, + /*! + * Capturing does not wait for a new frame nor a mouse move. + * + * It is therefore possible to capture the same frame multiple times. + * When this occurs, the dwCurrentFrame parameter of the + * NVFBC_FRAME_GRAB_INFO structure is not incremented. + */ + NVFBC_TOGL_GRAB_FLAGS_NOWAIT = (1 << 0), + /*! + * [in] Forces the destination buffer to be refreshed even if the frame + * has not changed since previous capture. + * + * By default, if the captured frame is identical to the previous one, NvFBC + * will omit one copy and not update the destination buffer. + * + * Setting that flag will prevent this behavior. This can be useful e.g., + * if the application has modified the buffer in the meantime. + */ + NVFBC_TOGL_GRAB_FLAGS_FORCE_REFRESH = (1 << 1), + /*! + * Similar to NVFBC_TOGL_GRAB_FLAGS_NOFLAGS, except that the capture will + * not wait if there is already a frame available that the client has + * never seen yet. + */ + NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2), +} NVFBC_TOGL_FLAGS; + +/*! + * Maximum number of GL textures that can be used to store frames. + */ +#define NVFBC_TOGL_TEXTURES_MAX 2 + +/*! + * Defines parameters for the ::NvFBCToGLSetUp() API call. + */ +typedef struct _NVFBC_TOGL_SETUP_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOGL_SETUP_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Desired buffer format. + */ + NVFBC_BUFFER_FORMAT eBufferFormat; + /*! + * [in] Whether differential maps should be generated. + */ + NVFBC_BOOL bWithDiffMap; + /*! + * [out] Pointer to a pointer to a buffer in system memory. + * + * \see NVFBC_TOSYS_SETUP_PARAMS::ppDiffMap + */ + void **ppDiffMap; + /*! + * [in] Scaling factor of the differential maps. + * + * \see NVFBC_TOSYS_SETUP_PARAMS::dwDiffMapScalingFactor + */ + uint32_t dwDiffMapScalingFactor; + /*! + * [out] List of GL textures that will store the captured frames. + * + * This array is 0 terminated. The number of textures varies depending on + * the capture settings (such as whether diffmaps are enabled). + * + * An application wishing to interop with, for example, EncodeAPI will need + * to register these textures prior to start encoding frames. + * + * After each frame capture, the texture holding the current frame will be + * returned in NVFBC_TOGL_GRAB_FRAME_PARAMS::dwTexture. + */ + uint32_t dwTextures[NVFBC_TOGL_TEXTURES_MAX]; + /*! + * [out] GL target to which the texture should be bound. + */ + uint32_t dwTexTarget; + /*! + * [out] GL format of the textures. + */ + uint32_t dwTexFormat; + /*! + * [out] GL type of the textures. + */ + uint32_t dwTexType; + /*! + * [out] Size of the differential map. + * + * Only set if bWithDiffMap is set to NVFBC_TRUE. + */ + NVFBC_SIZE diffMapSize; +} NVFBC_TOGL_SETUP_PARAMS; + +/*! + * NVFBC_TOGL_SETUP_PARAMS structure version. + */ +#define NVFBC_TOGL_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOGL_SETUP_PARAMS, 2) + +/*! + * Defines parameters for the ::NvFBCToGLGrabFrame() API call. + */ +typedef struct _NVFBC_TOGL_GRAB_FRAME_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOGL_GRAB_FRAME_PARAMS_VER. + */ + uint32_t dwVersion; + /*! + * [in] Flags defining the behavior of this frame capture. + */ + uint32_t dwFlags; + /*! + * [out] Index of the texture storing the current frame. + * + * This is an index in the NVFBC_TOGL_SETUP_PARAMS::dwTextures array. + */ + uint32_t dwTextureIndex; + /*! + * [out] Information about the captured frame. + * + * Can be NULL. + */ + NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo; + /*! + * [in] Wait timeout in milliseconds. + * + * When capturing frames with the NVFBC_TOGL_GRAB_FLAGS_NOFLAGS or + * NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags, + * NvFBC will wait for a new frame or mouse move until the below timer + * expires. + * + * When timing out, the last captured frame will be returned. Note that as + * long as the NVFBC_TOGL_GRAB_FLAGS_FORCE_REFRESH flag is not set, + * returning an old frame will incur no performance penalty. + * + * NvFBC clients can use the return value of the grab frame operation to + * find out whether a new frame was captured, or the timer expired. + * + * Note that the behavior of blocking calls is to wait for a new frame + * *after* the call has been made. When using timeouts, it is possible + * that NvFBC will return a new frame (e.g., it has never been captured + * before) even though no new frame was generated after the grab call. + * + * For the precise definition of what constitutes a new frame, see + * ::bIsNewFrame. + * + * Set to 0 to disable timeouts. + */ + uint32_t dwTimeoutMs; +} NVFBC_TOGL_GRAB_FRAME_PARAMS; + +/*! + * NVFBC_TOGL_GRAB_FRAME_PARAMS structure version. + */ +#define NVFBC_TOGL_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOGL_GRAB_FRAME_PARAMS, 2) + +/*! @} FBC_STRUCT */ + +/*! + * \defgroup FBC_FUNC API Entry Points + * + * Entry points are thread-safe and can be called concurrently. + * + * The locking model includes a global lock that protects session handle + * management (\see NvFBCCreateHandle, \see NvFBCDestroyHandle). + * + * Each NvFBC session uses a local lock to protect other entry points. Note + * that in certain cases, a thread can hold the local lock for an undefined + * amount of time, such as grabbing a frame using a blocking call. + * + * Note that a context is associated with each session. NvFBC clients wishing + * to share a session between different threads are expected to release and + * bind the context appropriately (\see NvFBCBindContext, + * \see NvFBCReleaseContext). This is not required when each thread uses its + * own NvFBC session. + * + * @{ + */ + +/*! + * Gets the last error message that got recorded for a client. + * + * When NvFBC returns an error, it will save an error message that can be + * queried through this API call. Only the last message is saved. + * The message and the return code should give enough information about + * what went wrong. + * + * \param [in] sessionHandle + * Handle to the NvFBC client. + * \return + * A NULL terminated error message, or an empty string. Its maximum length + * is NVFBC_ERROR_STR_LEN. + */ +const char* NVFBCAPI NvFBCGetLastErrorStr(const NVFBC_SESSION_HANDLE sessionHandle); + +/*! + * \brief Allocates a new handle for an NvFBC client. + * + * This function allocates a session handle used to identify an FBC client. + * + * This function implicitly calls NvFBCBindContext(). + * + * \param [out] pSessionHandle + * Pointer that will hold the allocated session handle. + * \param [in] pParams + * ::NVFBC_CREATE_HANDLE_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_OUT_OF_MEMORY \n + * ::NVFBC_ERR_MAX_CLIENTS \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_GLX \n + * ::NVFBC_ERR_GL + * + */ +NVFBCSTATUS NVFBCAPI NvFBCCreateHandle(NVFBC_SESSION_HANDLE *pSessionHandle, NVFBC_CREATE_HANDLE_PARAMS *pParams); + +/*! + * \brief Destroys the handle of an NvFBC client. + * + * This function uninitializes an FBC client. + * + * This function implicitly calls NvFBCReleaseContext(). + * + * After this fucntion returns, it is not possible to use this session handle + * for any further API call. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_DESTROY_HANDLE_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCDestroyHandle(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_HANDLE_PARAMS *pParams); + +/*! + * \brief Gets the current status of the display driver. + * + * This function queries the display driver for various information. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_GET_STATUS_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCGetStatus(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_GET_STATUS_PARAMS *pParams); + +/*! + * \brief Binds the FBC context to the calling thread. + * + * The NvFBC library internally relies on objects that must be bound to a + * thread. Such objects are OpenGL contexts and CUDA contexts. + * + * This function binds these objects to the calling thread. + * + * The FBC context must be bound to the calling thread for most NvFBC entry + * points, otherwise ::NVFBC_ERR_CONTEXT is returned. + * + * If the FBC context is already bound to a different thread, + * ::NVFBC_ERR_CONTEXT is returned. The other thread must release the context + * first by calling the ReleaseContext() entry point. + * + * If the FBC context is already bound to the current thread, this function has + * no effects. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_DESTROY_CAPTURE_SESSION_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCBindContext(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_BIND_CONTEXT_PARAMS *pParams); + +/*! + * \brief Releases the FBC context from the calling thread. + * + * If the FBC context is bound to a different thread, ::NVFBC_ERR_CONTEXT is + * returned. + * + * If the FBC context is already released, this functino has no effects. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCReleaseContext(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_RELEASE_CONTEXT_PARAMS *pParams); + +/*! + * \brief Creates a capture session for an FBC client. + * + * This function starts a capture session of the desired type (system memory, + * video memory with CUDA interop, or H.264 compressed frames in system memory). + * + * Not all types are supported on all systems. Also, it is possible to use + * NvFBC without having the CUDA library. In this case, requesting a capture + * session of the concerned type will return an error. + * + * After this function returns, the display driver will start generating frames + * that can be captured using the corresponding API call. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_CREATE_CAPTURE_SESSION_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INVALID_PARAM \n + * ::NVFBC_ERR_OUT_OF_MEMORY \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_GLX \n + * ::NVFBC_ERR_GL \n + * ::NVFBC_ERR_CUDA \n + * ::NVFBC_ERR_MUST_RECREATE \n + * ::NVFBC_ERR_INTERNAL + */ +NVFBCSTATUS NVFBCAPI NvFBCCreateCaptureSession(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_CREATE_CAPTURE_SESSION_PARAMS *pParams); + +/*! + * \brief Destroys a capture session for an FBC client. + * + * This function stops a capture session and frees allocated objects. + * + * After this function returns, it is possible to create another capture + * session using the corresponding API call. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_DESTROY_CAPTURE_SESSION_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCDestroyCaptureSession(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_CAPTURE_SESSION_PARAMS *pParams); + +/*! + * \brief Sets up a capture to system memory session. + * + * This function configures how the capture to system memory should behave. It + * can be called anytime and several times after the capture session has been + * created. However, it must be called at least once prior to start capturing + * frames. + * + * This function allocates the buffer that will contain the captured frame. + * The application does not need to free this buffer. The size of this buffer + * is returned in the ::NVFBC_FRAME_GRAB_INFO structure. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_TOSYS_SETUP_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_UNSUPPORTED \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_INVALID_PARAM \n + * ::NVFBC_ERR_OUT_OF_MEMORY \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCToSysSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_SETUP_PARAMS *pParams); + +/*! + * \brief Captures a frame to a buffer in system memory. + * + * This function triggers a frame capture to a buffer in system memory that was + * registered with the ToSysSetUp() API call. + * + * Note that it is possible that the resolution of the desktop changes while + * capturing frames. This should be transparent for the application. + * + * When the resolution changes, the capture session is recreated using the same + * parameters, and necessary buffers are re-allocated. The frame counter is not + * reset. + * + * An application can detect that the resolution changed by comparing the + * dwByteSize member of the ::NVFBC_FRAME_GRAB_INFO against a previous + * frame and/or dwWidth and dwHeight. + * + * During a change of resolution the capture is paused even in asynchronous + * mode. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_TOSYS_GRAB_FRAME_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_MUST_RECREATE \n + * \see NvFBCCreateCaptureSession \n + * \see NvFBCToSysSetUp + */ +NVFBCSTATUS NVFBCAPI NvFBCToSysGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParams); + +/*! + * \brief Sets up a capture to video memory session. + * + * This function configures how the capture to video memory with CUDA interop + * should behave. It can be called anytime and several times after the capture + * session has been created. However, it must be called at least once prior + * to start capturing frames. + * + * \param [in] sessionHandle + * FBC session handle. + * + * \param [in] pParams + * ::NVFBC_TOCUDA_SETUP_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_UNSUPPORTED \n + * ::NVFBC_ERR_GL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCToCudaSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_SETUP_PARAMS *pParams); + +/*! + * \brief Captures a frame to a CUDA device in video memory. + * + * This function triggers a frame capture to a CUDA device in video memory. + * + * Note about changes of resolution: \see NvFBCToSysGrabFrame + * + * \param [in] sessionHandle + * FBC session handle. + * + * \param [in] pParams + * ::NVFBC_TOCUDA_GRAB_FRAME_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_CUDA \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_MUST_RECREATE \n + * \see NvFBCCreateCaptureSession \n + * \see NvFBCToCudaSetUp + */ +NVFBCSTATUS NVFBCAPI NvFBCToCudaGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_GRAB_FRAME_PARAMS *pParams); + +/*! + * \brief Sets up a capture to OpenGL buffer in video memory session. + * + * This function configures how the capture to video memory should behave. + * It can be called anytime and several times after the capture session has been + * created. However, it must be called at least once prior to start capturing + * frames. + * + * \param [in] sessionHandle + * FBC session handle. + * + * \param [in] pParams + * ::NVFBC_TOGL_SETUP_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_UNSUPPORTED \n + * ::NVFBC_ERR_GL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCToGLSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_SETUP_PARAMS *pParams); + +/*! + * \brief Captures a frame to an OpenGL buffer in video memory. + * + * This function triggers a frame capture to a selected resource in video memory. + * + * Note about changes of resolution: \see NvFBCToSysGrabFrame + * + * \param [in] sessionHandle + * FBC session handle. + * + * \param [in] pParams + * ::NVFBC_TOGL_GRAB_FRAME_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_MUST_RECREATE \n + * \see NvFBCCreateCaptureSession \n + * \see NvFBCToCudaSetUp + */ +NVFBCSTATUS NVFBCAPI NvFBCToGLGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_GRAB_FRAME_PARAMS *pParams); + +/*! + * \cond FBC_PFN + * + * Defines API function pointers + */ +typedef const char* (NVFBCAPI* PNVFBCGETLASTERRORSTR)(const NVFBC_SESSION_HANDLE sessionHandle); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATEHANDLE)(NVFBC_SESSION_HANDLE *pSessionHandle, NVFBC_CREATE_HANDLE_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCDESTROYHANDLE)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_HANDLE_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCBINDCONTEXT)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_BIND_CONTEXT_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCRELEASECONTEXT)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_RELEASE_CONTEXT_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCGETSTATUS)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_GET_STATUS_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATECAPTURESESSION)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_CREATE_CAPTURE_SESSION_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCDESTROYCAPTURESESSION)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_CAPTURE_SESSION_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOSYSSETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_SETUP_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOSYSGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOCUDASETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_SETUP_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOCUDAGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_GRAB_FRAME_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOGLSETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_SETUP_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOGLGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_GRAB_FRAME_PARAMS *pParams); + +/// \endcond + +/*! @} FBC_FUNC */ + +/*! + * \ingroup FBC_STRUCT + * + * Structure populated with API function pointers. + */ +typedef struct +{ + uint32_t dwVersion; //!< [in] Must be set to NVFBC_VERSION. + PNVFBCGETLASTERRORSTR nvFBCGetLastErrorStr; //!< [out] Pointer to ::NvFBCGetLastErrorStr(). + PNVFBCCREATEHANDLE nvFBCCreateHandle; //!< [out] Pointer to ::NvFBCCreateHandle(). + PNVFBCDESTROYHANDLE nvFBCDestroyHandle; //!< [out] Pointer to ::NvFBCDestroyHandle(). + PNVFBCGETSTATUS nvFBCGetStatus; //!< [out] Pointer to ::NvFBCGetStatus(). + PNVFBCCREATECAPTURESESSION nvFBCCreateCaptureSession; //!< [out] Pointer to ::NvFBCCreateCaptureSession(). + PNVFBCDESTROYCAPTURESESSION nvFBCDestroyCaptureSession; //!< [out] Pointer to ::NvFBCDestroyCaptureSession(). + PNVFBCTOSYSSETUP nvFBCToSysSetUp; //!< [out] Pointer to ::NvFBCToSysSetUp(). + PNVFBCTOSYSGRABFRAME nvFBCToSysGrabFrame; //!< [out] Pointer to ::NvFBCToSysGrabFrame(). + PNVFBCTOCUDASETUP nvFBCToCudaSetUp; //!< [out] Pointer to ::NvFBCToCudaSetUp(). + PNVFBCTOCUDAGRABFRAME nvFBCToCudaGrabFrame; //!< [out] Pointer to ::NvFBCToCudaGrabFrame(). + void* pad1; //!< [out] Retired. Do not use. + void* pad2; //!< [out] Retired. Do not use. + void* pad3; //!< [out] Retired. Do not use. + PNVFBCBINDCONTEXT nvFBCBindContext; //!< [out] Pointer to ::NvFBCBindContext(). + PNVFBCRELEASECONTEXT nvFBCReleaseContext; //!< [out] Pointer to ::NvFBCReleaseContext(). + void* pad4; //!< [out] Retired. Do not use. + void* pad5; //!< [out] Retired. Do not use. + void* pad6; //!< [out] Retired. Do not use. + void* pad7; //!< [out] Retired. Do not use. + PNVFBCTOGLSETUP nvFBCToGLSetUp; //!< [out] Pointer to ::nvFBCToGLSetup(). + PNVFBCTOGLGRABFRAME nvFBCToGLGrabFrame; //!< [out] Pointer to ::nvFBCToGLGrabFrame(). +} NVFBC_API_FUNCTION_LIST; + +/*! + * \ingroup FBC_FUNC + * + * \brief Entry Points to the NvFBC interface. + * + * Creates an instance of the NvFBC interface, and populates the + * pFunctionList with function pointers to the API routines implemented by + * the NvFBC interface. + * + * \param [out] pFunctionList + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_API_VERSION + */ +NVFBCSTATUS NVFBCAPI NvFBCCreateInstance(NVFBC_API_FUNCTION_LIST *pFunctionList); +/*! + * \ingroup FBC_FUNC + * + * Defines function pointer for the ::NvFBCCreateInstance() API call. + */ +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATEINSTANCE)(NVFBC_API_FUNCTION_LIST *pFunctionList); + +#ifdef __cplusplus +} +#endif + +#endif // _NVFBC_H_ diff --git a/third-party/nvfbc/helper_math.h b/third-party/nvfbc/helper_math.h new file mode 100644 index 00000000..d17b024e --- /dev/null +++ b/third-party/nvfbc/helper_math.h @@ -0,0 +1,1469 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file implements common mathematical operations on vector types + * (float3, float4 etc.) since these are not provided as standard by CUDA. + * + * The syntax is modeled on the Cg standard library. + * + * This is part of the Helper library includes + * + * Thanks to Linh Hah for additions and fixes. + */ + +#ifndef HELPER_MATH_H +#define HELPER_MATH_H + +#include "cuda_runtime.h" + +typedef unsigned int uint; +typedef unsigned short ushort; + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#ifndef __CUDACC__ +#include + +//////////////////////////////////////////////////////////////////////////////// +// host implementations of CUDA functions +//////////////////////////////////////////////////////////////////////////////// + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a > b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? a : b; +} + +inline float rsqrtf(float x) +{ + return 1.0f / sqrtf(x); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// constructors +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 make_float2(float s) +{ + return make_float2(s, s); +} +inline __host__ __device__ float2 make_float2(float3 a) +{ + return make_float2(a.x, a.y); +} +inline __host__ __device__ float2 make_float2(int2 a) +{ + return make_float2(float(a.x), float(a.y)); +} +inline __host__ __device__ float2 make_float2(uint2 a) +{ + return make_float2(float(a.x), float(a.y)); +} + +inline __host__ __device__ int2 make_int2(int s) +{ + return make_int2(s, s); +} +inline __host__ __device__ int2 make_int2(int3 a) +{ + return make_int2(a.x, a.y); +} +inline __host__ __device__ int2 make_int2(uint2 a) +{ + return make_int2(int(a.x), int(a.y)); +} +inline __host__ __device__ int2 make_int2(float2 a) +{ + return make_int2(int(a.x), int(a.y)); +} + +inline __host__ __device__ uint2 make_uint2(uint s) +{ + return make_uint2(s, s); +} +inline __host__ __device__ uint2 make_uint2(uint3 a) +{ + return make_uint2(a.x, a.y); +} +inline __host__ __device__ uint2 make_uint2(int2 a) +{ + return make_uint2(uint(a.x), uint(a.y)); +} + +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float2 a) +{ + return make_float3(a.x, a.y, 0.0f); +} +inline __host__ __device__ float3 make_float3(float2 a, float s) +{ + return make_float3(a.x, a.y, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); +} +inline __host__ __device__ float3 make_float3(int3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} +inline __host__ __device__ float3 make_float3(uint3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} + +inline __host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(int2 a) +{ + return make_int3(a.x, a.y, 0); +} +inline __host__ __device__ int3 make_int3(int2 a, int s) +{ + return make_int3(a.x, a.y, s); +} +inline __host__ __device__ int3 make_int3(uint3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(uint2 a) +{ + return make_uint3(a.x, a.y, 0); +} +inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) +{ + return make_uint3(a.x, a.y, s); +} +inline __host__ __device__ uint3 make_uint3(uint4 a) +{ + return make_uint3(a.x, a.y, a.z); +} +inline __host__ __device__ uint3 make_uint3(int3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} +inline __host__ __device__ float4 make_float4(uint4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +inline __host__ __device__ int4 make_int4(int s) +{ + return make_int4(s, s, s, s); +} +inline __host__ __device__ int4 make_int4(int3 a) +{ + return make_int4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ int4 make_int4(int3 a, int w) +{ + return make_int4(a.x, a.y, a.z, w); +} +inline __host__ __device__ int4 make_int4(uint4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} +inline __host__ __device__ int4 make_int4(float4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} + + +inline __host__ __device__ uint4 make_uint4(uint s) +{ + return make_uint4(s, s, s, s); +} +inline __host__ __device__ uint4 make_uint4(uint3 a) +{ + return make_uint4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) +{ + return make_uint4(a.x, a.y, a.z, w); +} +inline __host__ __device__ uint4 make_uint4(int4 a) +{ + return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// negate +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +inline __host__ __device__ int4 operator-(int4 &a) +{ + return make_int4(-a.x, -a.y, -a.z, -a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// addition +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ float2 operator+(float2 a, float b) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ float2 operator+(float b, float2 a) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(float2 &a, float b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ int2 operator+(int2 a, int b) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ int2 operator+(int b, int2 a) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(int2 &a, int b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) +{ + return make_uint2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(uint2 &a, uint2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ uint2 operator+(uint2 a, uint b) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ uint2 operator+(uint b, uint2 a) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(uint2 &a, uint b) +{ + a.x += b; + a.y += b; +} + + +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ float3 operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ int3 operator+(int3 a, int b) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(int3 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ uint3 operator+(uint3 a, uint b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(uint3 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int b, int3 a) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ uint3 operator+(uint b, uint3 a) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ float3 operator+(float b, float3 a) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} + +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ float4 operator+(float4 a, float b) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ float4 operator+(float b, float4 a) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(float4 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ int4 operator+(int4 a, int4 b) +{ + return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(int4 &a, int4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ int4 operator+(int4 a, int b) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ int4 operator+(int b, int4 a) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(int4 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(uint4 &a, uint4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ uint4 operator+(uint4 a, uint b) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ uint4 operator+(uint b, uint4 a) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(uint4 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +//////////////////////////////////////////////////////////////////////////////// +// subtract +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ float2 operator-(float2 a, float b) +{ + return make_float2(a.x - b, a.y - b); +} +inline __host__ __device__ float2 operator-(float b, float2 a) +{ + return make_float2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(float2 &a, float b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ int2 operator-(int2 a, int b) +{ + return make_int2(a.x - b, a.y - b); +} +inline __host__ __device__ int2 operator-(int b, int2 a) +{ + return make_int2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(int2 &a, int b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) +{ + return make_uint2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ uint2 operator-(uint2 a, uint b) +{ + return make_uint2(a.x - b, a.y - b); +} +inline __host__ __device__ uint2 operator-(uint b, uint2 a) +{ + return make_uint2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ float3 operator-(float b, float3 a) +{ + return make_float3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(float3 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ int3 operator-(int3 a, int b) +{ + return make_int3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ int3 operator-(int b, int3 a) +{ + return make_int3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(int3 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ uint3 operator-(uint3 a, uint b) +{ + return make_uint3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ uint3 operator-(uint b, uint3 a) +{ + return make_uint3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ float4 operator-(float4 a, float b) +{ + return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ void operator-=(float4 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ int4 operator-(int4 a, int4 b) +{ + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(int4 &a, int4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ int4 operator-(int4 a, int b) +{ + return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ int4 operator-(int b, int4 a) +{ + return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(int4 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) +{ + return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ uint4 operator-(uint4 a, uint b) +{ + return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ uint4 operator-(uint b, uint4 a) +{ + return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(float2 &a, float2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ float2 operator*(float2 a, float b) +{ + return make_float2(a.x * b, a.y * b); +} +inline __host__ __device__ float2 operator*(float b, float2 a) +{ + return make_float2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(float2 &a, float b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(int2 &a, int2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ int2 operator*(int2 a, int b) +{ + return make_int2(a.x * b, a.y * b); +} +inline __host__ __device__ int2 operator*(int b, int2 a) +{ + return make_int2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(int2 &a, int b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) +{ + return make_uint2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ uint2 operator*(uint2 a, uint b) +{ + return make_uint2(a.x * b, a.y * b); +} +inline __host__ __device__ uint2 operator*(uint b, uint2 a) +{ + return make_uint2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ float3 operator*(float3 a, float b) +{ + return make_float3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ float3 operator*(float b, float3 a) +{ + return make_float3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(float3 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(int3 &a, int3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ int3 operator*(int3 a, int b) +{ + return make_int3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ int3 operator*(int b, int3 a) +{ + return make_int3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(int3 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ uint3 operator*(uint3 a, uint b) +{ + return make_uint3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ uint3 operator*(uint b, uint3 a) +{ + return make_uint3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ float4 operator*(float4 a, float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(float4 &a, float4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ float4 operator*(float4 a, float b) +{ + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ float4 operator*(float b, float4 a) +{ + return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(float4 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ int4 operator*(int4 a, int4 b) +{ + return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(int4 &a, int4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ int4 operator*(int4 a, int b) +{ + return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ int4 operator*(int b, int4 a) +{ + return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(int4 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) +{ + return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ uint4 operator*(uint4 a, uint b) +{ + return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ uint4 operator*(uint b, uint4 a) +{ + return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// divide +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / b.y); +} +inline __host__ __device__ void operator/=(float2 &a, float2 b) +{ + a.x /= b.x; + a.y /= b.y; +} +inline __host__ __device__ float2 operator/(float2 a, float b) +{ + return make_float2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(float2 &a, float b) +{ + a.x /= b; + a.y /= b; +} +inline __host__ __device__ float2 operator/(float b, float2 a) +{ + return make_float2(b / a.x, b / a.y); +} + +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ void operator/=(float3 &a, float3 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; +} +inline __host__ __device__ float3 operator/(float3 a, float b) +{ + return make_float3(a.x / b, a.y / b, a.z / b); +} +inline __host__ __device__ void operator/=(float3 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; +} +inline __host__ __device__ float3 operator/(float b, float3 a) +{ + return make_float3(b / a.x, b / a.y, b / a.z); +} + +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ void operator/=(float4 &a, float4 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; + a.w /= b.w; +} +inline __host__ __device__ float4 operator/(float4 a, float b) +{ + return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); +} +inline __host__ __device__ void operator/=(float4 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; + a.w /= b; +} +inline __host__ __device__ float4 operator/(float b, float4 a) +{ + return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// min +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fminf(float2 a, float2 b) +{ + return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); +} +inline __host__ __device__ float3 fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} +inline __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +inline __host__ __device__ int2 min(int2 a, int2 b) +{ + return make_int2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ int4 min(int4 a, int4 b) +{ + return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +inline __host__ __device__ uint2 min(uint2 a, uint2 b) +{ + return make_uint2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ uint4 min(uint4 a, uint4 b) +{ + return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// max +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmaxf(float2 a, float2 b) +{ + return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); +} +inline __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} +inline __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +inline __host__ __device__ int2 max(int2 a, int2 b) +{ + return make_int2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ int4 max(int4 a, int4 b) +{ + return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +inline __host__ __device__ uint2 max(uint2 a, uint2 b) +{ + return make_uint2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ uint4 max(uint4 a, uint4 b) +{ + return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// lerp +// - linear interpolation between a and b, based on value t in [0, 1] range +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +//////////////////////////////////////////////////////////////////////////////// +// clamp +// - clamp the value v to be in the range [a, b] +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ float2 clamp(float2 v, float a, float b) +{ + return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) +{ + return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ int2 clamp(int2 v, int a, int b) +{ + return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) +{ + return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ int4 clamp(int4 v, int a, int b) +{ + return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) +{ + return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) +{ + return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) +{ + return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) +{ + return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) +{ + return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// dot product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ int dot(int2 a, int2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ int dot(int3 a, int3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ int dot(int4 a, int4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ uint dot(uint2 a, uint2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ uint dot(uint3 a, uint3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ uint dot(uint4 a, uint4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +//////////////////////////////////////////////////////////////////////////////// +// length +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float length(float2 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float4 v) +{ + return sqrtf(dot(v, v)); +} + +//////////////////////////////////////////////////////////////////////////////// +// normalize +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 normalize(float2 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +//////////////////////////////////////////////////////////////////////////////// +// floor +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 floorf(float2 v) +{ + return make_float2(floorf(v.x), floorf(v.y)); +} +inline __host__ __device__ float3 floorf(float3 v) +{ + return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); +} +inline __host__ __device__ float4 floorf(float4 v) +{ + return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// frac - returns the fractional portion of a scalar or each vector component +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float fracf(float v) +{ + return v - floorf(v); +} +inline __host__ __device__ float2 fracf(float2 v) +{ + return make_float2(fracf(v.x), fracf(v.y)); +} +inline __host__ __device__ float3 fracf(float3 v) +{ + return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); +} +inline __host__ __device__ float4 fracf(float4 v) +{ + return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// fmod +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmodf(float2 a, float2 b) +{ + return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); +} +inline __host__ __device__ float3 fmodf(float3 a, float3 b) +{ + return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); +} +inline __host__ __device__ float4 fmodf(float4 a, float4 b) +{ + return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// absolute value +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fabs(float2 v) +{ + return make_float2(fabs(v.x), fabs(v.y)); +} +inline __host__ __device__ float3 fabs(float3 v) +{ + return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); +} +inline __host__ __device__ float4 fabs(float4 v) +{ + return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); +} + +inline __host__ __device__ int2 abs(int2 v) +{ + return make_int2(abs(v.x), abs(v.y)); +} +inline __host__ __device__ int3 abs(int3 v) +{ + return make_int3(abs(v.x), abs(v.y), abs(v.z)); +} +inline __host__ __device__ int4 abs(int4 v) +{ + return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// reflect +// - returns reflection of incident ray I around surface normal N +// - N should be normalized, reflected vector's length is equal to length of I +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +//////////////////////////////////////////////////////////////////////////////// +// cross product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + +//////////////////////////////////////////////////////////////////////////////// +// smoothstep +// - returns 0 if x < a +// - returns 1 if x > b +// - otherwise returns smooth interpolation between 0 and 1 based on x +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float smoothstep(float a, float b, float x) +{ + float y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(3.0f - (2.0f*y))); +} +inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) +{ + float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); +} +inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) +{ + float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); +} +inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) +{ + float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); +} + +#endif From 4d1689d6e94584f616f9f960c51c1ea97ef825ec Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 19 Sep 2021 21:27:31 +0200 Subject: [PATCH 06/27] Each cuda::sws_t has it's own color matrix --- sunshine/platform/linux/cuda.cu | 64 ++++++++++++++++++++------------- sunshine/platform/linux/cuda.h | 13 ++++++- 2 files changed, 51 insertions(+), 26 deletions(-) diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index a2ca6508..8b7d76b7 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -65,7 +65,9 @@ struct __attribute__((__aligned__(16))) color_extern_t { __float2 range_uv; }; -extern color_extern_t colors[4]; +static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch"); + +extern color_t colors[4]; } // namespace video //////////////////// End special declarations @@ -91,36 +93,47 @@ inline static int check(cudaError_t result, const std::string_view &sv) { return 0; } -__device__ __constant__ video::color_t color; +template +ptr_t make_ptr() { + void *p; + CU_CHECK_PTR(cudaMalloc(&p, sizeof(T)), "Couldn't allocate color matrix"); + ptr_t ptr { p }; + + return ptr; +} + +void freeCudaPtr_t::operator()(void *ptr) { + CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer"); +} inline __device__ float3 bgra_to_rgb(uchar4 vec) { return make_float3((float)vec.z, (float)vec.y, (float)vec.x); } -inline __device__ float2 calcUV(float3 pixel) { - float4 vec_u = color.color_vec_u; - float4 vec_v = color.color_vec_v; +inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_matrix) { + float4 vec_u = color_matrix->color_vec_u; + float4 vec_v = color_matrix->color_vec_v; float u = dot(pixel, make_float3(vec_u)) + vec_u.w; float v = dot(pixel, make_float3(vec_v)) + vec_v.w; - u = u * color.range_uv.x + color.range_uv.y; - v = (v * color.range_uv.x + color.range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f; + u = u * color_matrix->range_uv.x + color_matrix->range_uv.y; + v = (v * color_matrix->range_uv.x + color_matrix->range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f; return make_float2(u, v); } -inline __device__ float calcY(float3 pixel) { - float4 vec_y = color.color_vec_y; +inline __device__ float calcY(float3 pixel, const video::color_t *const color_matrix) { + float4 vec_y = color_matrix->color_vec_y; - return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color.range_y.x + color.range_y.y; + return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color_matrix->range_y.x + color_matrix->range_y.y; } __global__ void RGBA_to_NV12( cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV, std::uint32_t dstPitchY, std::uint32_t dstPitchUV, - std::uint32_t width, std::uint32_t height) { + std::uint32_t width, std::uint32_t height, const video::color_t *const color_matrix) { int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2; int idY = (threadIdx.y + blockDim.y * blockIdx.y); @@ -137,16 +150,16 @@ __global__ void RGBA_to_NV12( float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / width, y + 1.0f / height)); - float2 uv = calcUV((rgb_l + rgb_r) * 0.5f); + float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix); dstUV[0] = uv.x; dstUV[1] = uv.y; - dstY[0] = calcY(rgb_l); - dstY[1] = calcY(rgb_r); + dstY[0] = calcY(rgb_l, color_matrix); + dstY[1] = calcY(rgb_r, color_matrix); } -sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock) - : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock } { +sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock, ptr_t &&color_matrix) + : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } { auto format = cudaCreateChannelDesc(); CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array"); @@ -186,7 +199,12 @@ std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, i CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device"); CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties"); - auto sws = std::make_unique(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2); + auto ptr = make_ptr(); + if(!ptr) { + return nullptr; + } + + auto sws = std::make_unique(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr)); if(sws->texture == INVALID_TEXTURE) { return nullptr; @@ -202,15 +220,13 @@ int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std: dim3 block(threadsPerBlock, threadsPerBlock); dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock)); - RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, width, height); + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, width, height, (video::color_t*)color_matrix.get()); return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); } void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) { - color_range = 1; - colorspace = 5; - video::color_extern_t *color_p; + video::color_t *color_p; switch(colorspace) { case 5: // SWS_CS_SMPTE170M color_p = &video::colors[0]; @@ -228,7 +244,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) ++color_p; } - auto color_matrix = *(video::color_t*)color_p; + auto color_matrix = *color_p; color_matrix.color_vec_y.w *= 256.0f; color_matrix.color_vec_u.w *= 256.0f; color_matrix.color_vec_v.w *= 256.0f; @@ -236,9 +252,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) color_matrix.range_y.y *= 256.0f; color_matrix.range_uv.y *= 256.0f; - static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch"); - - CU_CHECK_IGNORE(cudaMemcpyToSymbol(color, &color_matrix, sizeof(video::color_t)), "Couldn't copy color matrix to cuda"); + CU_CHECK_IGNORE(cudaMemcpy(this->color_matrix.get(), &color_matrix, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda"); } int sws_t::load_ram(platf::img_t &img) { diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index 41087506..0260dad4 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -1,6 +1,8 @@ #ifndef SUNSHINE_PLATFORM_CUDA_H #define SUNSHINE_PLATFORM_CUDA_H +#include + #ifndef __NVCC__ #include "sunshine/platform/common.h" @@ -26,10 +28,18 @@ typedef __location__(device_builtin) unsigned long long cudaTextureObject_t; #endif /* !defined(__CUDACC__) */ namespace cuda { + +class freeCudaPtr_t { +public: + void operator()(void *ptr); +}; + +using ptr_t = std::unique_ptr; + class sws_t { public: ~sws_t(); - sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock); + sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock, ptr_t &&color_matrix); /** * in_width, out_width -- The width and height of the captured image in bytes @@ -46,6 +56,7 @@ public: int load_ram(platf::img_t &img); + ptr_t color_matrix; cudaArray_t array; cudaTextureObject_t texture; From a963b31c1dfbf329a01889d2ea94b5d985ba13ba Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 19 Sep 2021 23:00:42 +0200 Subject: [PATCH 07/27] Ensure the background color is black --- sunshine/platform/linux/cuda.cpp | 24 ++++++++++++++- sunshine/platform/linux/cuda.cu | 53 +++++++++++++++++++++++--------- sunshine/platform/linux/cuda.h | 18 +++++++---- 3 files changed, 73 insertions(+), 22 deletions(-) diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index 811293d6..e57f914d 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -325,7 +325,7 @@ public: cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx; ctx_t ctx { cuda_ctx }; - sws = sws_t::make(width * 4, height, frame->width, frame->height); + sws = sws_t::make(width, height, frame->width, frame->height, width * 4); if(!sws) { return -1; @@ -343,6 +343,28 @@ public: void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override { ctx_t ctx { cuda_ctx }; sws->set_colorspace(colorspace, color_range); + + + // The default green color is ugly. + // Update the background color + platf::img_t img; + img.width = frame->width; + img.height = frame->height; + img.pixel_pitch = 4; + img.row_pitch = img.width * img.pixel_pitch; + + std::vector image_data; + image_data.resize(img.row_pitch * img.height); + + img.data = image_data.data(); + + if(sws->load_ram(img)) { + return; + } + + sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], { + frame->width, frame->height, 0, 0 + }); } frame_t hwframe; diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index 8b7d76b7..1dac5dd5 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -133,22 +133,25 @@ inline __device__ float calcY(float3 pixel, const video::color_t *const color_ma __global__ void RGBA_to_NV12( cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV, std::uint32_t dstPitchY, std::uint32_t dstPitchUV, - std::uint32_t width, std::uint32_t height, const video::color_t *const color_matrix) { + const viewport_t viewport, const video::color_t *const color_matrix) { int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2; int idY = (threadIdx.y + blockDim.y * blockIdx.y); - if(idX >= width) return; - if(idY >= height) return; + if(idX >= viewport.width) return; + if(idY >= viewport.height) return; + + float x = (float)idX / (float)viewport.width / 4; + float y = (float)idY / (float)viewport.height; + + idX += viewport.offsetX; + idY += viewport.offsetY; dstY = dstY + idX + idY * dstPitchY; dstUV = dstUV + idX + (idY / 2 * dstPitchUV); - float x = (float)idX / (float)width / 4; - float y = (float)idY / (float)height; - float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); - float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / width, y + 1.0f / height)); + float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height)); float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix); @@ -158,11 +161,11 @@ __global__ void RGBA_to_NV12( dstY[1] = calcY(rgb_r, color_matrix); } -sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock, ptr_t &&color_matrix) - : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } { +sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix) + : array {}, texture { INVALID_TEXTURE }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } { auto format = cudaCreateChannelDesc(); - CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array"); + CU_CHECK_VOID(cudaMallocArray(&array, &format, pitch, in_height, cudaArrayDefault), "Couldn't allocate cuda array"); cudaResourceDesc res {}; res.resType = cudaResourceTypeArray; @@ -177,6 +180,22 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int thr std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp); CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture"); + + + // Ensure aspect ratio is maintained + auto scalar = std::fminf(out_width / (float)in_width, out_height / (float)in_height); + auto out_width_f = in_width * scalar; + auto out_height_f = in_height * scalar; + + // result is always positive + auto offsetX_f = (out_width - out_width_f) / 2; + auto offsetY_f = (out_height - out_height_f) / 2; + + viewport.width = out_width_f; + viewport.height = out_height_f; + + viewport.offsetX = offsetX_f; + viewport.offsetY = offsetY_f; } sws_t::~sws_t() { @@ -193,7 +212,7 @@ sws_t::~sws_t() { } } -std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, int out_height) { +std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) { cudaDeviceProp props; int device; CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device"); @@ -204,7 +223,7 @@ std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, i return nullptr; } - auto sws = std::make_unique(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr)); + auto sws = std::make_unique(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr)); if(sws->texture == INVALID_TEXTURE) { return nullptr; @@ -214,13 +233,17 @@ std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, i } int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) { - int threadsX = width / 2; - int threadsY = height; + return convert(Y, UV, pitchY, pitchUV, viewport); +} + +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport) { + int threadsX = viewport.width / 2; + int threadsY = viewport.height; dim3 block(threadsPerBlock, threadsPerBlock); dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock)); - RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, width, height, (video::color_t*)color_matrix.get()); + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t*)color_matrix.get()); return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); } diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index 0260dad4..27dede07 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -36,21 +36,27 @@ public: using ptr_t = std::unique_ptr; +struct viewport_t { + int width, height; + int offsetX, offsetY; +}; + class sws_t { public: ~sws_t(); - sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock, ptr_t &&color_matrix); + sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix); /** - * in_width, out_width -- The width and height of the captured image in bytes + * in_width, in_height -- The width and height of the captured image in pixels * out_width, out_height -- the width and height of the NV12 image in pixels * - * cuda_device -- pointer to the cuda device + * pitch -- The size of a single row of pixels in bytes */ - static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height); + static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height, int pitch); // Converts loaded image into a CUDevicePtr int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport); void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range); @@ -60,9 +66,9 @@ public: cudaArray_t array; cudaTextureObject_t texture; - int width, height; - int threadsPerBlock; + + viewport_t viewport; }; } // namespace cuda From e3cc25f23febc0487fc6daed6e53804460d15864 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 20 Sep 2021 00:03:33 +0200 Subject: [PATCH 08/27] Use linear interpolation with the cuda kernel --- sunshine/platform/linux/cuda.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index 1dac5dd5..63e40341 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -111,6 +111,10 @@ inline __device__ float3 bgra_to_rgb(uchar4 vec) { return make_float3((float)vec.z, (float)vec.y, (float)vec.x); } +inline __device__ float3 bgra_to_rgb(float4 vec) { + return make_float3(vec.z, vec.y, vec.x) * 255.0f;; +} + inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_matrix) { float4 vec_u = color_matrix->color_vec_u; float4 vec_v = color_matrix->color_vec_v; @@ -150,8 +154,8 @@ __global__ void RGBA_to_NV12( dstY = dstY + idX + idY * dstPitchY; dstUV = dstUV + idX + (idY / 2 * dstPitchUV); - float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); - float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height)); + float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); + float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height)); float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix); @@ -173,8 +177,8 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit cudaTextureDesc desc {}; - desc.readMode = cudaReadModeElementType; - desc.filterMode = cudaFilterModePoint; + desc.readMode = cudaReadModeNormalizedFloat; + desc.filterMode = cudaFilterModeLinear; desc.normalizedCoords = true; std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp); From 196f1f74715d0cf3cf91a5a6d110df4a7e9b7d0b Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 20 Sep 2021 00:21:54 +0200 Subject: [PATCH 09/27] Make changes in brightness of the color more visible --- sunshine/platform/linux/cuda.cu | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index 63e40341..e310964c 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -112,7 +112,7 @@ inline __device__ float3 bgra_to_rgb(uchar4 vec) { } inline __device__ float3 bgra_to_rgb(float4 vec) { - return make_float3(vec.z, vec.y, vec.x) * 255.0f;; + return make_float3(vec.z, vec.y, vec.x); } inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_matrix) { @@ -123,7 +123,7 @@ inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_ float v = dot(pixel, make_float3(vec_v)) + vec_v.w; u = u * color_matrix->range_uv.x + color_matrix->range_uv.y; - v = (v * color_matrix->range_uv.x + color_matrix->range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f; + v = (v * color_matrix->range_uv.x + color_matrix->range_uv.y) * 224.0f / 256.0f + 0.0625f; return make_float2(u, v); } @@ -157,12 +157,12 @@ __global__ void RGBA_to_NV12( float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height)); - float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix); + float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix) * 255.0f; dstUV[0] = uv.x; dstUV[1] = uv.y; - dstY[0] = calcY(rgb_l, color_matrix); - dstY[1] = calcY(rgb_r, color_matrix); + dstY[0] = calcY(rgb_l, color_matrix) * 245.0f; // 245.0f is a magic number to ensure slight changes in luminosity are more visisble + dstY[1] = calcY(rgb_r, color_matrix) * 245.0f; // 245.0f is a magic number to ensure slight changes in luminosity are more visisble } sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix) @@ -271,15 +271,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) ++color_p; } - auto color_matrix = *color_p; - color_matrix.color_vec_y.w *= 256.0f; - color_matrix.color_vec_u.w *= 256.0f; - color_matrix.color_vec_v.w *= 256.0f; - - color_matrix.range_y.y *= 256.0f; - color_matrix.range_uv.y *= 256.0f; - - CU_CHECK_IGNORE(cudaMemcpy(this->color_matrix.get(), &color_matrix, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda"); + CU_CHECK_IGNORE(cudaMemcpy(color_matrix.get(), color_p, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda"); } int sws_t::load_ram(platf::img_t &img) { From bb912786bdbc65b08aeb5ae508f2d481ce4b2bf6 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Wed, 22 Sep 2021 11:36:59 +0200 Subject: [PATCH 10/27] Added NvFBC on Linux --- sunshine/platform/common.h | 7 + sunshine/platform/linux/cuda.cpp | 703 +++++++++++++++++----------- sunshine/platform/linux/cuda.cu | 92 ++-- sunshine/platform/linux/cuda.h | 36 +- sunshine/platform/linux/misc.cpp | 29 +- sunshine/platform/linux/x11grab.cpp | 4 +- sunshine/video.cpp | 2 +- 7 files changed, 546 insertions(+), 327 deletions(-) diff --git a/sunshine/platform/common.h b/sunshine/platform/common.h index 7482f0ea..4b702a9b 100644 --- a/sunshine/platform/common.h +++ b/sunshine/platform/common.h @@ -141,6 +141,13 @@ public: struct img_t { public: + img_t() = default; + + img_t(img_t &&) = delete; + img_t(const img_t &) = delete; + img_t &operator=(img_t &&) = delete; + img_t &operator=(const img_t &) = delete; + std::uint8_t *data {}; std::int32_t width {}; std::int32_t height {}; diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index e57f914d..25d88518 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -1,3 +1,5 @@ +#include + #include #include @@ -55,96 +57,9 @@ inline static int check(CUresult result, const std::string_view &sv) { return 0; } -class ctx_t { +class img_t : public platf::img_t { public: - ctx_t(CUcontext ctx) { - CU_CHECK_IGNORE(cdf->cuCtxPushCurrent(ctx), "Couldn't push cuda context"); - } - - ~ctx_t() { - CUcontext dummy; - - CU_CHECK_IGNORE(cdf->cuCtxPopCurrent(&dummy), "Couldn't pop cuda context"); - } -}; - -void free_res(CUgraphicsResource res) { - cdf->cuGraphicsUnregisterResource(res); -} - -using res_internal_t = util::safe_ptr; - -template -class res_t { -public: - res_t() : resources {}, mapped { false } {} - res_t(res_t &&other) noexcept : resources { other.resources }, array_p { other.array_p }, ctx { other.ctx }, stream { other.stream } { - other.resources = std::array {}; - } - - res_t &operator=(res_t &&other) { - for(auto x = 0; x < N; ++x) { - std::swap(resources[x], other.resources[x]); - std::swap(array_p[x], other.array_p[x]); - } - - std::swap(ctx, other.ctx); - std::swap(stream, other.stream); - std::swap(mapped, other.mapped); - - return *this; - } - - res_t(CUcontext ctx, CUstream stream) : resources {}, ctx { ctx }, stream { stream }, mapped { false } {} - - int bind(gl::tex_t &tex) { - ctx_t ctx { this->ctx }; - - CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[0], tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y image"); - CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[1], tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register uv image"); - - return 0; - } - - int map() { - ctx_t ctx { this->ctx }; - - CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream), "Coudn't map cuda resources"); - - mapped = true; - - CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[0], resources[0], 0, 0), "Couldn't get mapped subresource [0]"); - CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[1], resources[1], 0, 0), "Couldn't get mapped subresource [1]"); - - return 0; - } - - void unmap() { - // Either all or none are mapped - if(mapped) { - ctx_t ctx { this->ctx }; - - CU_CHECK_IGNORE(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream), "Couldn't unmap cuda resources"); - - mapped = false; - } - } - - inline CUarray &operator[](std::size_t index) { - return array_p[index]; - } - - ~res_t() { - unmap(); - } - - std::array resources; - std::array array_p; - - CUcontext ctx; - CUstream stream; - - bool mapped; + tex_t tex; }; int init() { @@ -160,139 +75,8 @@ int init() { return 0; } -class opengl_t : public platf::hwdevice_t { -public: - int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) { - if(!cdf) { - BOOST_LOG(warning) << "cuda not initialized"sv; - return -1; - } - - this->data = (void *)0x1; - - display = egl::make_display(xdisplay); - if(!display) { - return -1; - } - - auto ctx_opt = egl::make_ctx(display.get()); - if(!ctx_opt) { - return -1; - } - - ctx = std::move(*ctx_opt); - - width = in_width; - height = in_height; - - return 0; - } - - int set_frame(AVFrame *frame) override { - auto cuda_ctx = (AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx; - - tex = gl::tex_t::make(2); - fb = gl::frame_buf_t::make(2); - - gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]); - gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RED, frame->width, frame->height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); - gl::ctx.BindTexture(GL_TEXTURE_2D, tex[1]); - gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RG, frame->width / 2, frame->height / 2, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr); - gl::ctx.BindTexture(GL_TEXTURE_2D, 0); - - fb.bind(std::begin(tex), std::end(tex)); - - res = res_t<2> { cuda_ctx->cuda_ctx, cuda_ctx->stream }; - - if(res.bind(tex)) { - return -1; - } - - this->hwframe.reset(frame); - this->frame = frame; - - if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) { - BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv; - - return -1; - } - - auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height); - if(!sws_opt) { - return -1; - } - - sws = std::move(*sws_opt); - return sws.blank(fb, 0, 0, frame->width, frame->height); - } - - int convert(platf::img_t &img) override { - sws.load_ram(img); - - if(sws.convert(fb)) { - return -1; - } - - if(res.map()) { - return -1; - } - - // Push and pop cuda context - ctx_t ctx { res.ctx }; - for(auto x = 0; x < 2; ++x) { - CUDA_MEMCPY2D desc {}; - - auto shift = x; - - desc.srcPitch = frame->width; - desc.dstPitch = frame->linesize[x]; - desc.Height = frame->height >> shift; - desc.WidthInBytes = std::min(desc.srcPitch, desc.dstPitch); - - desc.srcMemoryType = CU_MEMORYTYPE_ARRAY; - desc.dstMemoryType = CU_MEMORYTYPE_DEVICE; - - desc.srcArray = res[x]; - desc.dstDevice = (CUdeviceptr)frame->data[x]; - - CU_CHECK(cdf->cuMemcpy2DAsync(&desc, res.stream), "Couldn't copy from OpenGL to cuda"); - } - - res.unmap(); - - return 0; - } - - void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override { - sws.set_colorspace(colorspace, color_range); - } - - frame_t hwframe; - - egl::display_t display; - egl::ctx_t ctx; - - egl::sws_t sws; - - gl::tex_t tex; - gl::frame_buf_t fb; - - res_t<2> res; - - int width, height; -}; - class cuda_t : public platf::hwdevice_t { public: - ~cuda_t() override { - // sws_t needs to be destroyed while the context is active - if(sws) { - ctx_t ctx { cuda_ctx }; - - sws.reset(); - } - } - int init(int in_width, int in_height) { if(!cdf) { BOOST_LOG(warning) << "cuda not initialized"sv; @@ -322,78 +106,111 @@ public: return -1; } - cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx; - - ctx_t ctx { cuda_ctx }; - sws = sws_t::make(width, height, frame->width, frame->height, width * 4); - - if(!sws) { + auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4); + if(!sws_opt) { return -1; } + sws = std::move(*sws_opt); + return 0; } - int convert(platf::img_t &img) override { - ctx_t ctx { cuda_ctx }; - - return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]); - } - void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override { - ctx_t ctx { cuda_ctx }; - sws->set_colorspace(colorspace, color_range); + sws.set_colorspace(colorspace, color_range); + auto tex = tex_t::make(height, width * 4); + if(!tex) { + return; + } // The default green color is ugly. // Update the background color platf::img_t img; - img.width = frame->width; - img.height = frame->height; + img.width = width; + img.height = height; img.pixel_pitch = 4; - img.row_pitch = img.width * img.pixel_pitch; + img.row_pitch = img.width * img.pixel_pitch; std::vector image_data; image_data.resize(img.row_pitch * img.height); img.data = image_data.data(); - if(sws->load_ram(img)) { + if(sws.load_ram(img, tex->array)) { return; } - sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], { - frame->width, frame->height, 0, 0 - }); + sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture, { frame->width, frame->height, 0, 0 }); } frame_t hwframe; - std::unique_ptr sws; - int width, height; - CUcontext cuda_ctx; + sws_t sws; }; -std::shared_ptr make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) { +class cuda_ram_t : public cuda_t { +public: + int convert(platf::img_t &img) override { + return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex.texture); + } + + int set_frame(AVFrame *frame) { + if(cuda_t::set_frame(frame)) { + return -1; + } + + auto tex_opt = tex_t::make(height, width * 4); + if(!tex_opt) { + return -1; + } + + tex = std::move(*tex_opt); + + return 0; + } + + tex_t tex; +}; + +class cuda_vram_t : public cuda_t { +public: + int convert(platf::img_t &img) override { + return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], (cudaTextureObject_t)img.data); + } +}; + +std::shared_ptr make_hwdevice(int width, int height, bool vram) { if(init()) { return nullptr; } - auto cuda = std::make_shared(); + std::shared_ptr cuda; + + if(vram) { + cuda = std::make_shared(); + } + else { + cuda = std::make_shared(); + } + if(cuda->init(width, height)) { return nullptr; } return cuda; } -} // namespace cuda -namespace platf::nvfbc { +namespace nvfbc { static PNVFBCCREATEINSTANCE createInstance {}; static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION }; +static constexpr inline NVFBC_BOOL nv_bool(bool b) { + return b ? NVFBC_TRUE : NVFBC_FALSE; +} + static void *handle { nullptr }; int init() { static bool funcs_loaded = false; @@ -418,49 +235,65 @@ int init() { return -1; } + auto status = cuda::nvfbc::createInstance(&cuda::nvfbc::func); + if(status) { + BOOST_LOG(error) << "Unable to create NvFBC instance"sv; + + dlclose(handle); + handle = nullptr; + return -1; + } + funcs_loaded = true; return 0; } class handle_t { - KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits::max(), { - if(el == std::numeric_limits::max()) { - return; - } - NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER }; - - auto status = func.nvFBCDestroyHandle(el, ¶ms); - if(status) { - BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el); - } - }); + enum flag_e { + SESSION_HANDLE, + SESSION_CAPTURE, + MAX_FLAGS, + }; public: + handle_t() = default; + handle_t(handle_t &&other) : handle_flags { other.handle_flags }, handle { other.handle } { + other.handle_flags.reset(); + } + + handle_t &operator=(handle_t &&other) { + std::swap(handle_flags, other.handle_flags); + std::swap(handle, other.handle); + + return *this; + } + static std::optional make() { NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER }; - session_t session; - auto status = func.nvFBCCreateHandle(&session.el, ¶ms); + handle_t handle; + auto status = func.nvFBCCreateHandle(&handle.handle, ¶ms); if(status) { - BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el); - session.release(); + BOOST_LOG(error) << "Failed to create session: "sv << handle.last_error(); return std::nullopt; } - return handle_t { std::move(session) }; + handle.handle_flags[SESSION_HANDLE] = true; + + return std::move(handle); } const char *last_error() { - return func.nvFBCGetLastErrorStr(session.el); + return func.nvFBCGetLastErrorStr(handle); } std::optional status() { NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER }; - auto status = func.nvFBCGetStatus(session.el, ¶ms); + auto status = func.nvFBCGetStatus(handle, ¶ms); if(status) { - BOOST_LOG(error) << "Failed to create session: "sv << last_error(); + BOOST_LOG(error) << "Failed to get NvFBC status: "sv << last_error(); return std::nullopt; } @@ -468,23 +301,328 @@ public: return params; } - session_t session; + int capture(NVFBC_CREATE_CAPTURE_SESSION_PARAMS &capture_params) { + if(func.nvFBCCreateCaptureSession(handle, &capture_params)) { + BOOST_LOG(error) << "Failed to start capture session: "sv << last_error(); + return -1; + } + + handle_flags[SESSION_CAPTURE] = true; + + NVFBC_TOCUDA_SETUP_PARAMS setup_params { + NVFBC_TOCUDA_SETUP_PARAMS_VER, + NVFBC_BUFFER_FORMAT_BGRA, + }; + + if(func.nvFBCToCudaSetUp(handle, &setup_params)) { + BOOST_LOG(error) << "Failed to setup cuda interop with nvFBC: "sv << last_error(); + return -1; + } + return 0; + } + + int stop() { + if(!handle_flags[SESSION_CAPTURE]) { + return 0; + } + + NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER }; + + if(func.nvFBCDestroyCaptureSession(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't destroy capture session: "sv << last_error(); + + return -1; + } + + handle_flags[SESSION_CAPTURE] = false; + + return 0; + } + + ~handle_t() { + if(!handle_flags[SESSION_HANDLE]) { + return; + } + + if(handle_flags[SESSION_CAPTURE]) { + NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER }; + + if(func.nvFBCDestroyCaptureSession(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle); + } + } + + NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER }; + + if(func.nvFBCDestroyHandle(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle); + } + } + + std::bitset handle_flags; + + NVFBC_SESSION_HANDLE handle; }; +class display_t : public platf::display_t { +public: + int init(const std::string_view &display_name, int framerate) { + auto handle = handle_t::make(); + if(!handle) { + return -1; + } + + auto status_params = handle->status(); + if(!status_params) { + return -1; + } + + int streamedMonitor = -1; + if(!display_name.empty()) { + if(status_params->bXRandRAvailable) { + auto monitor_nr = util::from_view(display_name); + + if(monitor_nr < 0 || monitor_nr >= status_params->dwOutputNum) { + BOOST_LOG(warning) << "Can't stream monitor ["sv << monitor_nr << "], it needs to be between [0] and ["sv << status_params->dwOutputNum - 1 << "], defaulting to virtual desktop"sv; + } + else { + streamedMonitor = monitor_nr; + } + } + else { + BOOST_LOG(warning) << "XrandR not available, streaming entire virtual desktop"sv; + } + } + + capture_params = NVFBC_CREATE_CAPTURE_SESSION_PARAMS { NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER }; + + capture_params.eCaptureType = NVFBC_CAPTURE_SHARED_CUDA; + capture_params.bDisableAutoModesetRecovery = nv_bool(true); + + capture_params.dwSamplingRateMs = 1000 /* ms */ / framerate; + + if(streamedMonitor != -1) { + auto &output = status_params->outputs[streamedMonitor]; + + width = output.trackedBox.w; + height = output.trackedBox.h; + offset_x = output.trackedBox.x; + offset_y = output.trackedBox.y; + + capture_params.eTrackingType = NVFBC_TRACKING_OUTPUT; + capture_params.dwOutputId = output.dwId; + } + else { + capture_params.eTrackingType = NVFBC_TRACKING_SCREEN; + + width = status_params->screenSize.w; + height = status_params->screenSize.h; + } + + env_width = status_params->screenSize.w; + env_height = status_params->screenSize.h; + + this->handle = std::move(*handle); + return 0; + } + + platf::capture_e capture(snapshot_cb_t &&snapshot_cb, std::shared_ptr img, bool *cursor) override { + // Force display_t::capture to initialize handle_t::capture + cursor_visible = !*cursor; + + auto fg = util::fail_guard([&]() { + handle.stop(); + }); + + while(img) { + auto status = snapshot(img.get(), 500ms, *cursor); + switch(status) { + case platf::capture_e::reinit: + case platf::capture_e::error: + return status; + case platf::capture_e::timeout: + std::this_thread::sleep_for(1ms); + continue; + case platf::capture_e::ok: + img = snapshot_cb(img); + break; + default: + BOOST_LOG(error) << "Unrecognized capture status ["sv << (int)status << ']'; + return status; + } + } + + return platf::capture_e::ok; + } + + // Reinitialize the capture session. + platf::capture_e reinit(bool cursor) { + if(handle.stop()) { + return platf::capture_e::error; + } + + cursor_visible = cursor; + if(false && cursor) { + capture_params.bPushModel = nv_bool(false); + capture_params.bWithCursor = nv_bool(true); + capture_params.bAllowDirectCapture = nv_bool(false); + } + else { + capture_params.bPushModel = nv_bool(true); + capture_params.bWithCursor = nv_bool(false); + capture_params.bAllowDirectCapture = nv_bool(true); + } + + if(handle.capture(capture_params)) { + return platf::capture_e::error; + } + + // If trying to capture directly, test if it actually does. + if(capture_params.bAllowDirectCapture) { + CUdeviceptr device_ptr; + NVFBC_FRAME_GRAB_INFO info; + + NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab { + NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER, + NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY, + &device_ptr, + &info, + 0, + }; + + // Direct Capture may fail the first few times, even if it's possible + for(int x = 0; x < 3; ++x) { + if(auto status = func.nvFBCToCudaGrabFrame(handle.handle, &grab)) { + if(status == NVFBC_ERR_MUST_RECREATE) { + return platf::capture_e::reinit; + } + + BOOST_LOG(error) << "Couldn't capture nvFramebuffer: "sv << handle.last_error(); + + return platf::capture_e::error; + } + + if(info.bDirectCapture) { + break; + } + + BOOST_LOG(debug) << "Direct capture failed attempt ["sv << x << ']'; + } + + if(!info.bDirectCapture) { + BOOST_LOG(debug) << "Direct capture failed, trying the extra copy method"sv; + // Direct capture failed + capture_params.bPushModel = nv_bool(false); + capture_params.bWithCursor = nv_bool(false); + capture_params.bAllowDirectCapture = nv_bool(false); + + if(handle.stop() || handle.capture(capture_params)) { + return platf::capture_e::error; + } + } + } + + return platf::capture_e::ok; + } + + platf::capture_e snapshot(platf::img_t *img, std::chrono::milliseconds timeout, bool cursor) { + if(cursor != cursor_visible) { + auto status = reinit(cursor); + if(status != platf::capture_e::ok) { + return status; + } + } + + CUdeviceptr device_ptr; + NVFBC_FRAME_GRAB_INFO info; + + NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab { + NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER, + NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY, + &device_ptr, + &info, + (std::uint32_t)timeout.count(), + }; + + if(auto status = func.nvFBCToCudaGrabFrame(handle.handle, &grab)) { + if(status == NVFBC_ERR_MUST_RECREATE) { + return platf::capture_e::reinit; + } + + BOOST_LOG(error) << "Couldn't capture nvFramebuffer: "sv << handle.last_error(); + return platf::capture_e::error; + } + + if(!info.bIsNewFrame) { + return platf::capture_e::timeout; + } + + if(((img_t *)img)->tex.copy((std::uint8_t *)device_ptr, img->height, img->row_pitch)) { + return platf::capture_e::error; + } + + return platf::capture_e::ok; + } + + std::shared_ptr make_hwdevice(platf::pix_fmt_e pix_fmt) override { + return ::cuda::make_hwdevice(width, height, true); + } + + std::shared_ptr alloc_img() override { + auto img = std::make_shared(); + + img->width = width; + img->height = height; + img->pixel_pitch = 4; + img->row_pitch = img->width * img->pixel_pitch; + + auto tex_opt = tex_t::make(height, width * img->pixel_pitch); + if(!tex_opt) { + return nullptr; + } + + img->tex = std::move(*tex_opt); + img->data = (std::uint8_t *)img->tex.texture; + + return img; + }; + + int dummy_img(platf::img_t *) override { + return 0; + } + + bool cursor_visible; + handle_t handle; + + NVFBC_CREATE_CAPTURE_SESSION_PARAMS capture_params; +}; +} // namespace nvfbc +} // namespace cuda + +namespace platf { +std::shared_ptr nvfbc_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) { + if(hwdevice_type != mem_type_e::cuda) { + BOOST_LOG(error) << "Could not initialize nvfbc display with the given hw device type"sv; + return nullptr; + } + + auto display = std::make_shared(); + + if(display->init(display_name, framerate)) { + return nullptr; + } + + return display; +} + std::vector nvfbc_display_names() { - if(init()) { + if(cuda::init() || cuda::nvfbc::init()) { return {}; } std::vector display_names; - auto status = createInstance(&func); - if(status) { - BOOST_LOG(error) << "Unable to create NvFBC instance"sv; - return {}; - } - - auto handle = handle_t::make(); + auto handle = cuda::nvfbc::handle_t::make(); if(!handle) { return {}; } @@ -500,7 +638,18 @@ std::vector nvfbc_display_names() { BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv; BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h; + BOOST_LOG(info) << "XrandR: "sv << (status_params->bXRandRAvailable ? "available"sv : "unavailable"sv); + + for(auto x = 0; x < status_params->dwOutputNum; ++x) { + auto &output = status_params->outputs[x]; + BOOST_LOG(info) << "-- Output --"sv; + BOOST_LOG(debug) << " ID: "sv << output.dwId; + BOOST_LOG(debug) << " Name: "sv << output.name; + BOOST_LOG(info) << " Resolution: "sv << output.trackedBox.w << 'x' << output.trackedBox.h; + BOOST_LOG(info) << " Offset: "sv << output.trackedBox.x << 'x' << output.trackedBox.y; + display_names.emplace_back(std::to_string(x)); + } return display_names; } -} // namespace platf::nvfbc \ No newline at end of file +} // namespace platf \ No newline at end of file diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index e310964c..68368095 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -21,6 +21,9 @@ using namespace std::literals; #define CU_CHECK_PTR(x, y) \ if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr; +#define CU_CHECK_OPT(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return std::nullopt; + #define CU_CHECK_IGNORE(x, y) \ check((x), SUNSHINE_STRINGVIEW(y ": ")) @@ -165,15 +168,21 @@ __global__ void RGBA_to_NV12( dstY[1] = calcY(rgb_r, color_matrix) * 245.0f; // 245.0f is a magic number to ensure slight changes in luminosity are more visisble } -sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix) - : array {}, texture { INVALID_TEXTURE }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } { - auto format = cudaCreateChannelDesc(); +int tex_t::copy(std::uint8_t *src, int height, int pitch) { + CU_CHECK(cudaMemcpy2DToArray(array, 0, 0, src, pitch, pitch, height, cudaMemcpyDeviceToDevice), "Couldn't copy to cuda array from deviceptr"); - CU_CHECK_VOID(cudaMallocArray(&array, &format, pitch, in_height, cudaArrayDefault), "Couldn't allocate cuda array"); + return 0; +} + +std::optional tex_t::make(int height, int pitch) { + tex_t tex; + + auto format = cudaCreateChannelDesc(); + CU_CHECK_OPT(cudaMallocArray(&tex.array, &format, pitch, height, cudaArrayDefault), "Couldn't allocate cuda array"); cudaResourceDesc res {}; res.resType = cudaResourceTypeArray; - res.res.array.array = array; + res.res.array.array = tex.array; cudaTextureDesc desc {}; @@ -183,9 +192,40 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp); - CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture"); + CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture, &res, &desc, nullptr), "Couldn't create cuda texture"); + return std::move(tex); +} +tex_t::tex_t() : array { }, texture { INVALID_TEXTURE } {} +tex_t::tex_t(tex_t &&other) : array { other.array }, texture { other.texture } { + other.array = 0; + other.texture = INVALID_TEXTURE; +} + +tex_t &tex_t::operator=(tex_t &&other) { + std::swap(array, other.array); + std::swap(texture, other.texture); + + return *this; +} + +tex_t::~tex_t() { + if(texture != INVALID_TEXTURE) { + CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture"); + + texture = INVALID_TEXTURE; + } + + if(array) { + CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array"); + + array = cudaArray_t {}; + } +} + +sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix) + : threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } { // Ensure aspect ratio is maintained auto scalar = std::fminf(out_width / (float)in_width, out_height / (float)in_height); auto out_width_f = in_width * scalar; @@ -202,52 +242,32 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit viewport.offsetY = offsetY_f; } -sws_t::~sws_t() { - if(texture != INVALID_TEXTURE) { - CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture"); - - texture = INVALID_TEXTURE; - } - - if(array) { - CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array"); - - array = cudaArray_t {}; - } -} - -std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) { +std::optional sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) { cudaDeviceProp props; int device; - CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device"); - CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties"); + CU_CHECK_OPT(cudaGetDevice(&device), "Couldn't get cuda device"); + CU_CHECK_OPT(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties"); auto ptr = make_ptr(); if(!ptr) { - return nullptr; + return std::nullopt; } - auto sws = std::make_unique(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr)); - - if(sws->texture == INVALID_TEXTURE) { - return nullptr; - } - - return sws; + return std::make_optional(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr)); } -int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) { - return convert(Y, UV, pitchY, pitchUV, viewport); +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) { + return convert(Y, UV, pitchY, pitchUV, texture, viewport); } -int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport) { +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) { int threadsX = viewport.width / 2; int threadsY = viewport.height; dim3 block(threadsPerBlock, threadsPerBlock); dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock)); - RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t*)color_matrix.get()); + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get()); return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); } @@ -274,7 +294,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) CU_CHECK_IGNORE(cudaMemcpy(color_matrix.get(), color_p, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda"); } -int sws_t::load_ram(platf::img_t &img) { +int sws_t::load_ram(platf::img_t &img, cudaArray_t array) { return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array"); } diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index 27dede07..08cb0156 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -1,15 +1,16 @@ #ifndef SUNSHINE_PLATFORM_CUDA_H #define SUNSHINE_PLATFORM_CUDA_H -#include - #ifndef __NVCC__ #include "sunshine/platform/common.h" #include "x11grab.h" namespace cuda { -std::shared_ptr make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay); +namespace nvfbc { +std::vector display_names(); +} +std::shared_ptr make_hwdevice(int width, int height, bool vram); int init(); } // namespace cuda @@ -41,9 +42,26 @@ struct viewport_t { int offsetX, offsetY; }; +class tex_t { +public: + static std::optional make(int height, int pitch); + + tex_t(); + tex_t(tex_t &&); + + tex_t &operator=(tex_t &&other); + + ~tex_t(); + + int copy(std::uint8_t *src, int height, int pitch); + + cudaArray_t array; + cudaTextureObject_t texture; +}; + class sws_t { public: - ~sws_t(); + sws_t() = default; sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix); /** @@ -52,19 +70,17 @@ public: * * pitch -- The size of a single row of pixels in bytes */ - static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height, int pitch); + static std::optional make(int in_width, int in_height, int out_width, int out_height, int pitch); // Converts loaded image into a CUDevicePtr - int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV); - int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport); void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range); - int load_ram(platf::img_t &img); + int load_ram(platf::img_t &img, cudaArray_t array); ptr_t color_matrix; - cudaArray_t array; - cudaTextureObject_t texture; int threadsPerBlock; diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp index ad5afc5a..e10242ff 100644 --- a/sunshine/platform/linux/misc.cpp +++ b/sunshine/platform/linux/misc.cpp @@ -141,6 +141,9 @@ std::string get_mac_address(const std::string_view &address) { } enum class source_e { + // #ifdef SUNSHINE_BUILD_CUDA + NVFBC, +// #endif #ifdef SUNSHINE_BUILD_WAYLAND WAYLAND, #endif @@ -153,6 +156,15 @@ enum class source_e { }; static source_e source; +// #ifdef SUNSHINE_BUILD_CUDA +std::vector nvfbc_display_names(); +std::shared_ptr nvfbc_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate); + +bool verify_nvfbc() { + return !nvfbc_display_names().empty(); +} +// #endif + #ifdef SUNSHINE_BUILD_WAYLAND std::vector wl_display_names(); std::shared_ptr wl_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate); @@ -182,6 +194,10 @@ bool verify_x11() { std::vector display_names() { switch(source) { + // #ifdef SUNSHINE_BUILD_CUDA + case source_e::NVFBC: + return nvfbc_display_names(); + // #endif #ifdef SUNSHINE_BUILD_WAYLAND case source_e::WAYLAND: return wl_display_names(); @@ -201,6 +217,10 @@ std::vector display_names() { std::shared_ptr display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) { switch(source) { + // #ifdef SUNSHINE_BUILD_CUDA + case source_e::NVFBC: + return nvfbc_display(hwdevice_type, display_name, framerate); + // #endif #ifdef SUNSHINE_BUILD_WAYLAND case source_e::WAYLAND: return wl_display(hwdevice_type, display_name, framerate); @@ -229,7 +249,7 @@ std::unique_ptr init() { window_system = window_system_e::WAYLAND; } #endif -#ifdef SUNSHINE_BUILD_X11 +#if defined(SUNSHINE_BUILD_X11) // || defined(SUNSHINE_BUILD_CUDA) if(std::getenv("DISPLAY") && window_system != window_system_e::WAYLAND) { if(std::getenv("WAYLAND_DISPLAY")) { BOOST_LOG(warning) << "Wayland detected, yet sunshine will use X11 for screencasting, screencasting will only work on XWayland applications"sv; @@ -238,6 +258,13 @@ std::unique_ptr init() { window_system = window_system_e::X11; } #endif +// #ifdef SUNSHINE_BUILD_CUDA + if(verify_nvfbc()) { + BOOST_LOG(info) << "Using nvFBC for screencasting"sv; + source = source_e::NVFBC; + goto found_source; + } +// #endif #ifdef SUNSHINE_BUILD_WAYLAND if(verify_wl()) { BOOST_LOG(info) << "Using Wayland for screencasting"sv; diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp index c8bc70f0..decf24aa 100644 --- a/sunshine/platform/linux/x11grab.cpp +++ b/sunshine/platform/linux/x11grab.cpp @@ -518,7 +518,7 @@ struct x11_attr_t : public display_t { } if(mem_type == mem_type_e::cuda) { - return cuda::make_hwdevice(width, height, xdisplay.get()); + return cuda::make_hwdevice(width, height, false); } return std::make_shared(); @@ -678,7 +678,7 @@ struct shm_attr_t : public x11_attr_t { std::shared_ptr x11_display(platf::mem_type_e hwdevice_type, const std::string &display_name, int framerate) { if(hwdevice_type != platf::mem_type_e::system && hwdevice_type != platf::mem_type_e::vaapi && hwdevice_type != platf::mem_type_e::cuda) { - BOOST_LOG(error) << "Could not initialize display with the given hw device type."sv; + BOOST_LOG(error) << "Could not initialize x11 display with the given hw device type"sv; return nullptr; } diff --git a/sunshine/video.cpp b/sunshine/video.cpp index 3b143cbe..6faf82ee 100644 --- a/sunshine/video.cpp +++ b/sunshine/video.cpp @@ -1699,7 +1699,7 @@ util::Either vaapi_make_hwdevice_ctx(platf::hwdevice_t *base) { util::Either cuda_make_hwdevice_ctx(platf::hwdevice_t *base) { buffer_t hw_device_buf; - auto status = av_hwdevice_ctx_create(&hw_device_buf, AV_HWDEVICE_TYPE_CUDA, nullptr, nullptr, 0); + auto status = av_hwdevice_ctx_create(&hw_device_buf, AV_HWDEVICE_TYPE_CUDA, nullptr, nullptr, 1 /* AV_CUDA_USE_PRIMARY_CONTEXT */); if(status < 0) { char string[AV_ERROR_MAX_STRING_SIZE]; BOOST_LOG(error) << "Failed to create a CUDA device: "sv << av_make_error_string(string, AV_ERROR_MAX_STRING_SIZE, status); From d0529fb2347b878221196fb2cc8fd7c2690ee6ef Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Wed, 22 Sep 2021 14:17:08 +0200 Subject: [PATCH 11/27] Make dependency on cuda development files optional --- CMakeLists.txt | 44 ++++++++++++++++++++--------- sunshine/platform/linux/cuda.cpp | 1 - sunshine/platform/linux/cuda.h | 18 ++++++------ sunshine/platform/linux/misc.cpp | 22 +++++++-------- sunshine/platform/linux/x11grab.cpp | 4 ++- 5 files changed, 53 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index df0424d5..d35512f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,17 +107,28 @@ else() option(SUNSHINE_ENABLE_DRM "Enable KMS grab if available" ON) option(SUNSHINE_ENABLE_X11 "Enable X11 grab if available" ON) option(SUNSHINE_ENABLE_WAYLAND "Enable building wayland specific code" ON) - - if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 75) - endif() - enable_language(CUDA) + option(SUNSHINE_ENABLE_CUDA "Enable cuda specific code" ON) if(${SUNSHINE_ENABLE_X11}) find_package(X11) else() set(X11_FOUND OFF) endif() + + set(CUDA_FOUND OFF) + if(${SUNSHINE_ENABLE_CUDA}) + include(CheckLanguage) + check_language(CUDA) + + if(CMAKE_CUDA_COMPILER) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 75) + endif() + + set(CUDA_FOUND ON) + enable_language(CUDA) + endif() + endif() if(${SUNSHINE_ENABLE_DRM}) find_package(LIBDRM) find_package(LIBCAP) @@ -138,6 +149,17 @@ else() include_directories(${X11_INCLUDE_DIR}) list(APPEND PLATFORM_TARGET_FILES sunshine/platform/linux/x11grab.cpp) endif() + + if(CUDA_FOUND) + include_directories(third-party/nvfbc) + list(APPEND PLATFORM_TARGET_FILES + sunshine/platform/linux/cuda.cu + sunshine/platform/linux/cuda.cpp + third-party/nvfbc/NvFBC.h) + + add_compile_definitions(SUNSHINE_BUILD_CUDA) + endif() + if(LIBDRM_FOUND AND LIBCAP_FOUND) add_compile_definitions(SUNSHINE_BUILD_DRM) include_directories(${LIBDRM_INCLUDE_DIRS} ${LIBCAP_INCLUDE_DIRS}) @@ -187,16 +209,14 @@ else() sunshine/platform/linux/wlgrab.cpp sunshine/platform/linux/wayland.cpp) endif() - if(NOT ${X11_FOUND} AND NOT (${LIBDRM_FOUND} AND ${LIBCAP_FOUND}) AND NOT ${WAYLAND_FOUND}) - message(FATAL_ERROR "Couldn't find either x11, wayland or (libdrm and libcap)") + if(NOT ${X11_FOUND} AND NOT (${LIBDRM_FOUND} AND ${LIBCAP_FOUND}) AND NOT ${WAYLAND_FOUND} AND NOT ${}) + message(FATAL_ERROR "Couldn't find either x11, wayland, cuda or (libdrm and libcap)") endif() list(APPEND PLATFORM_TARGET_FILES sunshine/platform/linux/publish.cpp sunshine/platform/linux/vaapi.h sunshine/platform/linux/vaapi.cpp - sunshine/platform/linux/cuda.cu - sunshine/platform/linux/cuda.cpp sunshine/platform/linux/cuda.h sunshine/platform/linux/graphics.h sunshine/platform/linux/graphics.cpp @@ -211,8 +231,7 @@ else() third-party/glad/include/EGL/eglplatform.h third-party/glad/include/KHR/khrplatform.h third-party/glad/include/glad/gl.h - third-party/glad/include/glad/egl.h - third-party/nvfbc/NvFBC.h) + third-party/glad/include/glad/egl.h) list(APPEND PLATFORM_LIBRARIES dl @@ -224,8 +243,7 @@ else() include_directories( /usr/include/libevdev-1.0 third-party/nv-codec-headers/include - third-party/glad/include - third-party/nvfbc) + third-party/glad/include) if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH) set(SUNSHINE_EXECUTABLE_PATH "sunshine") diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index 25d88518..65bd07ee 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -14,7 +14,6 @@ extern "C" { #include "sunshine/main.h" #include "sunshine/utility.h" #include "wayland.h" -#include "x11grab.h" #define SUNSHINE_STRINGVIEW_HELPER(x) x##sv #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x) diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index 08cb0156..7e377f20 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -1,10 +1,14 @@ -#ifndef SUNSHINE_PLATFORM_CUDA_H +#if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA) #define SUNSHINE_PLATFORM_CUDA_H -#ifndef __NVCC__ +#include +#include +#include -#include "sunshine/platform/common.h" -#include "x11grab.h" +namespace platf { + class hwdevice_t; + class img_t; +} namespace cuda { namespace nvfbc { @@ -14,12 +18,6 @@ std::shared_ptr make_hwdevice(int width, int height, bool vra int init(); } // namespace cuda -#else -namespace platf { -class img_t; -} -#endif - typedef struct cudaArray *cudaArray_t; #if !defined(__CUDACC__) diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp index e10242ff..bf6201b3 100644 --- a/sunshine/platform/linux/misc.cpp +++ b/sunshine/platform/linux/misc.cpp @@ -141,9 +141,9 @@ std::string get_mac_address(const std::string_view &address) { } enum class source_e { - // #ifdef SUNSHINE_BUILD_CUDA +#ifdef SUNSHINE_BUILD_CUDA NVFBC, -// #endif +#endif #ifdef SUNSHINE_BUILD_WAYLAND WAYLAND, #endif @@ -156,14 +156,14 @@ enum class source_e { }; static source_e source; -// #ifdef SUNSHINE_BUILD_CUDA +#ifdef SUNSHINE_BUILD_CUDA std::vector nvfbc_display_names(); std::shared_ptr nvfbc_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate); bool verify_nvfbc() { return !nvfbc_display_names().empty(); } -// #endif +#endif #ifdef SUNSHINE_BUILD_WAYLAND std::vector wl_display_names(); @@ -194,10 +194,10 @@ bool verify_x11() { std::vector display_names() { switch(source) { - // #ifdef SUNSHINE_BUILD_CUDA +#ifdef SUNSHINE_BUILD_CUDA case source_e::NVFBC: return nvfbc_display_names(); - // #endif +#endif #ifdef SUNSHINE_BUILD_WAYLAND case source_e::WAYLAND: return wl_display_names(); @@ -217,10 +217,10 @@ std::vector display_names() { std::shared_ptr display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) { switch(source) { - // #ifdef SUNSHINE_BUILD_CUDA +#ifdef SUNSHINE_BUILD_CUDA case source_e::NVFBC: return nvfbc_display(hwdevice_type, display_name, framerate); - // #endif +#endif #ifdef SUNSHINE_BUILD_WAYLAND case source_e::WAYLAND: return wl_display(hwdevice_type, display_name, framerate); @@ -249,7 +249,7 @@ std::unique_ptr init() { window_system = window_system_e::WAYLAND; } #endif -#if defined(SUNSHINE_BUILD_X11) // || defined(SUNSHINE_BUILD_CUDA) +#if defined(SUNSHINE_BUILD_X11) || defined(SUNSHINE_BUILD_CUDA) if(std::getenv("DISPLAY") && window_system != window_system_e::WAYLAND) { if(std::getenv("WAYLAND_DISPLAY")) { BOOST_LOG(warning) << "Wayland detected, yet sunshine will use X11 for screencasting, screencasting will only work on XWayland applications"sv; @@ -258,13 +258,13 @@ std::unique_ptr init() { window_system = window_system_e::X11; } #endif -// #ifdef SUNSHINE_BUILD_CUDA +#ifdef SUNSHINE_BUILD_CUDA if(verify_nvfbc()) { BOOST_LOG(info) << "Using nvFBC for screencasting"sv; source = source_e::NVFBC; goto found_source; } -// #endif +#endif #ifdef SUNSHINE_BUILD_WAYLAND if(verify_wl()) { BOOST_LOG(info) << "Using Wayland for screencasting"sv; diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp index decf24aa..eb303d46 100644 --- a/sunshine/platform/linux/x11grab.cpp +++ b/sunshine/platform/linux/x11grab.cpp @@ -20,11 +20,11 @@ #include "sunshine/main.h" #include "sunshine/task_pool.h" +#include "cuda.h" #include "graphics.h" #include "misc.h" #include "vaapi.h" #include "x11grab.h" -#include "cuda.h" using namespace std::literals; @@ -517,9 +517,11 @@ struct x11_attr_t : public display_t { return va::make_hwdevice(width, height, false); } +#ifdef SUNSHINE_BUILD_CUDA if(mem_type == mem_type_e::cuda) { return cuda::make_hwdevice(width, height, false); } +#endif return std::make_shared(); } From b3304a059d9c8827482eb9b9c55185e372945208 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Wed, 22 Sep 2021 14:49:49 +0200 Subject: [PATCH 12/27] Target older cuda architecture for compatibility --- CMakeLists.txt | 2 +- sunshine/platform/linux/cuda.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d35512f4..2eefcdac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,7 +122,7 @@ else() if(CMAKE_CUDA_COMPILER) if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 75) + set(CMAKE_CUDA_ARCHITECTURES 35) endif() set(CUDA_FOUND ON) diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index 65bd07ee..b08b832a 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -461,7 +461,7 @@ public: } cursor_visible = cursor; - if(false && cursor) { + if(cursor) { capture_params.bPushModel = nv_bool(false); capture_params.bWithCursor = nv_bool(true); capture_params.bAllowDirectCapture = nv_bool(false); From bd7294e6728dfbac57b6155a73ba196af5e64cbd Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Wed, 22 Sep 2021 19:12:20 +0200 Subject: [PATCH 13/27] Fix cuda kernel launch when encoding in 4K --- sunshine/platform/linux/cuda.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index 68368095..74f99c09 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -264,10 +264,10 @@ int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std: int threadsX = viewport.width / 2; int threadsY = viewport.height; - dim3 block(threadsPerBlock, threadsPerBlock); - dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock)); + dim3 block(threadsPerBlock); + dim3 grid(div_align(threadsX, threadsPerBlock), threadsY); - RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get()); + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get()); return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); } From f78a9e2ccf48c722bae9117728d88a06345a8053 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sat, 25 Sep 2021 12:44:45 +0200 Subject: [PATCH 14/27] Fix downscaling image when using cuda --- sunshine/platform/linux/cuda.cu | 20 +++++++++++--------- sunshine/platform/linux/cuda.h | 2 ++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index 74f99c09..7bfb5eab 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -140,7 +140,7 @@ inline __device__ float calcY(float3 pixel, const video::color_t *const color_ma __global__ void RGBA_to_NV12( cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV, std::uint32_t dstPitchY, std::uint32_t dstPitchUV, - const viewport_t viewport, const video::color_t *const color_matrix) { + float scale, const viewport_t viewport, const video::color_t *const color_matrix) { int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2; int idY = (threadIdx.y + blockDim.y * blockIdx.y); @@ -148,8 +148,8 @@ __global__ void RGBA_to_NV12( if(idX >= viewport.width) return; if(idY >= viewport.height) return; - float x = (float)idX / (float)viewport.width / 4; - float y = (float)idY / (float)viewport.height; + float x = idX * scale; + float y = idY * scale; idX += viewport.offsetX; idY += viewport.offsetY; @@ -158,7 +158,7 @@ __global__ void RGBA_to_NV12( dstUV = dstUV + idX + (idY / 2 * dstPitchUV); float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); - float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height)); + float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + scale, y)); float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix) * 255.0f; @@ -188,7 +188,7 @@ std::optional tex_t::make(int height, int pitch) { desc.readMode = cudaReadModeNormalizedFloat; desc.filterMode = cudaFilterModeLinear; - desc.normalizedCoords = true; + desc.normalizedCoords = false; std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp); @@ -197,9 +197,9 @@ std::optional tex_t::make(int height, int pitch) { return std::move(tex); } -tex_t::tex_t() : array { }, texture { INVALID_TEXTURE } {} +tex_t::tex_t() : array {}, texture { INVALID_TEXTURE } {} tex_t::tex_t(tex_t &&other) : array { other.array }, texture { other.texture } { - other.array = 0; + other.array = 0; other.texture = INVALID_TEXTURE; } @@ -240,6 +240,8 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit viewport.offsetX = offsetX_f; viewport.offsetY = offsetY_f; + + scale = 1.0f / scalar; } std::optional sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) { @@ -253,7 +255,7 @@ std::optional sws_t::make(int in_width, int in_height, int out_width, int return std::nullopt; } - return std::make_optional(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr)); + return std::make_optional(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr)); } int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) { @@ -267,7 +269,7 @@ int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std: dim3 block(threadsPerBlock); dim3 grid(div_align(threadsX, threadsPerBlock), threadsY); - RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get()); + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get()); return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); } diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index 7e377f20..d55ab8d0 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -83,6 +83,8 @@ public: int threadsPerBlock; viewport_t viewport; + + float scale; }; } // namespace cuda From e287404992822cb22bd81cca6d3d12dcb9b1bdb5 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sat, 25 Sep 2021 14:44:38 +0200 Subject: [PATCH 15/27] Handle acquiring display names based on encoder --- sunshine/platform/common.h | 4 +- sunshine/platform/linux/misc.cpp | 67 +++++++++++----------- sunshine/platform/windows/display_base.cpp | 2 +- sunshine/video.cpp | 38 ++++++------ 4 files changed, 54 insertions(+), 57 deletions(-) diff --git a/sunshine/platform/common.h b/sunshine/platform/common.h index 4b702a9b..9d465749 100644 --- a/sunshine/platform/common.h +++ b/sunshine/platform/common.h @@ -286,8 +286,8 @@ std::unique_ptr audio_control(); */ std::shared_ptr display(mem_type_e hwdevice_type, const std::string &display_name, int framerate); -// A list of names of displays accepted as display_name -std::vector display_names(); +// A list of names of displays accepted as display_name with the mem_type_e +std::vector display_names(mem_type_e hwdevice_type); input_t input(); void move_mouse(input_t &input, int deltaX, int deltaY); diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp index bf6201b3..dd114ec4 100644 --- a/sunshine/platform/linux/misc.cpp +++ b/sunshine/platform/linux/misc.cpp @@ -140,7 +140,8 @@ std::string get_mac_address(const std::string_view &address) { return "00:00:00:00:00:00"s; } -enum class source_e { +namespace source { +enum source_e : std::size_t { #ifdef SUNSHINE_BUILD_CUDA NVFBC, #endif @@ -153,8 +154,11 @@ enum class source_e { #ifdef SUNSHINE_BUILD_X11 X11, #endif + MAX_FLAGS }; -static source_e source; +} // namespace source + +static std::bitset sources; #ifdef SUNSHINE_BUILD_CUDA std::vector nvfbc_display_names(); @@ -192,48 +196,48 @@ bool verify_x11() { } #endif -std::vector display_names() { - switch(source) { +std::vector display_names(mem_type_e hwdevice_type) { #ifdef SUNSHINE_BUILD_CUDA - case source_e::NVFBC: - return nvfbc_display_names(); + // display using NvFBC only supports mem_type_e::cuda + if(sources[source::NVFBC] && hwdevice_type == mem_type_e::cuda) return nvfbc_display_names(); #endif #ifdef SUNSHINE_BUILD_WAYLAND - case source_e::WAYLAND: - return wl_display_names(); + if(sources[source::WAYLAND]) return wl_display_names(); #endif #ifdef SUNSHINE_BUILD_DRM - case source_e::KMS: - return kms_display_names(); + if(sources[source::KMS]) return kms_display_names(); #endif #ifdef SUNSHINE_BUILD_X11 - case source_e::X11: - return x11_display_names(); + if(sources[source::X11]) return x11_display_names(); #endif - } - return {}; } std::shared_ptr display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) { - switch(source) { #ifdef SUNSHINE_BUILD_CUDA - case source_e::NVFBC: + if(sources[source::NVFBC] && hwdevice_type == mem_type_e::cuda) { + BOOST_LOG(info) << "Screencasting with NvFBC"sv; return nvfbc_display(hwdevice_type, display_name, framerate); + } #endif #ifdef SUNSHINE_BUILD_WAYLAND - case source_e::WAYLAND: + if(sources[source::WAYLAND]) { + BOOST_LOG(info) << "Screencasting with Wayland's protocol"sv; return wl_display(hwdevice_type, display_name, framerate); + } #endif #ifdef SUNSHINE_BUILD_DRM - case source_e::KMS: + if(sources[source::KMS]) { + BOOST_LOG(info) << "Screencasting with KMS"sv; return kms_display(hwdevice_type, display_name, framerate); + } #endif #ifdef SUNSHINE_BUILD_X11 - case source_e::X11: + if(sources[source::X11]) { + BOOST_LOG(info) << "Screencasting with X11"sv; return x11_display(hwdevice_type, display_name, framerate); -#endif } +#endif return nullptr; } @@ -260,16 +264,14 @@ std::unique_ptr init() { #endif #ifdef SUNSHINE_BUILD_CUDA if(verify_nvfbc()) { - BOOST_LOG(info) << "Using nvFBC for screencasting"sv; - source = source_e::NVFBC; - goto found_source; + BOOST_LOG(info) << "Using NvFBC for screencasting"sv; + sources[source::NVFBC] = true; } #endif #ifdef SUNSHINE_BUILD_WAYLAND if(verify_wl()) { BOOST_LOG(info) << "Using Wayland for screencasting"sv; - source = source_e::WAYLAND; - goto found_source; + sources[source::WAYLAND] = true; } #endif #ifdef SUNSHINE_BUILD_DRM @@ -281,23 +283,20 @@ std::unique_ptr init() { } BOOST_LOG(info) << "Using KMS for screencasting"sv; - source = source_e::KMS; - goto found_source; + sources[source::KMS] = true; } #endif #ifdef SUNSHINE_BUILD_X11 if(verify_x11()) { BOOST_LOG(info) << "Using X11 for screencasting"sv; - source = source_e::X11; - goto found_source; + sources[source::X11] = true; } #endif - // Did not find a source - return nullptr; -// Normally, I would simply use if-else statements to achieve this result, -// but due to the macro's, (*spits on ground*), it would be too messy -found_source: + if(sources.none()) { + return nullptr; + } + if(!gladLoaderLoadEGL(EGL_NO_DISPLAY) || !eglGetPlatformDisplay) { BOOST_LOG(warning) << "Couldn't load EGL library"sv; } diff --git a/sunshine/platform/windows/display_base.cpp b/sunshine/platform/windows/display_base.cpp index f816ad88..02b08c37 100644 --- a/sunshine/platform/windows/display_base.cpp +++ b/sunshine/platform/windows/display_base.cpp @@ -452,7 +452,7 @@ std::shared_ptr display(mem_type_e hwdevice_type, const std::string & return nullptr; } -std::vector display_names() { +std::vector display_names(mem_type_e) { std::vector display_names; HRESULT status; diff --git a/sunshine/video.cpp b/sunshine/video.cpp index 6faf82ee..f4a4c523 100644 --- a/sunshine/video.cpp +++ b/sunshine/video.cpp @@ -585,7 +585,7 @@ void captureThread( // Get all the monitor names now, rather than at boot, to // get the most up-to-date list available monitors - auto display_names = platf::display_names(); + auto display_names = platf::display_names(map_dev_type(encoder.dev_type)); int display_p = 0; if(display_names.empty()) { @@ -1105,17 +1105,30 @@ std::optional make_synced_session(platf::display_t *disp, const return std::nullopt; } - encode_session.session = std::move(*session); + encode_session.session = std::move(*session); return std::move(encode_session); } encode_e encode_run_sync( std::vector> &synced_session_ctxs, - encode_session_ctx_queue_t &encode_session_ctx_queue, - int &display_p, const std::vector &display_names) { + encode_session_ctx_queue_t &encode_session_ctx_queue) { const auto &encoder = encoders.front(); + auto display_names = platf::display_names(map_dev_type(encoder.dev_type)); + int display_p = 0; + + if(display_names.empty()) { + display_names.emplace_back(config::video.output_name); + } + + for(int x = 0; x < display_names.size(); ++x) { + if(display_names[x] == config::video.output_name) { + display_p = x; + + break; + } + } std::shared_ptr disp; @@ -1269,22 +1282,7 @@ void captureThreadSync() { } }); - auto display_names = platf::display_names(); - int display_p = 0; - - if(display_names.empty()) { - display_names.emplace_back(config::video.output_name); - } - - for(int x = 0; x < display_names.size(); ++x) { - if(display_names[x] == config::video.output_name) { - display_p = x; - - break; - } - } - - while(encode_run_sync(synced_session_ctxs, ctx, display_p, display_names) == encode_e::reinit) {} + while(encode_run_sync(synced_session_ctxs, ctx) == encode_e::reinit) {} } void capture_async( From d7cb71f877f88dae6fdc2a87a12f5d6970ff99d2 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sat, 25 Sep 2021 15:35:44 +0200 Subject: [PATCH 16/27] Update README --- README.md | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 69bf2c47..8be5438c 100644 --- a/README.md +++ b/README.md @@ -17,27 +17,35 @@ Sunshine is a Gamestream host for Moonlight Ubuntu 20.04: Install the following: -#### X11 Only + +#### Common ``` -sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev +sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev +``` +#### X11 +``` +sudo apt install libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev ``` -#### X11 + KMS (Requires additional setup) -KMS allows Sunshine to grab the monitor with lower latency then through X11 - +#### KMS +This requires additional [setup](README.md#Setup). ``` -sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libdrm-dev libcap-dev +sudo apt install libdrm-dev libcap-dev +``` + +#### Wayland +This is for wlroots based compositores, such as Sway +``` +sudo apt install libwayland-dev +``` + +#### Cuda + NvFBC +This requires proprietary software +``` +sudo apt install nvidia-cuda-dev nvidia-cuda-toolkit ``` ### Compilation: - -#### X11 Only -- `git clone https://github.com/loki-47-6F-64/sunshine.git --recurse-submodules` -- `cd sunshine && mkdir build && cd build` -- `cmake -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 -DSUNSHINE_ENABLE_DRM=OFF ..` -- `make -j ${nproc}` - -#### X11 + KMS - `git clone https://github.com/loki-47-6F-64/sunshine.git --recurse-submodules` - `cd sunshine && mkdir build && cd build` - `cmake -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 ..` From 50bd3094b4fe3ebb34c5102cd99d348ccb895a19 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sat, 25 Sep 2021 17:46:26 +0200 Subject: [PATCH 17/27] More accurate capture rate for NvFBC --- sunshine/platform/linux/cuda.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index b08b832a..ac6680b8 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -393,6 +393,8 @@ public: } } + delay = std::chrono::nanoseconds { 1s } / framerate; + capture_params = NVFBC_CREATE_CAPTURE_SESSION_PARAMS { NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER }; capture_params.eCaptureType = NVFBC_CAPTURE_SHARED_CUDA; @@ -426,6 +428,8 @@ public: } platf::capture_e capture(snapshot_cb_t &&snapshot_cb, std::shared_ptr img, bool *cursor) override { + auto next_frame = std::chrono::steady_clock::now(); + // Force display_t::capture to initialize handle_t::capture cursor_visible = !*cursor; @@ -434,7 +438,17 @@ public: }); while(img) { - auto status = snapshot(img.get(), 500ms, *cursor); + auto now = std::chrono::steady_clock::now(); + if(next_frame > now) { + std::this_thread::sleep_for((next_frame - now) / 3 * 2); + } + while(next_frame > now) { + std::this_thread::sleep_for(1ns); + now = std::chrono::steady_clock::now(); + } + next_frame = now + delay; + + auto status = snapshot(img.get(), 150ms, *cursor); switch(status) { case platf::capture_e::reinit: case platf::capture_e::error: @@ -552,10 +566,6 @@ public: return platf::capture_e::error; } - if(!info.bIsNewFrame) { - return platf::capture_e::timeout; - } - if(((img_t *)img)->tex.copy((std::uint8_t *)device_ptr, img->height, img->row_pitch)) { return platf::capture_e::error; } @@ -590,6 +600,8 @@ public: return 0; } + std::chrono::nanoseconds delay; + bool cursor_visible; handle_t handle; From fcb84132f43b074e3efbe3c29870d9e62d959319 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sat, 25 Sep 2021 19:18:43 +0200 Subject: [PATCH 18/27] Sharper images when not scaling the image --- sunshine/platform/linux/cuda.cpp | 23 ++++++++++++++++------- sunshine/platform/linux/cuda.cu | 25 ++++++++++++++++++------- sunshine/platform/linux/cuda.h | 6 +++++- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index ac6680b8..b15104a5 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -112,6 +112,8 @@ public: sws = std::move(*sws_opt); + linear_interpolation = width != frame->width || height != frame->height; + return 0; } @@ -140,20 +142,27 @@ public: return; } - sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture, { frame->width, frame->height, 0, 0 }); + sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 }); + } + + cudaTextureObject_t tex_obj(const tex_t &tex) const { + return linear_interpolation ? tex.texture.linear : tex.texture.point; } frame_t hwframe; int width, height; + // When heigth and width don't change, it's not necessary to use linear interpolation + bool linear_interpolation; + sws_t sws; }; class cuda_ram_t : public cuda_t { public: int convert(platf::img_t &img) override { - return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex.texture); + return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex)); } int set_frame(AVFrame *frame) { @@ -177,7 +186,7 @@ public: class cuda_vram_t : public cuda_t { public: int convert(platf::img_t &img) override { - return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], (cudaTextureObject_t)img.data); + return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex)); } }; @@ -497,7 +506,7 @@ public: NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab { NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER, - NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY, + NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT, &device_ptr, &info, 0, @@ -551,7 +560,7 @@ public: NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab { NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER, - NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY, + NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT, &device_ptr, &info, (std::uint32_t)timeout.count(), @@ -580,6 +589,7 @@ public: std::shared_ptr alloc_img() override { auto img = std::make_shared(); + img->data = nullptr; img->width = width; img->height = height; img->pixel_pitch = 4; @@ -590,8 +600,7 @@ public: return nullptr; } - img->tex = std::move(*tex_opt); - img->data = (std::uint8_t *)img->tex.texture; + img->tex = std::move(*tex_opt); return img; }; diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index 7bfb5eab..49f088f8 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -160,7 +160,7 @@ __global__ void RGBA_to_NV12( float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + scale, y)); - float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix) * 255.0f; + float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix) * 256.0f; dstUV[0] = uv.x; dstUV[1] = uv.y; @@ -187,12 +187,16 @@ std::optional tex_t::make(int height, int pitch) { cudaTextureDesc desc {}; desc.readMode = cudaReadModeNormalizedFloat; - desc.filterMode = cudaFilterModeLinear; + desc.filterMode = cudaFilterModePoint; desc.normalizedCoords = false; std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp); - CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture, &res, &desc, nullptr), "Couldn't create cuda texture"); + CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture.point, &res, &desc, nullptr), "Couldn't create cuda texture that uses point interpolation"); + + desc.filterMode = cudaFilterModeLinear; + + CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture.linear, &res, &desc, nullptr), "Couldn't create cuda texture that uses linear interpolation"); return std::move(tex); } @@ -200,7 +204,8 @@ std::optional tex_t::make(int height, int pitch) { tex_t::tex_t() : array {}, texture { INVALID_TEXTURE } {} tex_t::tex_t(tex_t &&other) : array { other.array }, texture { other.texture } { other.array = 0; - other.texture = INVALID_TEXTURE; + other.texture.point = INVALID_TEXTURE; + other.texture.linear = INVALID_TEXTURE; } tex_t &tex_t::operator=(tex_t &&other) { @@ -211,10 +216,16 @@ tex_t &tex_t::operator=(tex_t &&other) { } tex_t::~tex_t() { - if(texture != INVALID_TEXTURE) { - CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture"); + if(texture.point != INVALID_TEXTURE) { + CU_CHECK_IGNORE(cudaDestroyTextureObject(texture.point), "Couldn't deallocate cuda texture that uses point interpolation"); - texture = INVALID_TEXTURE; + texture.point = INVALID_TEXTURE; + } + + if(texture.linear != INVALID_TEXTURE) { + CU_CHECK_IGNORE(cudaDestroyTextureObject(texture.linear), "Couldn't deallocate cuda texture that uses linear interpolation"); + + texture.linear = INVALID_TEXTURE; } if(array) { diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index d55ab8d0..5811379f 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -54,7 +54,11 @@ public: int copy(std::uint8_t *src, int height, int pitch); cudaArray_t array; - cudaTextureObject_t texture; + + struct texture { + cudaTextureObject_t point; + cudaTextureObject_t linear; + } texture; }; class sws_t { From deecd19af256edc51046c39ea4a36b398f23ca07 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sat, 25 Sep 2021 21:50:47 +0200 Subject: [PATCH 19/27] Update appveyor.yml --- CMakeLists.txt | 3 ++- appveyor.yml | 2 +- gen-deb.in | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2eefcdac..ba90985b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,6 @@ if(WIN32) endif() add_subdirectory(third-party/moonlight-common-c/enet) add_subdirectory(third-party/Simple-Web-Server) -add_subdirectory(third-party/cbs) set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries") set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc") @@ -311,6 +310,8 @@ include_directories( ${PLATFORM_INCLUDE_DIRS} ) +add_subdirectory(third-party/cbs) + string(TOUPPER "x${CMAKE_BUILD_TYPE}" BUILD_TYPE) if("${BUILD_TYPE}" STREQUAL "XDEBUG") list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -ggdb3) diff --git a/appveyor.yml b/appveyor.yml index ce60cc59..75ad5c77 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -9,7 +9,7 @@ environment: install: - sh: sudo apt update --ignore-missing - - sh: sudo apt install -y build-essential fakeroot gcc-10 g++-10 cmake libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libdrm-dev libcap-dev + - sh: sudo apt install -y build-essential fakeroot gcc-10 g++-10 cmake libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libdrm-dev libcap-dev nvidia-cuda-dev nvidia-cuda-toolkit - cmd: C:\msys64\usr\bin\bash -lc "pacman --needed --noconfirm -S mingw-w64-x86_64-openssl mingw-w64-x86_64-cmake mingw-w64-x86_64-toolchain mingw-w64-x86_64-opus mingw-w64-x86_64-x265 mingw-w64-x86_64-boost git yasm nasm diffutils make" before_build: diff --git a/gen-deb.in b/gen-deb.in index 9ddbf2a6..70da08d2 100755 --- a/gen-deb.in +++ b/gen-deb.in @@ -37,7 +37,7 @@ Package: sunshine Architecture: amd64 Maintainer: @loki Priority: optional -Version: 0.10.2 +Version: 0.11.0 Depends: libssl1.1, libavdevice58, libboost-thread1.67.0 | libboost-thread1.71.0, libboost-filesystem1.67.0 | libboost-filesystem1.71.0, libboost-log1.67.0 | libboost-log1.71.0, libpulse0, libopus0, libxcb-shm0, libxcb-xfixes0, libxtst6, libevdev2, libdrm2, libcap2 Description: Gamestream host for Moonlight EOF From c5a356f3e7c91ee70d98678ec11012571c6f3409 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sat, 25 Sep 2021 22:31:39 +0200 Subject: [PATCH 20/27] Fix compilation on ubuntu 20.04 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2eefcdac..53476445 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ find_package(OpenSSL REQUIRED) set(Boost_USE_STATIC_LIBS ON) find_package(Boost COMPONENTS log filesystem REQUIRED) -list(APPEND SUNSHINE_COMPILE_OPTIONS -fPIC -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare) +list(APPEND SUNSHINE_COMPILE_OPTIONS -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare) if(WIN32) file( From 57c79458474c353d37c76fa2ae60bbb9e5aa4f4d Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 26 Sep 2021 00:18:49 +0200 Subject: [PATCH 21/27] Fix typo in README --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 48d0ebaf..e9532d00 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,6 @@ This is for wlroots based compositores, such as Sway ``` sudo apt install libwayland-dev ``` -#### Warning: -You might require ffmpeg version >= 4.3. Check the troubleshooting section for more information. - -### Compilation: #### Cuda + NvFBC This requires proprietary software @@ -49,6 +45,9 @@ This requires proprietary software sudo apt install nvidia-cuda-dev nvidia-cuda-toolkit ``` +#### Warning: +You might require ffmpeg version >= 4.3. Check the troubleshooting section for more information. + ### Compilation: - `git clone https://github.com/loki-47-6F-64/sunshine.git --recurse-submodules` - `cd sunshine && mkdir build && cd build` From e2fb02323c73c0de469ff6704e371c045a584362 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 26 Sep 2021 10:57:43 +0200 Subject: [PATCH 22/27] Attempt to fix ubuntu 20.04 build --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c87c7844..62d8b435 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -350,12 +350,17 @@ list(APPEND SUNSHINE_EXTERNAL_LIBRARIES ${OPENSSL_LIBRARIES} ${PLATFORM_LIBRARIES}) +add_compile_options("$<$:${SUNSHINE_COMPILE_OPTIONS}>") +add_compile_options("$<$:${SUNSHINE_COMPILE_OPTIONS}>") + +foreach(flag IN LISTS SUNSHINE_COMPILE_OPTIONS) + add_compile_options($<$:-Xcompiler=${flag}>) +endforeach() + list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_ASSETS_DIR="${SUNSHINE_ASSETS_DIR}") list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_CONFIG_DIR="${SUNSHINE_CONFIG_DIR}") list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_DEFAULT_DIR="${SUNSHINE_DEFAULT_DIR}") add_executable(sunshine ${SUNSHINE_TARGET_FILES}) target_link_libraries(sunshine ${SUNSHINE_EXTERNAL_LIBRARIES}) target_compile_definitions(sunshine PUBLIC ${SUNSHINE_DEFINITIONS}) -set_target_properties(sunshine PROPERTIES CXX_STANDARD 17) - -target_compile_options(sunshine PRIVATE ${SUNSHINE_COMPILE_OPTIONS}) +set_target_properties(sunshine PROPERTIES CXX_STANDARD 17) \ No newline at end of file From 60e3538adcbb2a15501f61f078190935e727952c Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 26 Sep 2021 11:39:36 +0200 Subject: [PATCH 23/27] Attempt to fix ubuntu 20.04 build, again --- CMakeLists.txt | 5 ++++- sunshine/platform/linux/cuda.cpp | 9 ++++---- sunshine/platform/linux/cuda.cu | 37 ++++++++++++-------------------- sunshine/platform/linux/cuda.h | 6 +++--- 4 files changed, 26 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62d8b435..c96b1bbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -351,11 +351,14 @@ list(APPEND SUNSHINE_EXTERNAL_LIBRARIES ${PLATFORM_LIBRARIES}) add_compile_options("$<$:${SUNSHINE_COMPILE_OPTIONS}>") +add_compile_options("$<$:-std=c++17>") add_compile_options("$<$:${SUNSHINE_COMPILE_OPTIONS}>") foreach(flag IN LISTS SUNSHINE_COMPILE_OPTIONS) add_compile_options($<$:-Xcompiler=${flag}>) endforeach() +add_compile_options($<$:-Xcompiler=-std=c++14>) +add_compile_options($<$:-std=c++14>) list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_ASSETS_DIR="${SUNSHINE_ASSETS_DIR}") list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_CONFIG_DIR="${SUNSHINE_CONFIG_DIR}") @@ -363,4 +366,4 @@ list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_DEFAULT_DIR="${SUNSHINE_DEFAULT_DIR}") add_executable(sunshine ${SUNSHINE_TARGET_FILES}) target_link_libraries(sunshine ${SUNSHINE_EXTERNAL_LIBRARIES}) target_compile_definitions(sunshine PUBLIC ${SUNSHINE_DEFINITIONS}) -set_target_properties(sunshine PROPERTIES CXX_STANDARD 17) \ No newline at end of file +# set_target_properties(sunshine PROPERTIES CXX_STANDARD 17) \ No newline at end of file diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index b15104a5..8a25eb27 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -29,7 +30,7 @@ namespace cuda { constexpr auto cudaDevAttrMaxThreadsPerBlock = (CUdevice_attribute)1; constexpr auto cudaDevAttrMaxThreadsPerMultiProcessor = (CUdevice_attribute)39; -void pass_error(const std::string_view &sv, const char *name, const char *description) { +void pass_error(const std::string &sv, const char *name, const char *description) { BOOST_LOG(error) << sv << name << ':' << description; } @@ -276,7 +277,7 @@ public: return *this; } - static std::optional make() { + static std::unique_ptr make() { NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER }; handle_t handle; @@ -284,12 +285,12 @@ public: if(status) { BOOST_LOG(error) << "Failed to create session: "sv << handle.last_error(); - return std::nullopt; + return nullptr; } handle.handle_flags[SESSION_HANDLE] = true; - return std::move(handle); + return std::make_unique(std::move(handle)); } const char *last_error() { diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index 49f088f8..e93f7d9f 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -1,15 +1,11 @@ -// #include +#include #include #include #include -#include -#include #include "cuda.h" -using namespace std::literals; - -#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv +#define SUNSHINE_STRINGVIEW_HELPER(x) x #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x) #define CU_CHECK(x, y) \ @@ -21,14 +17,9 @@ using namespace std::literals; #define CU_CHECK_PTR(x, y) \ if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr; -#define CU_CHECK_OPT(x, y) \ - if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return std::nullopt; - #define CU_CHECK_IGNORE(x, y) \ check((x), SUNSHINE_STRINGVIEW(y ": ")) -using namespace std::literals; - //////////////////// Special desclarations /** * NVCC segfaults when including @@ -83,8 +74,8 @@ inline T div_align(T l, T r) { return (l + r - 1) / r; } -void pass_error(const std::string_view &sv, const char *name, const char *description); -inline static int check(cudaError_t result, const std::string_view &sv) { +void pass_error(const std::string &sv, const char *name, const char *description); +inline static int check(cudaError_t result, const std::string &sv) { if(result) { auto name = cudaGetErrorName(result); auto description = cudaGetErrorString(result); @@ -174,11 +165,11 @@ int tex_t::copy(std::uint8_t *src, int height, int pitch) { return 0; } -std::optional tex_t::make(int height, int pitch) { +std::unique_ptr tex_t::make(int height, int pitch) { tex_t tex; auto format = cudaCreateChannelDesc(); - CU_CHECK_OPT(cudaMallocArray(&tex.array, &format, pitch, height, cudaArrayDefault), "Couldn't allocate cuda array"); + CU_CHECK_PTR(cudaMallocArray(&tex.array, &format, pitch, height, cudaArrayDefault), "Couldn't allocate cuda array"); cudaResourceDesc res {}; res.resType = cudaResourceTypeArray; @@ -192,13 +183,13 @@ std::optional tex_t::make(int height, int pitch) { std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp); - CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture.point, &res, &desc, nullptr), "Couldn't create cuda texture that uses point interpolation"); + CU_CHECK_PTR(cudaCreateTextureObject(&tex.texture.point, &res, &desc, nullptr), "Couldn't create cuda texture that uses point interpolation"); desc.filterMode = cudaFilterModeLinear; - CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture.linear, &res, &desc, nullptr), "Couldn't create cuda texture that uses linear interpolation"); + CU_CHECK_PTR(cudaCreateTextureObject(&tex.texture.linear, &res, &desc, nullptr), "Couldn't create cuda texture that uses linear interpolation"); - return std::move(tex); + return std::make_unique(std::move(tex)); } tex_t::tex_t() : array {}, texture { INVALID_TEXTURE } {} @@ -255,18 +246,18 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit scale = 1.0f / scalar; } -std::optional sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) { +std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) { cudaDeviceProp props; int device; - CU_CHECK_OPT(cudaGetDevice(&device), "Couldn't get cuda device"); - CU_CHECK_OPT(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties"); + CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device"); + CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties"); auto ptr = make_ptr(); if(!ptr) { - return std::nullopt; + return nullptr; } - return std::make_optional(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr)); + return std::make_unique(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr)); } int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) { diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index 5811379f..e46b4759 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -3,7 +3,7 @@ #include #include -#include +#include namespace platf { class hwdevice_t; @@ -42,7 +42,7 @@ struct viewport_t { class tex_t { public: - static std::optional make(int height, int pitch); + static std::unique_ptr make(int height, int pitch); tex_t(); tex_t(tex_t &&); @@ -72,7 +72,7 @@ public: * * pitch -- The size of a single row of pixels in bytes */ - static std::optional make(int in_width, int in_height, int out_width, int out_height, int pitch); + static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height, int pitch); // Converts loaded image into a CUDevicePtr int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture); From 847d7b6980d19ef1197bdd4106b4ae9268e1abdc Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 26 Sep 2021 23:45:44 +0200 Subject: [PATCH 24/27] Fix minor error in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e9532d00..b3160832 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,11 @@ Install the following: #### Common ``` -sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev +sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libevdev-dev ``` #### X11 ``` -sudo apt install libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev +sudo apt install libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev ``` #### KMS From 4177b020647131be59d85d8b19c8e3143e820b36 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 27 Sep 2021 17:58:35 +0200 Subject: [PATCH 25/27] Allow cuda kernel to run in parallell --- sunshine/platform/linux/cuda.cpp | 70 +++++++++++++++++++++++++------- sunshine/platform/linux/cuda.cu | 25 ++++++++++-- sunshine/platform/linux/cuda.h | 25 ++++++++---- sunshine/platform/linux/misc.cpp | 4 -- 4 files changed, 95 insertions(+), 29 deletions(-) diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index 8a25eb27..b96dcf17 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -57,6 +57,10 @@ inline static int check(CUresult result, const std::string_view &sv) { return 0; } +void freeStream(CUstream stream) { + CU_CHECK_IGNORE(cdf->cuStreamDestroy(stream), "Couldn't destroy cuda stream"); +} + class img_t : public platf::img_t { public: tex_t tex; @@ -95,7 +99,8 @@ public: this->hwframe.reset(frame); this->frame = frame; - if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) { + auto hwframe_ctx = (AVHWFramesContext *)frame->hw_frames_ctx->data; + if(hwframe_ctx->sw_format != AV_PIX_FMT_NV12) { BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv; return -1; } @@ -106,6 +111,15 @@ public: return -1; } + auto cuda_ctx = (AVCUDADeviceContext *)hwframe_ctx->device_ctx->hwctx; + + stream = make_stream(); + if(!stream) { + return -1; + } + + cuda_ctx->stream = stream.get(); + auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4); if(!sws_opt) { return -1; @@ -143,13 +157,14 @@ public: return; } - sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 }); + sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), { frame->width, frame->height, 0, 0 }); } cudaTextureObject_t tex_obj(const tex_t &tex) const { return linear_interpolation ? tex.texture.linear : tex.texture.point; } + stream_t stream; frame_t hwframe; int width, height; @@ -163,7 +178,7 @@ public: class cuda_ram_t : public cuda_t { public: int convert(platf::img_t &img) override { - return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex)); + return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get()); } int set_frame(AVFrame *frame) { @@ -187,7 +202,7 @@ public: class cuda_vram_t : public cuda_t { public: int convert(platf::img_t &img) override { - return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex)); + return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex), stream.get()); } }; @@ -257,6 +272,28 @@ int init() { return 0; } +class ctx_t { +public: + ctx_t(NVFBC_SESSION_HANDLE handle) { + NVFBC_BIND_CONTEXT_PARAMS params { NVFBC_BIND_CONTEXT_PARAMS_VER }; + + if(func.nvFBCBindContext(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't bind NvFBC context to current thread: " << func.nvFBCGetLastErrorStr(handle); + } + + this->handle = handle; + } + + ~ctx_t() { + NVFBC_RELEASE_CONTEXT_PARAMS params { NVFBC_RELEASE_CONTEXT_PARAMS_VER }; + if(func.nvFBCReleaseContext(handle, ¶ms)) { + BOOST_LOG(error) << "Couldn't release NvFBC context from current thread: " << func.nvFBCGetLastErrorStr(handle); + } + } + + NVFBC_SESSION_HANDLE handle; +}; + class handle_t { enum flag_e { SESSION_HANDLE, @@ -348,24 +385,26 @@ public: return 0; } - ~handle_t() { + int reset() { if(!handle_flags[SESSION_HANDLE]) { - return; + return 0; } - if(handle_flags[SESSION_CAPTURE]) { - NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER }; - - if(func.nvFBCDestroyCaptureSession(handle, ¶ms)) { - BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle); - } - } + stop(); NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER }; if(func.nvFBCDestroyHandle(handle, ¶ms)) { BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle); } + + handle_flags[SESSION_HANDLE] = false; + + return 0; + } + + ~handle_t() { + reset(); } std::bitset handle_flags; @@ -381,6 +420,8 @@ public: return -1; } + ctx_t ctx { handle->handle }; + auto status_params = handle->status(); if(!status_params) { return -1; @@ -443,8 +484,9 @@ public: // Force display_t::capture to initialize handle_t::capture cursor_visible = !*cursor; + ctx_t ctx { handle.handle }; auto fg = util::fail_guard([&]() { - handle.stop(); + handle.reset(); }); while(img) { diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu index e93f7d9f..acf2d76d 100644 --- a/sunshine/platform/linux/cuda.cu +++ b/sunshine/platform/linux/cuda.cu @@ -101,6 +101,23 @@ void freeCudaPtr_t::operator()(void *ptr) { CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer"); } +void freeCudaStream_t::operator()(cudaStream_t ptr) { + CU_CHECK_IGNORE(cudaStreamDestroy(ptr), "Couldn't free cuda stream"); +} + +stream_t make_stream(int flags) { + cudaStream_t stream; + + if(!flags) { + CU_CHECK_PTR(cudaStreamCreate(&stream), "Couldn't create cuda stream"); + } + else { + CU_CHECK_PTR(cudaStreamCreateWithFlags(&stream, flags), "Couldn't create cuda stream with flags"); + } + + return stream_t { stream }; +} + inline __device__ float3 bgra_to_rgb(uchar4 vec) { return make_float3((float)vec.z, (float)vec.y, (float)vec.x); } @@ -260,18 +277,18 @@ std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, i return std::make_unique(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr)); } -int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) { - return convert(Y, UV, pitchY, pitchUV, texture, viewport); +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream) { + return convert(Y, UV, pitchY, pitchUV, texture, stream, viewport); } -int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) { +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport) { int threadsX = viewport.width / 2; int threadsY = viewport.height; dim3 block(threadsPerBlock); dim3 grid(div_align(threadsX, threadsPerBlock), threadsY); - RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get()); + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get()); return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); } diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index e46b4759..7e81ae99 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -1,16 +1,17 @@ #if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA) #define SUNSHINE_PLATFORM_CUDA_H -#include #include #include +#include namespace platf { - class hwdevice_t; - class img_t; -} +class hwdevice_t; +class img_t; +} // namespace platf namespace cuda { + namespace nvfbc { std::vector display_names(); } @@ -21,8 +22,10 @@ int init(); typedef struct cudaArray *cudaArray_t; #if !defined(__CUDACC__) +typedef struct CUstream_st *cudaStream_t; typedef unsigned long long cudaTextureObject_t; #else /* defined(__CUDACC__) */ +typedef __location__(device_builtin) struct CUstream_st *cudaStream_t; typedef __location__(device_builtin) unsigned long long cudaTextureObject_t; #endif /* !defined(__CUDACC__) */ @@ -33,7 +36,15 @@ public: void operator()(void *ptr); }; -using ptr_t = std::unique_ptr; +class freeCudaStream_t { +public: + void operator()(cudaStream_t ptr); +}; + +using ptr_t = std::unique_ptr; +using stream_t = std::unique_ptr; + +stream_t make_stream(int flags = 0); struct viewport_t { int width, height; @@ -75,8 +86,8 @@ public: static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height, int pitch); // Converts loaded image into a CUDevicePtr - int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture); - int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream); + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport); void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range); diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp index dd114ec4..6d7643c0 100644 --- a/sunshine/platform/linux/misc.cpp +++ b/sunshine/platform/linux/misc.cpp @@ -264,13 +264,11 @@ std::unique_ptr init() { #endif #ifdef SUNSHINE_BUILD_CUDA if(verify_nvfbc()) { - BOOST_LOG(info) << "Using NvFBC for screencasting"sv; sources[source::NVFBC] = true; } #endif #ifdef SUNSHINE_BUILD_WAYLAND if(verify_wl()) { - BOOST_LOG(info) << "Using Wayland for screencasting"sv; sources[source::WAYLAND] = true; } #endif @@ -282,13 +280,11 @@ std::unique_ptr init() { display_cursor = false; } - BOOST_LOG(info) << "Using KMS for screencasting"sv; sources[source::KMS] = true; } #endif #ifdef SUNSHINE_BUILD_X11 if(verify_x11()) { - BOOST_LOG(info) << "Using X11 for screencasting"sv; sources[source::X11] = true; } #endif From 1f7bdb1b2a19444b5f9394f237a191cb6cd4dc6f Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 27 Sep 2021 19:35:06 +0200 Subject: [PATCH 26/27] Fix segfault when multiple controllers connected --- sunshine/platform/linux/input.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sunshine/platform/linux/input.cpp b/sunshine/platform/linux/input.cpp index a0d18578..fbdc8133 100644 --- a/sunshine/platform/linux/input.cpp +++ b/sunshine/platform/linux/input.cpp @@ -696,7 +696,13 @@ public: }; inline void rumbleIterate(std::vector &effects, std::vector &polls, std::chrono::milliseconds to) { - auto res = poll(&polls.data()->el, polls.size(), to.count()); + std::vector polls_tmp; + polls_tmp.reserve(polls.size()); + for(auto &poll : polls) { + polls_tmp.emplace_back(poll.el); + } + + auto res = poll(polls_tmp.data(), polls.size(), to.count()); // If timed out if(!res) { @@ -871,7 +877,7 @@ void broadcastRumble(safe::queue_t &rumble_queue_queue) { } if(polls.empty()) { - std::this_thread::sleep_for(50ms); + std::this_thread::sleep_for(250ms); } else { rumbleIterate(effects, polls, 100ms); From e7cbfb3ee92ae65064936f4938702e2c02b5beb0 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Mon, 27 Sep 2021 19:54:32 +0200 Subject: [PATCH 27/27] Fix dependencies for debian bullseye --- gen-deb.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen-deb.in b/gen-deb.in index 70da08d2..3cb64ea0 100755 --- a/gen-deb.in +++ b/gen-deb.in @@ -38,7 +38,7 @@ Architecture: amd64 Maintainer: @loki Priority: optional Version: 0.11.0 -Depends: libssl1.1, libavdevice58, libboost-thread1.67.0 | libboost-thread1.71.0, libboost-filesystem1.67.0 | libboost-filesystem1.71.0, libboost-log1.67.0 | libboost-log1.71.0, libpulse0, libopus0, libxcb-shm0, libxcb-xfixes0, libxtst6, libevdev2, libdrm2, libcap2 +Depends: libssl1.1, libavdevice58, libboost-thread1.67.0 | libboost-thread1.71.0 | libboost-thread1.74.0, libboost-filesystem1.67.0 | libboost-filesystem1.71.0 | libboost-filesystem1.74.0, libboost-log1.67.0 | libboost-log1.71.0 | libboost-log1.74.0, libpulse0, libopus0, libxcb-shm0, libxcb-xfixes0, libxtst6, libevdev2, libdrm2, libcap2 Description: Gamestream host for Moonlight EOF