diff --git a/.gitmodules b/.gitmodules index 153d2de8..39650e86 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third-party/miniupnp"] path = third-party/miniupnp url = https://github.com/miniupnp/miniupnp +[submodule "third-party/nv-codec-headers"] + path = third-party/nv-codec-headers + url = https://github.com/FFmpeg/nv-codec-headers diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ff42e7a..1b04f96f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,6 +188,8 @@ else() sunshine/platform/linux/publish.cpp sunshine/platform/linux/vaapi.h sunshine/platform/linux/vaapi.cpp + sunshine/platform/linux/cuda.cpp + sunshine/platform/linux/cuda.h sunshine/platform/linux/graphics.h sunshine/platform/linux/graphics.cpp sunshine/platform/linux/misc.h @@ -212,6 +214,7 @@ else() include_directories( /usr/include/libevdev-1.0 + third-party/nv-codec-headers/include third-party/glad/include) if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH) diff --git a/sunshine/main.cpp b/sunshine/main.cpp index b2bed72f..e1f07bf5 100644 --- a/sunshine/main.cpp +++ b/sunshine/main.cpp @@ -26,6 +26,8 @@ #include "upnp.h" #include "video.h" +#include "platform/linux/cuda.h" + #include "platform/common.h" extern "C" { #include diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp new file mode 100644 index 00000000..5c79acde --- /dev/null +++ b/sunshine/platform/linux/cuda.cpp @@ -0,0 +1,289 @@ +#include "cuda.h" +#include "graphics.h" +#include "sunshine/main.h" +#include "sunshine/utility.h" +#include "wayland.h" +#include "x11grab.h" +#include + +extern "C" { +#include +#include +#include +} + +#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv +#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x) + +#define CU_CHECK(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1 + +#define CU_CHECK_IGNORE(x, y) \ + check((x), SUNSHINE_STRINGVIEW(y ": ")) + +using namespace std::literals; +namespace cuda { +void cff(CudaFunctions *cf) { + cuda_free_functions(&cf); +} + +using cdf_t = util::safe_ptr; + +static cdf_t cdf; + +inline static int check(CUresult result, const std::string_view &sv) { + if(result != CUDA_SUCCESS) { + const char *name; + const char *description; + + cdf->cuGetErrorName(result, &name); + cdf->cuGetErrorString(result, &description); + + BOOST_LOG(error) << sv << name << ':' << description; + return -1; + } + + return 0; +} + +class ctx_t { +public: + ctx_t(CUcontext ctx) { + CU_CHECK_IGNORE(cdf->cuCtxPushCurrent(ctx), "Couldn't push cuda context"); + } + + ~ctx_t() { + CUcontext dummy; + + CU_CHECK_IGNORE(cdf->cuCtxPopCurrent(&dummy), "Couldn't pop cuda context"); + } +}; + +void free_res(CUgraphicsResource res) { + cdf->cuGraphicsUnregisterResource(res); +} + +using res_internal_t = util::safe_ptr; + +template +class res_t { +public: + res_t() : resources {}, mapped { false } {} + res_t(res_t &&other) noexcept : resources { other.resources }, array_p { other.array_p }, ctx { other.ctx }, stream { other.stream } { + other.resources = std::array {}; + } + + res_t &operator=(res_t &&other) { + for(auto x = 0; x < N; ++x) { + std::swap(resources[x], other.resources[x]); + std::swap(array_p[x], other.array_p[x]); + } + + std::swap(ctx, other.ctx); + std::swap(stream, other.stream); + std::swap(mapped, other.mapped); + + return *this; + } + + res_t(CUcontext ctx, CUstream stream) : resources {}, ctx { ctx }, stream { stream }, mapped { false } {} + + int bind(gl::tex_t &tex) { + ctx_t ctx { this->ctx }; + + CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[0], tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y image"); + CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[1], tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register uv image"); + + return 0; + } + + int map() { + ctx_t ctx { this->ctx }; + + CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream), "Coudn't map cuda resources"); + + mapped = true; + + CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[0], resources[0], 0, 0), "Couldn't get mapped subresource [0]"); + CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[1], resources[1], 0, 0), "Couldn't get mapped subresource [1]"); + + return 0; + } + + void unmap() { + // Either all or none are mapped + if(mapped) { + ctx_t ctx { this->ctx }; + + CU_CHECK_IGNORE(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream), "Couldn't unmap cuda resources"); + + mapped = false; + } + } + + inline CUarray &operator[](std::size_t index) { + return array_p[index]; + } + + ~res_t() { + unmap(); + } + + std::array resources; + std::array array_p; + + CUcontext ctx; + CUstream stream; + + bool mapped; +}; + +int init() { + auto status = cuda_load_functions(&cdf, nullptr); + if(status) { + BOOST_LOG(error) << "Couldn't load cuda: "sv << status; + + return -1; + } + + CU_CHECK(cdf->cuInit(0), "Couldn't initialize cuda"); + + return 0; +} + +class cuda_t : public platf::hwdevice_t { +public: + int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) { + if(!cdf) { + BOOST_LOG(warning) << "cuda not initialized"sv; + return -1; + } + + this->data = (void *)0x1; + + display = egl::make_display(xdisplay); + if(!display) { + return -1; + } + + auto ctx_opt = egl::make_ctx(display.get()); + if(!ctx_opt) { + return -1; + } + + ctx = std::move(*ctx_opt); + + width = in_width; + height = in_height; + + return 0; + } + + int set_frame(AVFrame *frame) override { + auto cuda_ctx = (AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx; + + tex = gl::tex_t::make(2); + fb = gl::frame_buf_t::make(2); + + gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]); + gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RED, frame->width, frame->height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); + gl::ctx.BindTexture(GL_TEXTURE_2D, tex[1]); + gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RG, frame->width / 2, frame->height / 2, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr); + gl::ctx.BindTexture(GL_TEXTURE_2D, 0); + + fb.bind(std::begin(tex), std::end(tex)); + + res = res_t<2> { cuda_ctx->cuda_ctx, cuda_ctx->stream }; + + if(res.bind(tex)) { + return -1; + } + + this->hwframe.reset(frame); + this->frame = frame; + + if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) { + BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv; + + return -1; + } + + auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height); + if(!sws_opt) { + return -1; + } + + this->sws = std::move(*sws_opt); + + return 0; + } + + int convert(platf::img_t &img) override { + sws.load_ram(img); + + if(sws.convert(fb)) { + return -1; + } + + if(res.map()) { + return -1; + } + + // Push and pop cuda context + ctx_t ctx { res.ctx }; + for(auto x = 0; x < 2; ++x) { + CUDA_MEMCPY2D desc {}; + + auto shift = x; + + desc.srcPitch = frame->width; + desc.dstPitch = frame->linesize[x]; + desc.Height = frame->height >> shift; + desc.WidthInBytes = std::min(desc.srcPitch, desc.dstPitch); + + desc.srcMemoryType = CU_MEMORYTYPE_ARRAY; + desc.dstMemoryType = CU_MEMORYTYPE_DEVICE; + + desc.srcArray = res[x]; + desc.dstDevice = (CUdeviceptr)frame->data[x]; + + CU_CHECK(cdf->cuMemcpy2DAsync(&desc, res.stream), "Couldn't copy from OpenGL to cuda"); + } + + res.unmap(); + + return 0; + } + + void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override { + sws.set_colorspace(colorspace, color_range); + } + + frame_t hwframe; + + egl::display_t display; + egl::ctx_t ctx; + + egl::sws_t sws; + + gl::tex_t tex; + gl::frame_buf_t fb; + + res_t<2> res; + + int width, height; +}; + +std::shared_ptr make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) { + if(init()) { + return nullptr; + } + + auto cuda = std::make_shared(); + if(cuda->init(width, height, xdisplay)) { + return nullptr; + } + + return cuda; +} +} // namespace cuda diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h new file mode 100644 index 00000000..a56c961b --- /dev/null +++ b/sunshine/platform/linux/cuda.h @@ -0,0 +1,12 @@ +#ifndef SUNSHINE_PLATFORM_CUDA_H +#define SUNSHINE_PLATFORM_CUDA_H + +#include "sunshine/platform/common.h" +#include "x11grab.h" + +namespace cuda { +std::shared_ptr make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay); +int init(); +} // namespace cuda + +#endif \ No newline at end of file diff --git a/sunshine/platform/linux/graphics.cpp b/sunshine/platform/linux/graphics.cpp index bbf75324..3e0594b8 100644 --- a/sunshine/platform/linux/graphics.cpp +++ b/sunshine/platform/linux/graphics.cpp @@ -313,19 +313,30 @@ bool fail() { return eglGetError() != EGL_SUCCESS; } -display_t make_display(util::Either native_display) { +display_t make_display(std::variant native_display) { constexpr auto EGL_PLATFORM_GBM_MESA = 0x31D7; constexpr auto EGL_PLATFORM_WAYLAND_KHR = 0x31D8; + constexpr auto EGL_PLATFORM_X11_KHR = 0x31D5; int egl_platform; void *native_display_p; - if(native_display.has_left()) { + + switch(native_display.index()) { + case 0: egl_platform = EGL_PLATFORM_GBM_MESA; - native_display_p = native_display.left(); - } - else { + native_display_p = std::get<0>(native_display); + break; + case 1: egl_platform = EGL_PLATFORM_WAYLAND_KHR; - native_display_p = native_display.right(); + native_display_p = std::get<1>(native_display); + break; + case 2: + egl_platform = EGL_PLATFORM_X11_KHR; + native_display_p = std::get<2>(native_display); + break; + default: + BOOST_LOG(error) << "egl::make_display(): Index ["sv << native_display.index() << "] not implemented"sv; + return nullptr; } // native_display.left() equals native_display.right() @@ -803,7 +814,7 @@ void sws_t::load_vram(img_descriptor_t &img, int offset_x, int offset_y, int tex } } -int sws_t::convert(nv12_t &nv12) { +int sws_t::convert(gl::frame_buf_t &fb) { gl::ctx.BindTexture(GL_TEXTURE_2D, loaded_texture); GLenum attachments[] { @@ -812,7 +823,7 @@ int sws_t::convert(nv12_t &nv12) { }; for(int x = 0; x < sizeof(attachments) / sizeof(decltype(attachments[0])); ++x) { - gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, nv12->buf[x]); + gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, fb[x]); gl::ctx.DrawBuffers(1, &attachments[x]); #ifndef NDEBUG diff --git a/sunshine/platform/linux/graphics.h b/sunshine/platform/linux/graphics.h index 02de4d93..5599d9a4 100644 --- a/sunshine/platform/linux/graphics.h +++ b/sunshine/platform/linux/graphics.h @@ -19,6 +19,9 @@ extern "C" int close(int __fd); +// X11 Display +extern "C" struct _XDisplay; + struct AVFrame; void free_frame(AVFrame *frame); @@ -227,7 +230,7 @@ struct surface_descriptor_t { std::uint32_t offsets[4]; }; -display_t make_display(util::Either native_display); +display_t make_display(std::variant native_display); std::optional make_ctx(display_t::pointer display); std::optional import_source( @@ -276,7 +279,8 @@ public: static std::optional make(int in_width, int in_height, int out_width, int out_heigth, gl::tex_t &&tex); static std::optional make(int in_width, int in_height, int out_width, int out_heigth); - int convert(nv12_t &nv12); + // Convert the loaded image into the first two framebuffers + int convert(gl::frame_buf_t &fb); void load_ram(platf::img_t &img); void load_vram(img_descriptor_t &img, int offset_x, int offset_y, int texture); diff --git a/sunshine/platform/linux/vaapi.cpp b/sunshine/platform/linux/vaapi.cpp index 33646333..37a64cb8 100644 --- a/sunshine/platform/linux/vaapi.cpp +++ b/sunshine/platform/linux/vaapi.cpp @@ -3,8 +3,6 @@ #include -#include - extern "C" { #include } @@ -404,7 +402,7 @@ public: int convert(platf::img_t &img) override { sws.load_ram(img); - sws.convert(nv12); + sws.convert(nv12->buf); return 0; } }; @@ -430,7 +428,7 @@ public: sws.load_vram(descriptor, offset_x, offset_y, rgb->tex[0]); - sws.convert(nv12); + sws.convert(nv12->buf); return 0; } diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp index 33797f24..24a87900 100644 --- a/sunshine/platform/linux/x11grab.cpp +++ b/sunshine/platform/linux/x11grab.cpp @@ -24,6 +24,7 @@ #include "misc.h" #include "vaapi.h" #include "x11grab.h" +#include "cuda.h" using namespace std::literals; @@ -259,9 +260,8 @@ void freeX(XFixesCursorImage *); using xcb_connect_t = util::dyn_safe_ptr; using xcb_img_t = util::c_ptr; -using xdisplay_t = util::dyn_safe_ptr_v2; -using ximg_t = util::safe_ptr; -using xcursor_t = util::safe_ptr; +using ximg_t = util::safe_ptr; +using xcursor_t = util::safe_ptr; using crtc_info_t = util::dyn_safe_ptr<_XRRCrtcInfo, &x11::rr::FreeCrtcInfo>; using output_info_t = util::dyn_safe_ptr<_XRROutputInfo, &x11::rr::FreeOutputInfo>; @@ -366,7 +366,7 @@ static void blend_cursor(Display *display, img_t &img, int offsetX, int offsetY) struct x11_attr_t : public display_t { std::chrono::nanoseconds delay; - xdisplay_t xdisplay; + x11::xdisplay_t xdisplay; Window xwindow; XWindowAttributes xattr; @@ -516,6 +516,10 @@ struct x11_attr_t : public display_t { return va::make_hwdevice(width, height, false); } + if(mem_type == mem_type_e::cuda) { + return cuda::make_hwdevice(width, height, xdisplay.get()); + } + return std::make_shared(); } @@ -526,7 +530,7 @@ struct x11_attr_t : public display_t { }; struct shm_attr_t : public x11_attr_t { - xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay + x11::xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay xcb_connect_t xcb; xcb_screen_t *display; std::uint32_t seg; @@ -713,7 +717,7 @@ std::vector x11_display_names() { BOOST_LOG(info) << "Detecting connected monitors"sv; - xdisplay_t xdisplay { x11::OpenDisplay(nullptr) }; + x11::xdisplay_t xdisplay { x11::OpenDisplay(nullptr) }; if(!xdisplay) { return {}; } @@ -807,8 +811,16 @@ void cursor_t::blend(img_t &img, int offsetX, int offsetY) { blend_cursor((xdisplay_t::pointer)ctx.get(), img, offsetX, offsetY); } +xdisplay_t make_display() { + return OpenDisplay(nullptr); +} + +void freeDisplay(_XDisplay *xdisplay) { + CloseDisplay(xdisplay); +} + void freeCursorCtx(cursor_ctx_t::pointer ctx) { - x11::CloseDisplay((xdisplay_t::pointer)ctx); + CloseDisplay((xdisplay_t::pointer)ctx); } } // namespace x11 } // namespace platf diff --git a/sunshine/platform/linux/x11grab.h b/sunshine/platform/linux/x11grab.h index 1440ae76..3d2868c8 100644 --- a/sunshine/platform/linux/x11grab.h +++ b/sunshine/platform/linux/x11grab.h @@ -6,6 +6,9 @@ #include "sunshine/platform/common.h" #include "sunshine/utility.h" +// X11 Display +extern "C" struct _XDisplay; + namespace egl { class cursor_t; } @@ -15,8 +18,10 @@ namespace platf::x11 { #ifdef SUNSHINE_BUILD_X11 struct cursor_ctx_raw_t; void freeCursorCtx(cursor_ctx_raw_t *ctx); +void freeDisplay(_XDisplay *xdisplay); using cursor_ctx_t = util::safe_ptr; +using xdisplay_t = util::safe_ptr<_XDisplay, freeDisplay>; class cursor_t { public: @@ -34,7 +39,12 @@ public: cursor_ctx_t ctx; }; + +xdisplay_t make_display(); #else +// It's never something different from nullptr +util::safe_ptr<_XDisplay, std::default_delete<_XDisplay>>; + class cursor_t { public: static std::optional make() { return std::nullopt; } @@ -42,6 +52,8 @@ public: void capture(egl::cursor_t &) {} void blend(img_t &, int, int) {} }; + +xdisplay_t make_display() { return nullptr; } #endif } // namespace platf::x11 diff --git a/sunshine/utility.h b/sunshine/utility.h index 90435696..0e585e21 100644 --- a/sunshine/utility.h +++ b/sunshine/utility.h @@ -64,8 +64,7 @@ struct argument_type { typedef U type; }; #define KITTY_DEFAULT_CONSTR_MOVE(x) \ x(x &&) noexcept = default; \ - x &operator=(x &&) noexcept = default; \ - x() = default; + x &operator=(x &&) noexcept = default; #define KITTY_DEFAULT_CONSTR_MOVE_THROW(x) \ x(x &&) = default; \ @@ -415,9 +414,9 @@ inline std::int64_t from_view(const std::string_view &number) { } template -class Either : public std::variant { +class Either : public std::variant { public: - using std::variant::variant; + using std::variant::variant; constexpr bool has_left() const { return std::holds_alternative(*this); diff --git a/sunshine/video.cpp b/sunshine/video.cpp index 261d86bc..7205aa07 100644 --- a/sunshine/video.cpp +++ b/sunshine/video.cpp @@ -409,13 +409,11 @@ static encoder_t nvenc { #ifdef _WIN32 AV_HWDEVICE_TYPE_D3D11VA, AV_PIX_FMT_D3D11, - AV_PIX_FMT_NV12, AV_PIX_FMT_P010, #else AV_HWDEVICE_TYPE_CUDA, AV_PIX_FMT_CUDA, - // Fully planar YUV formats are more efficient for sws_scale() - AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, #endif + AV_PIX_FMT_NV12, AV_PIX_FMT_P010, { { { "forced-idr"s, 1 }, diff --git a/third-party/nv-codec-headers b/third-party/nv-codec-headers new file mode 160000 index 00000000..b641a195 --- /dev/null +++ b/third-party/nv-codec-headers @@ -0,0 +1 @@ +Subproject commit b641a195edbe3ac9788e681e22c2e2fad8aacddb