Convert images on the GPU on Linux for NVidia cards

2025-01-27 12:35:25 +00:00 · 2021-09-14 11:58:21 +02:00 · 2021-09-14 11:58:21 +02:00 · 9982ae4675
commit 9982ae4675
parent 1a7ed53559
13 changed files with 372 additions and 28 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -10,3 +10,6 @@
 [submodule "third-party/miniupnp"]
 	path = third-party/miniupnp
 	url = https://github.com/miniupnp/miniupnp
+[submodule "third-party/nv-codec-headers"]
+	path = third-party/nv-codec-headers
+	url = https://github.com/FFmpeg/nv-codec-headers
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -188,6 +188,8 @@ else()
 		sunshine/platform/linux/publish.cpp
 		sunshine/platform/linux/vaapi.h
 		sunshine/platform/linux/vaapi.cpp
+		sunshine/platform/linux/cuda.cpp
+		sunshine/platform/linux/cuda.h
 		sunshine/platform/linux/graphics.h
 		sunshine/platform/linux/graphics.cpp
 		sunshine/platform/linux/misc.h
@ -212,6 +214,7 @@ else()
 	
 	include_directories(
 		/usr/include/libevdev-1.0
+		third-party/nv-codec-headers/include
 		third-party/glad/include)

 	if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH)
--- a/sunshine/main.cpp
+++ b/sunshine/main.cpp
@ -26,6 +26,8 @@
 #include "upnp.h"
 #include "video.h"

+#include "platform/linux/cuda.h"
+
 #include "platform/common.h"
 extern "C" {
 #include <libavutil/log.h>
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@ -0,0 +1,289 @@
+#include "cuda.h"
+#include "graphics.h"
+#include "sunshine/main.h"
+#include "sunshine/utility.h"
+#include "wayland.h"
+#include "x11grab.h"
+#include <ffnvcodec/dynlink_loader.h>
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavutil/hwcontext_cuda.h>
+#include <libavutil/imgutils.h>
+}
+
+// Append the std::string_view literal suffix to a string literal.
+// Two levels are needed so that the argument is fully formed before `sv` is pasted on.
+#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
+#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
+
+// Evaluate the CUDA call `x`; on failure log the message `y` (a string literal)
+// and return -1 from the enclosing function.
+// Wrapped in do/while(0) so the macro behaves like a single statement and
+// cannot capture a following `else`.
+#define CU_CHECK(x, y)                                     \
+  do {                                                     \
+    if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1; \
+  } while(0)
+
+// Evaluate the CUDA call `x` and log the message `y` (a string literal) on failure.
+// Expands to the result of check(): 0 on success, -1 on failure.
+#define CU_CHECK_IGNORE(x, y) \
+  check((x), SUNSHINE_STRINGVIEW(y ": "))
+
+using namespace std::literals;
+namespace cuda {
+// Deleter used by cdf_t: hands the function table back to cuda_free_functions().
+void cff(CudaFunctions *cf) {
+  cuda_free_functions(&cf);
+}
+
+// Owning pointer to the table of dynamically loaded CUDA functions
+using cdf_t = util::safe_ptr<CudaFunctions, cff>;
+
+// Function table shared by everything in this file; filled in by cuda::init()
+static cdf_t cdf;
+
+/**
+ * Log a failed CUDA call.
+ *
+ * @param result the status returned by the CUDA driver call
+ * @param sv     text written in front of the error name in the log line
+ *
+ * @return 0 when result is CUDA_SUCCESS, -1 otherwise
+ */
+inline static int check(CUresult result, const std::string_view &sv) {
+  if(result == CUDA_SUCCESS) {
+    return 0;
+  }
+
+  // The lookups themselves can fail (e.g. for an unrecognised error code), in which
+  // case the pointers must not be streamed: start from nullptr and fall back to a fixed text.
+  const char *name        = nullptr;
+  const char *description = nullptr;
+
+  cdf->cuGetErrorName(result, &name);
+  cdf->cuGetErrorString(result, &description);
+
+  BOOST_LOG(error) << sv << (name ? name : "unknown error") << ':' << (description ? description : "no description available");
+  return -1;
+}
+
+// Scope guard: pushes a CUDA context on construction and pops it again on destruction.
+class ctx_t {
+public:
+  explicit ctx_t(CUcontext ctx)
+      : pushed { !CU_CHECK_IGNORE(cdf->cuCtxPushCurrent(ctx), "Couldn't push cuda context") } {}
+
+  // Copying would pop the context twice
+  ctx_t(const ctx_t &) = delete;
+  ctx_t &operator=(const ctx_t &) = delete;
+
+  ~ctx_t() {
+    // Only pop what was actually pushed, otherwise a context belonging to
+    // someone else would be removed from the stack.
+    if(!pushed) {
+      return;
+    }
+
+    CUcontext dummy;
+
+    CU_CHECK_IGNORE(cdf->cuCtxPopCurrent(&dummy), "Couldn't pop cuda context");
+  }
+
+private:
+  // True when cuCtxPushCurrent() succeeded in the constructor
+  bool pushed;
+};
+
+// Unregister a graphics resource from CUDA.
+void free_res(CUgraphicsResource res) {
+  cdf->cuGraphicsUnregisterResource(res);
+}
+
+// NOTE(review): within this file only res_internal_t::pointer is used (as the
+// element type of res_t::resources); no owning res_internal_t object is created here.
+using res_internal_t = util::safe_ptr<CUgraphicsResource_st, free_res>;
+
+/**
+ * A set of N OpenGL textures registered with CUDA, together with the CUDA
+ * context and stream used to map them.
+ *
+ * NOTE(review): bind(), map() and the error messages only handle indices 0 and 1,
+ * so in practice N is expected to be 2.
+ * NOTE(review): the registered resources are unmapped on destruction, but nothing
+ * in this class unregisters them — confirm whether that is intended.
+ */
+template<std::size_t N>
+class res_t {
+public:
+  // Every member is value-initialized so a default constructed object can be
+  // swapped and destroyed without reading indeterminate values.
+  res_t() : resources {}, array_p {}, ctx {}, stream {}, mapped { false } {}
+
+  // Takes over the resources and the mapped state, so that exactly one of the
+  // two objects unmaps them later.
+  res_t(res_t &&other) noexcept : resources { other.resources }, array_p { other.array_p }, ctx { other.ctx }, stream { other.stream }, mapped { other.mapped } {
+    other.resources = std::array<res_internal_t::pointer, N> {};
+    other.mapped    = false;
+  }
+
+  // Swaps the complete state; the previous state of *this ends up in `other`
+  // and is handled by its destructor.
+  res_t &operator=(res_t &&other) noexcept {
+    for(std::size_t x = 0; x < N; ++x) {
+      std::swap(resources[x], other.resources[x]);
+      std::swap(array_p[x], other.array_p[x]);
+    }
+
+    std::swap(ctx, other.ctx);
+    std::swap(stream, other.stream);
+    std::swap(mapped, other.mapped);
+
+    return *this;
+  }
+
+  res_t(CUcontext ctx, CUstream stream) : resources {}, array_p {}, ctx { ctx }, stream { stream }, mapped { false } {}
+
+  /**
+   * Register tex[0] (Y) and tex[1] (uv) as read-only CUDA graphics resources.
+   *
+   * @return 0 on success, -1 on failure
+   */
+  int bind(gl::tex_t &tex) {
+    ctx_t ctx { this->ctx };
+
+    CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[0], tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y image");
+    CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[1], tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register uv image");
+
+    return 0;
+  }
+
+  /**
+   * Map all resources and fetch the CUDA array behind each of them.
+   * The resources are unmapped again when fetching an array fails.
+   *
+   * @return 0 on success, -1 on failure
+   */
+  int map() {
+    ctx_t ctx { this->ctx };
+
+    CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream), "Coudn't map cuda resources");
+
+    mapped = true;
+
+    // CU_CHECK would return right away and leave the resources mapped
+    if(CU_CHECK_IGNORE(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[0], resources[0], 0, 0), "Couldn't get mapped subresource [0]") ||
+       CU_CHECK_IGNORE(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[1], resources[1], 0, 0), "Couldn't get mapped subresource [1]")) {
+      unmap();
+
+      return -1;
+    }
+
+    return 0;
+  }
+
+  // Unmap the resources; does nothing when they are not mapped.
+  void unmap() {
+    // Either all or none are mapped
+    if(mapped) {
+      ctx_t ctx { this->ctx };
+
+      CU_CHECK_IGNORE(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream), "Couldn't unmap cuda resources");
+
+      mapped = false;
+    }
+  }
+
+  // The CUDA array of resource `index`, as fetched by the last map()
+  inline CUarray &operator[](std::size_t index) {
+    return array_p[index];
+  }
+
+  ~res_t() {
+    unmap();
+  }
+
+  std::array<res_internal_t::pointer, N> resources;
+  std::array<CUarray, N> array_p;
+
+  CUcontext ctx;
+  CUstream stream;
+
+  bool mapped;
+};
+
+/**
+ * Load the CUDA driver functions and initialize the driver API.
+ *
+ * make_hwdevice() calls this for every device it creates, so the function
+ * table is loaded only once: loading again would replace the table that
+ * already created devices call through.
+ *
+ * @return 0 on success, -1 on failure
+ */
+int init() {
+  if(!cdf) {
+    auto status = cuda_load_functions(&cdf, nullptr);
+    if(status) {
+      BOOST_LOG(error) << "Couldn't load cuda: "sv << status;
+
+      return -1;
+    }
+  }
+
+  CU_CHECK(cdf->cuInit(0), "Couldn't initialize cuda");
+
+  return 0;
+}
+
+/**
+ * Hardware device that converts a captured image with OpenGL into a full-size
+ * single-channel (Y) texture and a half-size two-channel (uv) texture, and
+ * copies both into the planes of a CUDA frame.
+ */
+class cuda_t : public platf::hwdevice_t {
+public:
+  /**
+   * Create the EGL display and context for `xdisplay` and remember the size
+   * of the incoming images.
+   *
+   * @return 0 on success, -1 on failure
+   */
+  int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
+    if(!cdf) {
+      BOOST_LOG(warning) << "cuda not initialized"sv;
+      return -1;
+    }
+
+    // NOTE(review): presumably a non-null marker for users of platf::hwdevice_t::data — confirm
+    this->data = (void *)0x1;
+
+    display = egl::make_display(xdisplay);
+    if(!display) {
+      return -1;
+    }
+
+    auto ctx_opt = egl::make_ctx(display.get());
+    if(!ctx_opt) {
+      return -1;
+    }
+
+    ctx = std::move(*ctx_opt);
+
+    width  = in_width;
+    height = in_height;
+
+    return 0;
+  }
+
+  /**
+   * Prepare everything that depends on the output frame: the two textures and
+   * framebuffers, their registration with CUDA, the hardware buffer of the
+   * frame and the converter from the input size to the frame size.
+   *
+   * @return 0 on success, -1 on failure
+   */
+  int set_frame(AVFrame *frame) override {
+    auto cuda_ctx = (AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx;
+
+    tex = gl::tex_t::make(2);
+    fb  = gl::frame_buf_t::make(2);
+
+    // tex[0]: one byte per pixel at full size, tex[1]: two bytes per pixel at half size
+    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]);
+    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RED, frame->width, frame->height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
+    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[1]);
+    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RG, frame->width / 2, frame->height / 2, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
+    gl::ctx.BindTexture(GL_TEXTURE_2D, 0);
+
+    fb.bind(std::begin(tex), std::end(tex));
+
+    // Use the CUDA context and stream of the frame's hardware device
+    res = res_t<2> { cuda_ctx->cuda_ctx, cuda_ctx->stream };
+
+    if(res.bind(tex)) {
+      return -1;
+    }
+
+    this->hwframe.reset(frame);
+    this->frame = frame;
+
+    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
+      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
+
+      return -1;
+    }
+
+    auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height);
+    if(!sws_opt) {
+      return -1;
+    }
+
+    this->sws = std::move(*sws_opt);
+
+    return 0;
+  }
+
+  /**
+   * Convert `img` into the two textures and copy them into the planes of the frame.
+   * The textures are unmapped again on every path, so a failed frame does not
+   * leave them mapped for the next call.
+   *
+   * @return 0 on success, -1 on failure
+   */
+  int convert(platf::img_t &img) override {
+    sws.load_ram(img);
+
+    if(sws.convert(fb)) {
+      return -1;
+    }
+
+    if(res.map()) {
+      // map() can fail after the resources were mapped; unmap() does nothing otherwise
+      res.unmap();
+
+      return -1;
+    }
+
+    // Push and pop cuda context
+    ctx_t ctx { res.ctx };
+    for(auto x = 0; x < 2; ++x) {
+      CUDA_MEMCPY2D desc {};
+
+      // Plane 1 has half the rows of plane 0
+      auto shift = x;
+
+      desc.srcPitch     = frame->width;
+      desc.dstPitch     = frame->linesize[x];
+      desc.Height       = frame->height >> shift;
+      desc.WidthInBytes = std::min(desc.srcPitch, desc.dstPitch);
+
+      desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+      desc.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+
+      desc.srcArray  = res[x];
+      desc.dstDevice = (CUdeviceptr)frame->data[x];
+
+      if(CU_CHECK_IGNORE(cdf->cuMemcpy2DAsync(&desc, res.stream), "Couldn't copy from OpenGL to cuda")) {
+        res.unmap();
+
+        return -1;
+      }
+    }
+
+    res.unmap();
+
+    return 0;
+  }
+
+  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
+    sws.set_colorspace(colorspace, color_range);
+  }
+
+  // Takes ownership of the frame passed to set_frame()
+  frame_t hwframe;
+
+  egl::display_t display;
+  egl::ctx_t ctx;
+
+  egl::sws_t sws;
+
+  gl::tex_t tex;
+  gl::frame_buf_t fb;
+
+  res_t<2> res;
+
+  // Size of the incoming images
+  int width, height;
+};
+
+// Create a cuda_t device for images of the given size captured from `xdisplay`.
+// Returns nullptr when cuda or the device itself cannot be initialized.
+std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
+  if(init() != 0) {
+    return nullptr;
+  }
+
+  auto device = std::make_shared<cuda_t>();
+
+  const bool failed = device->init(width, height, xdisplay) != 0;
+  if(failed) {
+    return nullptr;
+  }
+
+  return device;
+}
+} // namespace cuda
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@ -0,0 +1,12 @@
+#ifndef SUNSHINE_PLATFORM_CUDA_H
+#define SUNSHINE_PLATFORM_CUDA_H
+
+#include "sunshine/platform/common.h"
+#include "x11grab.h"
+
+namespace cuda {
+// Create the hardware device for images of the given size captured from `xdisplay`.
+// Returns nullptr on failure.
+std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay);
+
+// Load and initialize cuda. Returns 0 on success, -1 on failure.
+int init();
+} // namespace cuda
+
+#endif
--- a/sunshine/platform/linux/graphics.cpp
+++ b/sunshine/platform/linux/graphics.cpp
@ -313,19 +313,30 @@ bool fail() {
  return eglGetError() != EGL_SUCCESS;
 }

-display_t make_display(util::Either<gbm::gbm_t::pointer, wl_display *> native_display) {
+display_t make_display(std::variant<gbm::gbm_t::pointer, wl_display *, _XDisplay *> native_display) {
  constexpr auto EGL_PLATFORM_GBM_MESA    = 0x31D7;
  constexpr auto EGL_PLATFORM_WAYLAND_KHR = 0x31D8;
+  constexpr auto EGL_PLATFORM_X11_KHR     = 0x31D5;

  int egl_platform;
  void *native_display_p;
-  if(native_display.has_left()) {
+
+  switch(native_display.index()) {
+  case 0:
    egl_platform     = EGL_PLATFORM_GBM_MESA;
-    native_display_p = native_display.left();
-  }
-  else {
+    native_display_p = std::get<0>(native_display);
+    break;
+  case 1:
    egl_platform     = EGL_PLATFORM_WAYLAND_KHR;
-    native_display_p = native_display.right();
+    native_display_p = std::get<1>(native_display);
+    break;
+  case 2:
+    egl_platform     = EGL_PLATFORM_X11_KHR;
+    native_display_p = std::get<2>(native_display);
+    break;
+  default:
+    BOOST_LOG(error) << "egl::make_display(): Index ["sv << native_display.index() << "] not implemented"sv;
+    return nullptr;
  }

  // native_display.left() equals native_display.right()
@ -803,7 +814,7 @@ void sws_t::load_vram(img_descriptor_t &img, int offset_x, int offset_y, int tex
  }
 }

-int sws_t::convert(nv12_t &nv12) {
+int sws_t::convert(gl::frame_buf_t &fb) {
  gl::ctx.BindTexture(GL_TEXTURE_2D, loaded_texture);

  GLenum attachments[] {
@ -812,7 +823,7 @@ int sws_t::convert(nv12_t &nv12) {
  };

  for(int x = 0; x < sizeof(attachments) / sizeof(decltype(attachments[0])); ++x) {
-    gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, nv12->buf[x]);
+    gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, fb[x]);
    gl::ctx.DrawBuffers(1, &attachments[x]);

 #ifndef NDEBUG
--- a/sunshine/platform/linux/graphics.h
+++ b/sunshine/platform/linux/graphics.h
@ -19,6 +19,9 @@

 extern "C" int close(int __fd);

+// X11 Display
+extern "C" struct _XDisplay;
+
 struct AVFrame;
 void free_frame(AVFrame *frame);

@ -227,7 +230,7 @@ struct surface_descriptor_t {
  std::uint32_t offsets[4];
 };

-display_t make_display(util::Either<gbm::gbm_t::pointer, wl_display *> native_display);
+display_t make_display(std::variant<gbm::gbm_t::pointer, wl_display *, _XDisplay *> native_display);
 std::optional<ctx_t> make_ctx(display_t::pointer display);

 std::optional<rgb_t> import_source(
@ -276,7 +279,8 @@ public:
  static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_heigth, gl::tex_t &&tex);
  static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_heigth);

-  int convert(nv12_t &nv12);
+  // Convert the loaded image into the first two framebuffers
+  int convert(gl::frame_buf_t &fb);

  void load_ram(platf::img_t &img);
  void load_vram(img_descriptor_t &img, int offset_x, int offset_y, int texture);
--- a/sunshine/platform/linux/vaapi.cpp
+++ b/sunshine/platform/linux/vaapi.cpp
@ -3,8 +3,6 @@

 #include <fcntl.h>

-#include <glad/egl.h>
-
 extern "C" {
 #include <libavcodec/avcodec.h>
 }
@ -404,7 +402,7 @@ public:
  int convert(platf::img_t &img) override {
    sws.load_ram(img);

-    sws.convert(nv12);
+    sws.convert(nv12->buf);
    return 0;
  }
 };
@ -430,7 +428,7 @@ public:

    sws.load_vram(descriptor, offset_x, offset_y, rgb->tex[0]);

-    sws.convert(nv12);
+    sws.convert(nv12->buf);
    return 0;
  }

--- a/sunshine/platform/linux/x11grab.cpp
+++ b/sunshine/platform/linux/x11grab.cpp
@ -24,6 +24,7 @@
 #include "misc.h"
 #include "vaapi.h"
 #include "x11grab.h"
+#include "cuda.h"

 using namespace std::literals;

@ -259,9 +260,8 @@ void freeX(XFixesCursorImage *);
 using xcb_connect_t = util::dyn_safe_ptr<xcb_connection_t, &xcb::disconnect>;
 using xcb_img_t     = util::c_ptr<xcb_shm_get_image_reply_t>;

-using xdisplay_t = util::dyn_safe_ptr_v2<Display, int, &x11::CloseDisplay>;
-using ximg_t     = util::safe_ptr<XImage, freeImage>;
-using xcursor_t  = util::safe_ptr<XFixesCursorImage, freeX>;
+using ximg_t    = util::safe_ptr<XImage, freeImage>;
+using xcursor_t = util::safe_ptr<XFixesCursorImage, freeX>;

 using crtc_info_t   = util::dyn_safe_ptr<_XRRCrtcInfo, &x11::rr::FreeCrtcInfo>;
 using output_info_t = util::dyn_safe_ptr<_XRROutputInfo, &x11::rr::FreeOutputInfo>;
@ -366,7 +366,7 @@ static void blend_cursor(Display *display, img_t &img, int offsetX, int offsetY)
 struct x11_attr_t : public display_t {
  std::chrono::nanoseconds delay;

-  xdisplay_t xdisplay;
+  x11::xdisplay_t xdisplay;
  Window xwindow;
  XWindowAttributes xattr;

@ -516,6 +516,10 @@ struct x11_attr_t : public display_t {
      return va::make_hwdevice(width, height, false);
    }

+    if(mem_type == mem_type_e::cuda) {
+      return cuda::make_hwdevice(width, height, xdisplay.get());
+    }
+
    return std::make_shared<hwdevice_t>();
  }

@ -526,7 +530,7 @@ struct x11_attr_t : public display_t {
 };

 struct shm_attr_t : public x11_attr_t {
-  xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay
+  x11::xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay
  xcb_connect_t xcb;
  xcb_screen_t *display;
  std::uint32_t seg;
@ -713,7 +717,7 @@ std::vector<std::string> x11_display_names() {

  BOOST_LOG(info) << "Detecting connected monitors"sv;

-  xdisplay_t xdisplay { x11::OpenDisplay(nullptr) };
+  x11::xdisplay_t xdisplay { x11::OpenDisplay(nullptr) };
  if(!xdisplay) {
    return {};
  }
@ -807,8 +811,16 @@ void cursor_t::blend(img_t &img, int offsetX, int offsetY) {
  blend_cursor((xdisplay_t::pointer)ctx.get(), img, offsetX, offsetY);
 }

+// Open a display with OpenDisplay(nullptr) and hand it to an owning xdisplay_t
+xdisplay_t make_display() {
+  return OpenDisplay(nullptr);
+}
+
+// Deleter of xdisplay_t: closes the display with CloseDisplay()
+void freeDisplay(_XDisplay *xdisplay) {
+  CloseDisplay(xdisplay);
+}
+
 void freeCursorCtx(cursor_ctx_t::pointer ctx) {
-  x11::CloseDisplay((xdisplay_t::pointer)ctx);
+  CloseDisplay((xdisplay_t::pointer)ctx);
 }
 } // namespace x11
 } // namespace platf
--- a/sunshine/platform/linux/x11grab.h
+++ b/sunshine/platform/linux/x11grab.h
@ -6,6 +6,9 @@
 #include "sunshine/platform/common.h"
 #include "sunshine/utility.h"

+// X11 Display
+extern "C" struct _XDisplay;
+
 namespace egl {
 class cursor_t;
 }
@ -15,8 +18,10 @@ namespace platf::x11 {
 #ifdef SUNSHINE_BUILD_X11
 struct cursor_ctx_raw_t;
 void freeCursorCtx(cursor_ctx_raw_t *ctx);
+void freeDisplay(_XDisplay *xdisplay);

 using cursor_ctx_t = util::safe_ptr<cursor_ctx_raw_t, freeCursorCtx>;
+using xdisplay_t = util::safe_ptr<_XDisplay, freeDisplay>;

 class cursor_t {
 public:
@ -34,7 +39,12 @@ public:

  cursor_ctx_t ctx;
 };
+
+xdisplay_t make_display();
 #else
+// Without X11 the display is never something different from nullptr,
+// so the deleter has nothing to release. A no-op function is used instead of
+// std::default_delete, which cannot be instantiated for the incomplete _XDisplay.
+inline void freeDisplay(_XDisplay *) {}
+using xdisplay_t = util::safe_ptr<_XDisplay, freeDisplay>;
+
 class cursor_t {
 public:
  static std::optional<cursor_t> make() { return std::nullopt; }
@ -42,6 +52,8 @@ public:
  void capture(egl::cursor_t &) {}
  void blend(img_t &, int, int) {}
 };
+
+// Defined in the header, so it has to be inline to avoid multiple definitions at link time
+inline xdisplay_t make_display() { return nullptr; }
 #endif
 } // namespace platf::x11

--- a/sunshine/utility.h
+++ b/sunshine/utility.h
@ -64,8 +64,7 @@ struct argument_type<T(U)> { typedef U type; };

 #define KITTY_DEFAULT_CONSTR_MOVE(x)     \
  x(x &&) noexcept = default;            \
-  x &operator=(x &&) noexcept = default; \
-  x()                         = default;
+  x &operator=(x &&) noexcept = default;

 #define KITTY_DEFAULT_CONSTR_MOVE_THROW(x) \
  x(x &&)    = default;                    \
@ -415,9 +414,9 @@ inline std::int64_t from_view(const std::string_view &number) {
 }

 template<class X, class Y>
-class Either : public std::variant<X, Y> {
+class Either : public std::variant<std::monostate, X, Y> {
 public:
-  using std::variant<X, Y>::variant;
+  using std::variant<std::monostate, X, Y>::variant;

  constexpr bool has_left() const {
    return std::holds_alternative<X>(*this);
--- a/sunshine/video.cpp
+++ b/sunshine/video.cpp
@ -409,13 +409,11 @@ static encoder_t nvenc {
 #ifdef _WIN32
  AV_HWDEVICE_TYPE_D3D11VA,
  AV_PIX_FMT_D3D11,
-  AV_PIX_FMT_NV12, AV_PIX_FMT_P010,
 #else
  AV_HWDEVICE_TYPE_CUDA,
  AV_PIX_FMT_CUDA,
-  // Fully planar YUV formats are more efficient for sws_scale()
-  AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10,
 #endif
+  AV_PIX_FMT_NV12, AV_PIX_FMT_P010,
  {
    {
      { "forced-idr"s, 1 },
--- a/third-party/nv-codec-headers
+++ b/third-party/nv-codec-headers
@ -0,0 +1 @@
+Subproject commit b641a195edbe3ac9788e681e22c2e2fad8aacddb
				`@ -0,0 +1 @@`
				`Subproject commit b641a195edbe3ac9788e681e22c2e2fad8aacddb`