Convert images on the GPU on Linux for NVIDIA cards

This commit is contained in:
loki-47-6F-64 2021-09-14 11:58:21 +02:00
parent 1a7ed53559
commit 9982ae4675
13 changed files with 372 additions and 28 deletions

3
.gitmodules vendored
View File

@ -10,3 +10,6 @@
[submodule "third-party/miniupnp"]
path = third-party/miniupnp
url = https://github.com/miniupnp/miniupnp
[submodule "third-party/nv-codec-headers"]
path = third-party/nv-codec-headers
url = https://github.com/FFmpeg/nv-codec-headers

View File

@ -188,6 +188,8 @@ else()
sunshine/platform/linux/publish.cpp
sunshine/platform/linux/vaapi.h
sunshine/platform/linux/vaapi.cpp
sunshine/platform/linux/cuda.cpp
sunshine/platform/linux/cuda.h
sunshine/platform/linux/graphics.h
sunshine/platform/linux/graphics.cpp
sunshine/platform/linux/misc.h
@ -212,6 +214,7 @@ else()
include_directories(
/usr/include/libevdev-1.0
third-party/nv-codec-headers/include
third-party/glad/include)
if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH)

View File

@ -26,6 +26,8 @@
#include "upnp.h"
#include "video.h"
#include "platform/linux/cuda.h"
#include "platform/common.h"
extern "C" {
#include <libavutil/log.h>

View File

@ -0,0 +1,289 @@
#include "cuda.h"
#include "graphics.h"
#include "sunshine/main.h"
#include "sunshine/utility.h"
#include "wayland.h"
#include "x11grab.h"
#include <ffnvcodec/dynlink_loader.h>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/hwcontext_cuda.h>
#include <libavutil/imgutils.h>
}
// Appends the `sv` literal suffix to the (last) string literal in x, yielding a std::string_view literal.
// Requires `using namespace std::literals` to be visible where the macro is expanded.
#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)

// Evaluates the CUDA call x; when check() reports a failure, the message y (a string literal)
// has been logged and the *enclosing function* returns -1.
// Wrapped in do { } while(0) so that it behaves like a single statement: a bare `if` here would
// silently capture an `else` that follows the macro invocation.
#define CU_CHECK(x, y)                                     \
  do {                                                     \
    if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1; \
  } while(0)

// Evaluates the CUDA call x and logs y on failure, without returning.
// Expands to an expression whose value is the result of check(): 0 on success, -1 on failure.
#define CU_CHECK_IGNORE(x, y) \
  check((x), SUNSHINE_STRINGVIEW(y ": "))
using namespace std::literals;
namespace cuda {
// Deleter for the dynamically loaded CUDA driver function table.
void cff(CudaFunctions *cf) {
  // cuda_free_functions() takes the address of the table pointer, not the table itself.
  CudaFunctions *table = cf;
  cuda_free_functions(&table);
}

// Owning handle to the CUDA function table; the table is released through cff().
using cdf_t = util::safe_ptr<CudaFunctions, cff>;

// Function table used by every CUDA call in this file. It is filled in by init().
static cdf_t cdf;
/**
 * Translates a CUDA driver result into the 0 / -1 convention used in this file.
 *
 * @param result the value returned by a CUDA driver call
 * @param sv     prefix for the log line written when the call failed
 * @return 0 when result is CUDA_SUCCESS, -1 otherwise (after logging the error)
 */
inline static int check(CUresult result, const std::string_view &sv) {
  if(result == CUDA_SUCCESS) {
    return 0;
  }

  // The driver reports a NULL string for error codes it does not recognize, and the
  // lookups themselves may fail; never hand a null `const char *` to the log stream.
  const char *name        = nullptr;
  const char *description = nullptr;

  cdf->cuGetErrorName(result, &name);
  cdf->cuGetErrorString(result, &description);

  BOOST_LOG(error) << sv << (name ? name : "UNKNOWN") << ':' << (description ? description : "no description available");

  return -1;
}
/**
 * Scope guard that makes a CUDA context current for the lifetime of the object.
 *
 * The constructor pushes the context, the destructor pops it again. A failed push is
 * logged and remembered, so that the destructor does not pop a context this guard
 * never pushed.
 */
class ctx_t {
public:
  explicit ctx_t(CUcontext ctx) {
    pushed = CU_CHECK_IGNORE(cdf->cuCtxPushCurrent(ctx), "Couldn't push cuda context") == 0;
  }

  // A copy would pop the context a second time.
  ctx_t(const ctx_t &) = delete;
  ctx_t &operator=(const ctx_t &) = delete;

  ~ctx_t() {
    if(!pushed) {
      return;
    }

    CUcontext dummy;
    CU_CHECK_IGNORE(cdf->cuCtxPopCurrent(&dummy), "Couldn't pop cuda context");
  }

private:
  // True when the push in the constructor succeeded
  bool pushed { false };
};
// Unregisters a graphics resource from CUDA. A failure is logged rather than silently dropped.
void free_res(CUgraphicsResource res) {
  CU_CHECK_IGNORE(cdf->cuGraphicsUnregisterResource(res), "Couldn't unregister cuda resource");
}

// Owning handle type for a registered graphics resource; released through free_res().
using res_internal_t = util::safe_ptr<CUgraphicsResource_st, free_res>;
template<std::size_t N>
class res_t {
public:
res_t() : resources {}, mapped { false } {}
res_t(res_t &&other) noexcept : resources { other.resources }, array_p { other.array_p }, ctx { other.ctx }, stream { other.stream } {
other.resources = std::array<res_internal_t::pointer, N> {};
}
res_t &operator=(res_t &&other) {
for(auto x = 0; x < N; ++x) {
std::swap(resources[x], other.resources[x]);
std::swap(array_p[x], other.array_p[x]);
}
std::swap(ctx, other.ctx);
std::swap(stream, other.stream);
std::swap(mapped, other.mapped);
return *this;
}
res_t(CUcontext ctx, CUstream stream) : resources {}, ctx { ctx }, stream { stream }, mapped { false } {}
int bind(gl::tex_t &tex) {
ctx_t ctx { this->ctx };
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[0], tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y image");
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[1], tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register uv image");
return 0;
}
int map() {
ctx_t ctx { this->ctx };
CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream), "Coudn't map cuda resources");
mapped = true;
CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[0], resources[0], 0, 0), "Couldn't get mapped subresource [0]");
CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[1], resources[1], 0, 0), "Couldn't get mapped subresource [1]");
return 0;
}
void unmap() {
// Either all or none are mapped
if(mapped) {
ctx_t ctx { this->ctx };
CU_CHECK_IGNORE(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream), "Couldn't unmap cuda resources");
mapped = false;
}
}
inline CUarray &operator[](std::size_t index) {
return array_p[index];
}
~res_t() {
unmap();
}
std::array<res_internal_t::pointer, N> resources;
std::array<CUarray, N> array_p;
CUcontext ctx;
CUstream stream;
bool mapped;
};
int init() {
auto status = cuda_load_functions(&cdf, nullptr);
if(status) {
BOOST_LOG(error) << "Couldn't load cuda: "sv << status;
return -1;
}
CU_CHECK(cdf->cuInit(0), "Couldn't initialize cuda");
return 0;
}
/**
 * hwdevice that converts a captured image with OpenGL (egl::sws_t) into two textures
 * (Y and UV) and copies those textures through CUDA into the frame handed to set_frame().
 */
class cuda_t : public platf::hwdevice_t {
public:
  /**
   * Creates the EGL display and context on top of the X11 display.
   *
   * @param in_width  width of the images that will be passed to convert()
   * @param in_height height of the images that will be passed to convert()
   * @param xdisplay  X11 display used to create the EGL display
   * @return 0 on success, -1 when cuda isn't loaded or EGL couldn't be set up
   */
  int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
    if(!cdf) {
      BOOST_LOG(warning) << "cuda not initialized"sv;
      return -1;
    }

    // NOTE(review): presumably a non-null marker telling the encoder that this is a
    // hardware device; the value is never dereferenced here -- confirm against the caller.
    this->data = reinterpret_cast<void *>(0x1);

    display = egl::make_display(xdisplay);
    if(!display) {
      return -1;
    }

    auto ctx_opt = egl::make_ctx(display.get());
    if(!ctx_opt) {
      return -1;
    }

    ctx = std::move(*ctx_opt);

    width  = in_width;
    height = in_height;

    return 0;
  }

  /**
   * Prepares everything that depends on the output frame: the Y/UV textures, the
   * framebuffers, the CUDA registrations, the frame's buffer and the converter.
   *
   * @param frame the frame to encode into; this object takes ownership of it
   * @return 0 on success, -1 on failure
   */
  int set_frame(AVFrame *frame) override {
    // Take ownership right away: hwframe releases the frame on every path, including
    // the early failure below.
    this->hwframe.reset(frame);
    this->frame = frame;

    auto frames_ctx = reinterpret_cast<AVHWFramesContext *>(frame->hw_frames_ctx->data);
    auto cuda_ctx   = static_cast<AVCUDADeviceContext *>(frames_ctx->device_ctx->hwctx);

    tex = gl::tex_t::make(2);
    fb  = gl::frame_buf_t::make(2);

    // tex[0]: one byte per pixel at full resolution
    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]);
    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RED, frame->width, frame->height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);

    // tex[1]: two bytes per pixel at half the width and half the height
    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[1]);
    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RG, frame->width / 2, frame->height / 2, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);

    gl::ctx.BindTexture(GL_TEXTURE_2D, 0);

    fb.bind(std::begin(tex), std::end(tex));

    res = res_t<2> { cuda_ctx->cuda_ctx, cuda_ctx->stream };

    if(res.bind(tex)) {
      return -1;
    }

    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;

      return -1;
    }

    auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height);
    if(!sws_opt) {
      return -1;
    }

    this->sws = std::move(*sws_opt);

    return 0;
  }

  /**
   * Converts img into the framebuffers and copies both planes into the frame.
   * @return 0 on success, -1 on failure. The CUDA resources are unmapped in both cases.
   */
  int convert(platf::img_t &img) override {
    sws.load_ram(img);

    if(sws.convert(fb)) {
      return -1;
    }

    if(res.map()) {
      // map() may have mapped the resources before it failed; unmap() is a no-op otherwise
      res.unmap();
      return -1;
    }

    // Push and pop cuda context
    ctx_t ctx { res.ctx };
    for(auto x = 0; x < 2; ++x) {
      CUDA_MEMCPY2D desc {};

      desc.srcPitch = frame->width;
      desc.dstPitch = frame->linesize[x];

      // Plane 1 has half the rows of plane 0
      desc.Height       = frame->height >> x;
      desc.WidthInBytes = std::min(desc.srcPitch, desc.dstPitch);

      desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
      desc.dstMemoryType = CU_MEMORYTYPE_DEVICE;

      desc.srcArray  = res[x];
      desc.dstDevice = (CUdeviceptr)frame->data[x];

      if(CU_CHECK_IGNORE(cdf->cuMemcpy2DAsync(&desc, res.stream), "Couldn't copy from OpenGL to cuda")) {
        // Returning with the resources still mapped would leave them mapped for the next convert()
        res.unmap();
        return -1;
      }
    }

    res.unmap();

    return 0;
  }

  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
    sws.set_colorspace(colorspace, color_range);
  }

  // Owns the frame passed to set_frame(). Declared first and therefore destroyed last.
  frame_t hwframe;

  egl::display_t display;
  egl::ctx_t ctx;

  egl::sws_t sws;

  // tex[0] receives Y, tex[1] receives UV; fb holds one framebuffer per texture
  gl::tex_t tex;
  gl::frame_buf_t fb;

  // CUDA view on tex
  res_t<2> res;

  // Dimensions of the images passed to convert()
  int width, height;
};
std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
if(init()) {
return nullptr;
}
auto cuda = std::make_shared<cuda_t>();
if(cuda->init(width, height, xdisplay)) {
return nullptr;
}
return cuda;
}
} // namespace cuda

View File

@ -0,0 +1,12 @@
#ifndef SUNSHINE_PLATFORM_CUDA_H
#define SUNSHINE_PLATFORM_CUDA_H
#include "sunshine/platform/common.h"
#include "x11grab.h"
namespace cuda {
// Creates the cuda hwdevice for images of width x height, using xdisplay to set up EGL.
// Calls init() itself. Returns nullptr when cuda or the device could not be initialized.
std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay);
// Loads the cuda driver functions and initializes the driver.
// Returns 0 on success, -1 on failure.
int init();
} // namespace cuda
#endif

View File

@ -313,19 +313,30 @@ bool fail() {
return eglGetError() != EGL_SUCCESS;
}
display_t make_display(util::Either<gbm::gbm_t::pointer, wl_display *> native_display) {
display_t make_display(std::variant<gbm::gbm_t::pointer, wl_display *, _XDisplay *> native_display) {
constexpr auto EGL_PLATFORM_GBM_MESA = 0x31D7;
constexpr auto EGL_PLATFORM_WAYLAND_KHR = 0x31D8;
constexpr auto EGL_PLATFORM_X11_KHR = 0x31D5;
int egl_platform;
void *native_display_p;
if(native_display.has_left()) {
switch(native_display.index()) {
case 0:
egl_platform = EGL_PLATFORM_GBM_MESA;
native_display_p = native_display.left();
}
else {
native_display_p = std::get<0>(native_display);
break;
case 1:
egl_platform = EGL_PLATFORM_WAYLAND_KHR;
native_display_p = native_display.right();
native_display_p = std::get<1>(native_display);
break;
case 2:
egl_platform = EGL_PLATFORM_X11_KHR;
native_display_p = std::get<2>(native_display);
break;
default:
BOOST_LOG(error) << "egl::make_display(): Index ["sv << native_display.index() << "] not implemented"sv;
return nullptr;
}
// native_display.left() equals native_display.right()
@ -803,7 +814,7 @@ void sws_t::load_vram(img_descriptor_t &img, int offset_x, int offset_y, int tex
}
}
int sws_t::convert(nv12_t &nv12) {
int sws_t::convert(gl::frame_buf_t &fb) {
gl::ctx.BindTexture(GL_TEXTURE_2D, loaded_texture);
GLenum attachments[] {
@ -812,7 +823,7 @@ int sws_t::convert(nv12_t &nv12) {
};
for(int x = 0; x < sizeof(attachments) / sizeof(decltype(attachments[0])); ++x) {
gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, nv12->buf[x]);
gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, fb[x]);
gl::ctx.DrawBuffers(1, &attachments[x]);
#ifndef NDEBUG

View File

@ -19,6 +19,9 @@
extern "C" int close(int __fd);
// X11 Display
extern "C" struct _XDisplay;
struct AVFrame;
void free_frame(AVFrame *frame);
@ -227,7 +230,7 @@ struct surface_descriptor_t {
std::uint32_t offsets[4];
};
display_t make_display(util::Either<gbm::gbm_t::pointer, wl_display *> native_display);
display_t make_display(std::variant<gbm::gbm_t::pointer, wl_display *, _XDisplay *> native_display);
std::optional<ctx_t> make_ctx(display_t::pointer display);
std::optional<rgb_t> import_source(
@ -276,7 +279,8 @@ public:
static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_heigth, gl::tex_t &&tex);
static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_heigth);
int convert(nv12_t &nv12);
// Convert the loaded image into the first two framebuffers
int convert(gl::frame_buf_t &fb);
void load_ram(platf::img_t &img);
void load_vram(img_descriptor_t &img, int offset_x, int offset_y, int texture);

View File

@ -3,8 +3,6 @@
#include <fcntl.h>
#include <glad/egl.h>
extern "C" {
#include <libavcodec/avcodec.h>
}
@ -404,7 +402,7 @@ public:
int convert(platf::img_t &img) override {
sws.load_ram(img);
sws.convert(nv12);
sws.convert(nv12->buf);
return 0;
}
};
@ -430,7 +428,7 @@ public:
sws.load_vram(descriptor, offset_x, offset_y, rgb->tex[0]);
sws.convert(nv12);
sws.convert(nv12->buf);
return 0;
}

View File

@ -24,6 +24,7 @@
#include "misc.h"
#include "vaapi.h"
#include "x11grab.h"
#include "cuda.h"
using namespace std::literals;
@ -259,9 +260,8 @@ void freeX(XFixesCursorImage *);
using xcb_connect_t = util::dyn_safe_ptr<xcb_connection_t, &xcb::disconnect>;
using xcb_img_t = util::c_ptr<xcb_shm_get_image_reply_t>;
using xdisplay_t = util::dyn_safe_ptr_v2<Display, int, &x11::CloseDisplay>;
using ximg_t = util::safe_ptr<XImage, freeImage>;
using xcursor_t = util::safe_ptr<XFixesCursorImage, freeX>;
using ximg_t = util::safe_ptr<XImage, freeImage>;
using xcursor_t = util::safe_ptr<XFixesCursorImage, freeX>;
using crtc_info_t = util::dyn_safe_ptr<_XRRCrtcInfo, &x11::rr::FreeCrtcInfo>;
using output_info_t = util::dyn_safe_ptr<_XRROutputInfo, &x11::rr::FreeOutputInfo>;
@ -366,7 +366,7 @@ static void blend_cursor(Display *display, img_t &img, int offsetX, int offsetY)
struct x11_attr_t : public display_t {
std::chrono::nanoseconds delay;
xdisplay_t xdisplay;
x11::xdisplay_t xdisplay;
Window xwindow;
XWindowAttributes xattr;
@ -516,6 +516,10 @@ struct x11_attr_t : public display_t {
return va::make_hwdevice(width, height, false);
}
if(mem_type == mem_type_e::cuda) {
return cuda::make_hwdevice(width, height, xdisplay.get());
}
return std::make_shared<hwdevice_t>();
}
@ -526,7 +530,7 @@ struct x11_attr_t : public display_t {
};
struct shm_attr_t : public x11_attr_t {
xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay
x11::xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay
xcb_connect_t xcb;
xcb_screen_t *display;
std::uint32_t seg;
@ -713,7 +717,7 @@ std::vector<std::string> x11_display_names() {
BOOST_LOG(info) << "Detecting connected monitors"sv;
xdisplay_t xdisplay { x11::OpenDisplay(nullptr) };
x11::xdisplay_t xdisplay { x11::OpenDisplay(nullptr) };
if(!xdisplay) {
return {};
}
@ -807,8 +811,16 @@ void cursor_t::blend(img_t &img, int offsetX, int offsetY) {
blend_cursor((xdisplay_t::pointer)ctx.get(), img, offsetX, offsetY);
}
xdisplay_t make_display() {
return OpenDisplay(nullptr);
}
void freeDisplay(_XDisplay *xdisplay) {
CloseDisplay(xdisplay);
}
void freeCursorCtx(cursor_ctx_t::pointer ctx) {
x11::CloseDisplay((xdisplay_t::pointer)ctx);
CloseDisplay((xdisplay_t::pointer)ctx);
}
} // namespace x11
} // namespace platf

View File

@ -6,6 +6,9 @@
#include "sunshine/platform/common.h"
#include "sunshine/utility.h"
// X11 Display
extern "C" struct _XDisplay;
namespace egl {
class cursor_t;
}
@ -15,8 +18,10 @@ namespace platf::x11 {
#ifdef SUNSHINE_BUILD_X11
struct cursor_ctx_raw_t;
void freeCursorCtx(cursor_ctx_raw_t *ctx);
void freeDisplay(_XDisplay *xdisplay);
using cursor_ctx_t = util::safe_ptr<cursor_ctx_raw_t, freeCursorCtx>;
using xdisplay_t = util::safe_ptr<_XDisplay, freeDisplay>;
class cursor_t {
public:
@ -34,7 +39,12 @@ public:
cursor_ctx_t ctx;
};
xdisplay_t make_display();
#else
// Placeholder for builds without X11, so that xdisplay_t can still be named.
// It's never something different from nullptr
// NOTE(review): confirm that util::safe_ptr accepts a deleter *type* here; the X11 build
// passes a function (freeDisplay) as second argument.
using xdisplay_t = util::safe_ptr<_XDisplay, std::default_delete<_XDisplay>>;
class cursor_t {
public:
static std::optional<cursor_t> make() { return std::nullopt; }
@ -42,6 +52,8 @@ public:
void capture(egl::cursor_t &) {}
void blend(img_t &, int, int) {}
};
// Without X11 there is no display to open. Defined in the header, hence `inline`:
// otherwise every translation unit including this file emits its own definition.
inline xdisplay_t make_display() { return nullptr; }
#endif
} // namespace platf::x11

View File

@ -64,8 +64,7 @@ struct argument_type<T(U)> { typedef U type; };
#define KITTY_DEFAULT_CONSTR_MOVE(x) \
x(x &&) noexcept = default; \
x &operator=(x &&) noexcept = default; \
x() = default;
x &operator=(x &&) noexcept = default;
#define KITTY_DEFAULT_CONSTR_MOVE_THROW(x) \
x(x &&) = default; \
@ -415,9 +414,9 @@ inline std::int64_t from_view(const std::string_view &number) {
}
template<class X, class Y>
class Either : public std::variant<X, Y> {
class Either : public std::variant<std::monostate, X, Y> {
public:
using std::variant<X, Y>::variant;
using std::variant<std::monostate, X, Y>::variant;
constexpr bool has_left() const {
return std::holds_alternative<X>(*this);

View File

@ -409,13 +409,11 @@ static encoder_t nvenc {
#ifdef _WIN32
AV_HWDEVICE_TYPE_D3D11VA,
AV_PIX_FMT_D3D11,
AV_PIX_FMT_NV12, AV_PIX_FMT_P010,
#else
AV_HWDEVICE_TYPE_CUDA,
AV_PIX_FMT_CUDA,
// Fully planar YUV formats are more efficient for sws_scale()
AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10,
#endif
AV_PIX_FMT_NV12, AV_PIX_FMT_P010,
{
{
{ "forced-idr"s, 1 },

1
third-party/nv-codec-headers vendored Submodule

@ -0,0 +1 @@
Subproject commit b641a195edbe3ac9788e681e22c2e2fad8aacddb