From 9982ae46758858bd51c4eba603adbc9a125f7b24 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Tue, 14 Sep 2021 11:58:21 +0200
Subject: [PATCH 01/27] Convert images on the GPU on Linux for NVidia cards

---
 .gitmodules                          |   3 +
 CMakeLists.txt                       |   3 +
 sunshine/main.cpp                    |   2 +
 sunshine/platform/linux/cuda.cpp     | 289 +++++++++++++++++++++++++++
 sunshine/platform/linux/cuda.h       |  12 ++
 sunshine/platform/linux/graphics.cpp |  27 ++-
 sunshine/platform/linux/graphics.h   |   8 +-
 sunshine/platform/linux/vaapi.cpp    |   6 +-
 sunshine/platform/linux/x11grab.cpp  |  26 ++-
 sunshine/platform/linux/x11grab.h    |  12 ++
 sunshine/utility.h                   |   7 +-
 sunshine/video.cpp                   |   4 +-
 third-party/nv-codec-headers         |   1 +
 13 files changed, 372 insertions(+), 28 deletions(-)
 create mode 100644 sunshine/platform/linux/cuda.cpp
 create mode 100644 sunshine/platform/linux/cuda.h
 create mode 160000 third-party/nv-codec-headers

diff --git a/.gitmodules b/.gitmodules
index 153d2de8..39650e86 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "third-party/miniupnp"]
 	path = third-party/miniupnp
 	url = https://github.com/miniupnp/miniupnp
+[submodule "third-party/nv-codec-headers"]
+	path = third-party/nv-codec-headers
+	url = https://github.com/FFmpeg/nv-codec-headers
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ff42e7a..1b04f96f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -188,6 +188,8 @@ else()
 		sunshine/platform/linux/publish.cpp
 		sunshine/platform/linux/vaapi.h
 		sunshine/platform/linux/vaapi.cpp
+		sunshine/platform/linux/cuda.cpp
+		sunshine/platform/linux/cuda.h
 		sunshine/platform/linux/graphics.h
 		sunshine/platform/linux/graphics.cpp
 		sunshine/platform/linux/misc.h
@@ -212,6 +214,7 @@ else()
 	
 	include_directories(
 		/usr/include/libevdev-1.0
+		third-party/nv-codec-headers/include
 		third-party/glad/include)
 
 	if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH)
diff --git a/sunshine/main.cpp b/sunshine/main.cpp
index b2bed72f..e1f07bf5 100644
--- a/sunshine/main.cpp
+++ b/sunshine/main.cpp
@@ -26,6 +26,8 @@
 #include "upnp.h"
 #include "video.h"
 
+#include "platform/linux/cuda.h"
+
 #include "platform/common.h"
 extern "C" {
 #include <libavutil/log.h>
diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
new file mode 100644
index 00000000..5c79acde
--- /dev/null
+++ b/sunshine/platform/linux/cuda.cpp
@@ -0,0 +1,289 @@
+#include "cuda.h"
+#include "graphics.h"
+#include "sunshine/main.h"
+#include "sunshine/utility.h"
+#include "wayland.h"
+#include "x11grab.h"
+#include <ffnvcodec/dynlink_loader.h>
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavutil/hwcontext_cuda.h>
+#include <libavutil/imgutils.h>
+}
+
+#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
+#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
+
+#define CU_CHECK(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1
+
+#define CU_CHECK_IGNORE(x, y) \
+  check((x), SUNSHINE_STRINGVIEW(y ": "))
+
+using namespace std::literals;
+namespace cuda {
+void cff(CudaFunctions *cf) {
+  cuda_free_functions(&cf);
+}
+
+using cdf_t = util::safe_ptr<CudaFunctions, cff>;
+
+static cdf_t cdf;
+
+inline static int check(CUresult result, const std::string_view &sv) {
+  if(result != CUDA_SUCCESS) {
+    const char *name;
+    const char *description;
+
+    cdf->cuGetErrorName(result, &name);
+    cdf->cuGetErrorString(result, &description);
+
+    BOOST_LOG(error) << sv << name << ':' << description;
+    return -1;
+  }
+
+  return 0;
+}
+
+class ctx_t {
+public:
+  ctx_t(CUcontext ctx) {
+    CU_CHECK_IGNORE(cdf->cuCtxPushCurrent(ctx), "Couldn't push cuda context");
+  }
+
+  ~ctx_t() {
+    CUcontext dummy;
+
+    CU_CHECK_IGNORE(cdf->cuCtxPopCurrent(&dummy), "Couldn't pop cuda context");
+  }
+};
+
+void free_res(CUgraphicsResource res) {
+  cdf->cuGraphicsUnregisterResource(res);
+}
+
+using res_internal_t = util::safe_ptr<CUgraphicsResource_st, free_res>;
+
+template<std::size_t N>
+class res_t {
+public:
+  res_t() : resources {}, mapped { false } {}
+  res_t(res_t &&other) noexcept : resources { other.resources }, array_p { other.array_p }, ctx { other.ctx }, stream { other.stream } {
+    other.resources = std::array<res_internal_t::pointer, N> {};
+  }
+
+  res_t &operator=(res_t &&other) {
+    for(auto x = 0; x < N; ++x) {
+      std::swap(resources[x], other.resources[x]);
+      std::swap(array_p[x], other.array_p[x]);
+    }
+
+    std::swap(ctx, other.ctx);
+    std::swap(stream, other.stream);
+    std::swap(mapped, other.mapped);
+
+    return *this;
+  }
+
+  res_t(CUcontext ctx, CUstream stream) : resources {}, ctx { ctx }, stream { stream }, mapped { false } {}
+
+  int bind(gl::tex_t &tex) {
+    ctx_t ctx { this->ctx };
+
+    CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[0], tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y image");
+    CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[1], tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register uv image");
+
+    return 0;
+  }
+
+  int map() {
+    ctx_t ctx { this->ctx };
+
+    CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream), "Coudn't map cuda resources");
+
+    mapped = true;
+
+    CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[0], resources[0], 0, 0), "Couldn't get mapped subresource [0]");
+    CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[1], resources[1], 0, 0), "Couldn't get mapped subresource [1]");
+
+    return 0;
+  }
+
+  void unmap() {
+    // Either all or none are mapped
+    if(mapped) {
+      ctx_t ctx { this->ctx };
+
+      CU_CHECK_IGNORE(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream), "Couldn't unmap cuda resources");
+
+      mapped = false;
+    }
+  }
+
+  inline CUarray &operator[](std::size_t index) {
+    return array_p[index];
+  }
+
+  ~res_t() {
+    unmap();
+  }
+
+  std::array<res_internal_t::pointer, N> resources;
+  std::array<CUarray, N> array_p;
+
+  CUcontext ctx;
+  CUstream stream;
+
+  bool mapped;
+};
+
+int init() {
+  auto status = cuda_load_functions(&cdf, nullptr);
+  if(status) {
+    BOOST_LOG(error) << "Couldn't load cuda: "sv << status;
+
+    return -1;
+  }
+
+  CU_CHECK(cdf->cuInit(0), "Couldn't initialize cuda");
+
+  return 0;
+}
+
+class cuda_t : public platf::hwdevice_t {
+public:
+  int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
+    if(!cdf) {
+      BOOST_LOG(warning) << "cuda not initialized"sv;
+      return -1;
+    }
+
+    this->data = (void *)0x1;
+
+    display = egl::make_display(xdisplay);
+    if(!display) {
+      return -1;
+    }
+
+    auto ctx_opt = egl::make_ctx(display.get());
+    if(!ctx_opt) {
+      return -1;
+    }
+
+    ctx = std::move(*ctx_opt);
+
+    width  = in_width;
+    height = in_height;
+
+    return 0;
+  }
+
+  int set_frame(AVFrame *frame) override {
+    auto cuda_ctx = (AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx;
+
+    tex = gl::tex_t::make(2);
+    fb  = gl::frame_buf_t::make(2);
+
+    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]);
+    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RED, frame->width, frame->height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
+    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[1]);
+    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RG, frame->width / 2, frame->height / 2, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
+    gl::ctx.BindTexture(GL_TEXTURE_2D, 0);
+
+    fb.bind(std::begin(tex), std::end(tex));
+
+    res = res_t<2> { cuda_ctx->cuda_ctx, cuda_ctx->stream };
+
+    if(res.bind(tex)) {
+      return -1;
+    }
+
+    this->hwframe.reset(frame);
+    this->frame = frame;
+
+    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
+      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
+
+      return -1;
+    }
+
+    auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height);
+    if(!sws_opt) {
+      return -1;
+    }
+
+    this->sws = std::move(*sws_opt);
+
+    return 0;
+  }
+
+  int convert(platf::img_t &img) override {
+    sws.load_ram(img);
+
+    if(sws.convert(fb)) {
+      return -1;
+    }
+
+    if(res.map()) {
+      return -1;
+    }
+
+    // Push and pop cuda context
+    ctx_t ctx { res.ctx };
+    for(auto x = 0; x < 2; ++x) {
+      CUDA_MEMCPY2D desc {};
+
+      auto shift = x;
+
+      desc.srcPitch     = frame->width;
+      desc.dstPitch     = frame->linesize[x];
+      desc.Height       = frame->height >> shift;
+      desc.WidthInBytes = std::min(desc.srcPitch, desc.dstPitch);
+
+      desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+      desc.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+
+      desc.srcArray  = res[x];
+      desc.dstDevice = (CUdeviceptr)frame->data[x];
+
+      CU_CHECK(cdf->cuMemcpy2DAsync(&desc, res.stream), "Couldn't copy from OpenGL to cuda");
+    }
+
+    res.unmap();
+
+    return 0;
+  }
+
+  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
+    sws.set_colorspace(colorspace, color_range);
+  }
+
+  frame_t hwframe;
+
+  egl::display_t display;
+  egl::ctx_t ctx;
+
+  egl::sws_t sws;
+
+  gl::tex_t tex;
+  gl::frame_buf_t fb;
+
+  res_t<2> res;
+
+  int width, height;
+};
+
+std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
+  if(init()) {
+    return nullptr;
+  }
+
+  auto cuda = std::make_shared<cuda_t>();
+  if(cuda->init(width, height, xdisplay)) {
+    return nullptr;
+  }
+
+  return cuda;
+}
+} // namespace cuda
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
new file mode 100644
index 00000000..a56c961b
--- /dev/null
+++ b/sunshine/platform/linux/cuda.h
@@ -0,0 +1,12 @@
+#ifndef SUNSHINE_PLATFORM_CUDA_H
+#define SUNSHINE_PLATFORM_CUDA_H
+
+#include "sunshine/platform/common.h"
+#include "x11grab.h"
+
+namespace cuda {
+std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay);
+int init();
+} // namespace cuda
+
+#endif
\ No newline at end of file
diff --git a/sunshine/platform/linux/graphics.cpp b/sunshine/platform/linux/graphics.cpp
index bbf75324..3e0594b8 100644
--- a/sunshine/platform/linux/graphics.cpp
+++ b/sunshine/platform/linux/graphics.cpp
@@ -313,19 +313,30 @@ bool fail() {
   return eglGetError() != EGL_SUCCESS;
 }
 
-display_t make_display(util::Either<gbm::gbm_t::pointer, wl_display *> native_display) {
+display_t make_display(std::variant<gbm::gbm_t::pointer, wl_display *, _XDisplay *> native_display) {
   constexpr auto EGL_PLATFORM_GBM_MESA    = 0x31D7;
   constexpr auto EGL_PLATFORM_WAYLAND_KHR = 0x31D8;
+  constexpr auto EGL_PLATFORM_X11_KHR     = 0x31D5;
 
   int egl_platform;
   void *native_display_p;
-  if(native_display.has_left()) {
+
+  switch(native_display.index()) {
+  case 0:
     egl_platform     = EGL_PLATFORM_GBM_MESA;
-    native_display_p = native_display.left();
-  }
-  else {
+    native_display_p = std::get<0>(native_display);
+    break;
+  case 1:
     egl_platform     = EGL_PLATFORM_WAYLAND_KHR;
-    native_display_p = native_display.right();
+    native_display_p = std::get<1>(native_display);
+    break;
+  case 2:
+    egl_platform     = EGL_PLATFORM_X11_KHR;
+    native_display_p = std::get<2>(native_display);
+    break;
+  default:
+    BOOST_LOG(error) << "egl::make_display(): Index ["sv << native_display.index() << "] not implemented"sv;
+    return nullptr;
   }
 
   // native_display.left() equals native_display.right()
@@ -803,7 +814,7 @@ void sws_t::load_vram(img_descriptor_t &img, int offset_x, int offset_y, int tex
   }
 }
 
-int sws_t::convert(nv12_t &nv12) {
+int sws_t::convert(gl::frame_buf_t &fb) {
   gl::ctx.BindTexture(GL_TEXTURE_2D, loaded_texture);
 
   GLenum attachments[] {
@@ -812,7 +823,7 @@ int sws_t::convert(nv12_t &nv12) {
   };
 
   for(int x = 0; x < sizeof(attachments) / sizeof(decltype(attachments[0])); ++x) {
-    gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, nv12->buf[x]);
+    gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, fb[x]);
     gl::ctx.DrawBuffers(1, &attachments[x]);
 
 #ifndef NDEBUG
diff --git a/sunshine/platform/linux/graphics.h b/sunshine/platform/linux/graphics.h
index 02de4d93..5599d9a4 100644
--- a/sunshine/platform/linux/graphics.h
+++ b/sunshine/platform/linux/graphics.h
@@ -19,6 +19,9 @@
 
 extern "C" int close(int __fd);
 
+// X11 Display
+extern "C" struct _XDisplay;
+
 struct AVFrame;
 void free_frame(AVFrame *frame);
 
@@ -227,7 +230,7 @@ struct surface_descriptor_t {
   std::uint32_t offsets[4];
 };
 
-display_t make_display(util::Either<gbm::gbm_t::pointer, wl_display *> native_display);
+display_t make_display(std::variant<gbm::gbm_t::pointer, wl_display *, _XDisplay *> native_display);
 std::optional<ctx_t> make_ctx(display_t::pointer display);
 
 std::optional<rgb_t> import_source(
@@ -276,7 +279,8 @@ public:
   static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_heigth, gl::tex_t &&tex);
   static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_heigth);
 
-  int convert(nv12_t &nv12);
+  // Convert the loaded image into the first two framebuffers
+  int convert(gl::frame_buf_t &fb);
 
   void load_ram(platf::img_t &img);
   void load_vram(img_descriptor_t &img, int offset_x, int offset_y, int texture);
diff --git a/sunshine/platform/linux/vaapi.cpp b/sunshine/platform/linux/vaapi.cpp
index 33646333..37a64cb8 100644
--- a/sunshine/platform/linux/vaapi.cpp
+++ b/sunshine/platform/linux/vaapi.cpp
@@ -3,8 +3,6 @@
 
 #include <fcntl.h>
 
-#include <glad/egl.h>
-
 extern "C" {
 #include <libavcodec/avcodec.h>
 }
@@ -404,7 +402,7 @@ public:
   int convert(platf::img_t &img) override {
     sws.load_ram(img);
 
-    sws.convert(nv12);
+    sws.convert(nv12->buf);
     return 0;
   }
 };
@@ -430,7 +428,7 @@ public:
 
     sws.load_vram(descriptor, offset_x, offset_y, rgb->tex[0]);
 
-    sws.convert(nv12);
+    sws.convert(nv12->buf);
     return 0;
   }
 
diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp
index 33797f24..24a87900 100644
--- a/sunshine/platform/linux/x11grab.cpp
+++ b/sunshine/platform/linux/x11grab.cpp
@@ -24,6 +24,7 @@
 #include "misc.h"
 #include "vaapi.h"
 #include "x11grab.h"
+#include "cuda.h"
 
 using namespace std::literals;
 
@@ -259,9 +260,8 @@ void freeX(XFixesCursorImage *);
 using xcb_connect_t = util::dyn_safe_ptr<xcb_connection_t, &xcb::disconnect>;
 using xcb_img_t     = util::c_ptr<xcb_shm_get_image_reply_t>;
 
-using xdisplay_t = util::dyn_safe_ptr_v2<Display, int, &x11::CloseDisplay>;
-using ximg_t     = util::safe_ptr<XImage, freeImage>;
-using xcursor_t  = util::safe_ptr<XFixesCursorImage, freeX>;
+using ximg_t    = util::safe_ptr<XImage, freeImage>;
+using xcursor_t = util::safe_ptr<XFixesCursorImage, freeX>;
 
 using crtc_info_t   = util::dyn_safe_ptr<_XRRCrtcInfo, &x11::rr::FreeCrtcInfo>;
 using output_info_t = util::dyn_safe_ptr<_XRROutputInfo, &x11::rr::FreeOutputInfo>;
@@ -366,7 +366,7 @@ static void blend_cursor(Display *display, img_t &img, int offsetX, int offsetY)
 struct x11_attr_t : public display_t {
   std::chrono::nanoseconds delay;
 
-  xdisplay_t xdisplay;
+  x11::xdisplay_t xdisplay;
   Window xwindow;
   XWindowAttributes xattr;
 
@@ -516,6 +516,10 @@ struct x11_attr_t : public display_t {
       return va::make_hwdevice(width, height, false);
     }
 
+    if(mem_type == mem_type_e::cuda) {
+      return cuda::make_hwdevice(width, height, xdisplay.get());
+    }
+
     return std::make_shared<hwdevice_t>();
   }
 
@@ -526,7 +530,7 @@ struct x11_attr_t : public display_t {
 };
 
 struct shm_attr_t : public x11_attr_t {
-  xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay
+  x11::xdisplay_t shm_xdisplay; // Prevent race condition with x11_attr_t::xdisplay
   xcb_connect_t xcb;
   xcb_screen_t *display;
   std::uint32_t seg;
@@ -713,7 +717,7 @@ std::vector<std::string> x11_display_names() {
 
   BOOST_LOG(info) << "Detecting connected monitors"sv;
 
-  xdisplay_t xdisplay { x11::OpenDisplay(nullptr) };
+  x11::xdisplay_t xdisplay { x11::OpenDisplay(nullptr) };
   if(!xdisplay) {
     return {};
   }
@@ -807,8 +811,16 @@ void cursor_t::blend(img_t &img, int offsetX, int offsetY) {
   blend_cursor((xdisplay_t::pointer)ctx.get(), img, offsetX, offsetY);
 }
 
+xdisplay_t make_display() {
+  return OpenDisplay(nullptr);
+}
+
+void freeDisplay(_XDisplay *xdisplay) {
+  CloseDisplay(xdisplay);
+}
+
 void freeCursorCtx(cursor_ctx_t::pointer ctx) {
-  x11::CloseDisplay((xdisplay_t::pointer)ctx);
+  CloseDisplay((xdisplay_t::pointer)ctx);
 }
 } // namespace x11
 } // namespace platf
diff --git a/sunshine/platform/linux/x11grab.h b/sunshine/platform/linux/x11grab.h
index 1440ae76..3d2868c8 100644
--- a/sunshine/platform/linux/x11grab.h
+++ b/sunshine/platform/linux/x11grab.h
@@ -6,6 +6,9 @@
 #include "sunshine/platform/common.h"
 #include "sunshine/utility.h"
 
+// X11 Display
+extern "C" struct _XDisplay;
+
 namespace egl {
 class cursor_t;
 }
@@ -15,8 +18,10 @@ namespace platf::x11 {
 #ifdef SUNSHINE_BUILD_X11
 struct cursor_ctx_raw_t;
 void freeCursorCtx(cursor_ctx_raw_t *ctx);
+void freeDisplay(_XDisplay *xdisplay);
 
 using cursor_ctx_t = util::safe_ptr<cursor_ctx_raw_t, freeCursorCtx>;
+using xdisplay_t = util::safe_ptr<_XDisplay, freeDisplay>;
 
 class cursor_t {
 public:
@@ -34,7 +39,12 @@ public:
 
   cursor_ctx_t ctx;
 };
+
+xdisplay_t make_display();
 #else
+// It's never something different from nullptr
+util::safe_ptr<_XDisplay, std::default_delete<_XDisplay>>;
+
 class cursor_t {
 public:
   static std::optional<cursor_t> make() { return std::nullopt; }
@@ -42,6 +52,8 @@ public:
   void capture(egl::cursor_t &) {}
   void blend(img_t &, int, int) {}
 };
+
+xdisplay_t make_display() { return nullptr; }
 #endif
 } // namespace platf::x11
 
diff --git a/sunshine/utility.h b/sunshine/utility.h
index 90435696..0e585e21 100644
--- a/sunshine/utility.h
+++ b/sunshine/utility.h
@@ -64,8 +64,7 @@ struct argument_type<T(U)> { typedef U type; };
 
 #define KITTY_DEFAULT_CONSTR_MOVE(x)     \
   x(x &&) noexcept = default;            \
-  x &operator=(x &&) noexcept = default; \
-  x()                         = default;
+  x &operator=(x &&) noexcept = default;
 
 #define KITTY_DEFAULT_CONSTR_MOVE_THROW(x) \
   x(x &&)    = default;                    \
@@ -415,9 +414,9 @@ inline std::int64_t from_view(const std::string_view &number) {
 }
 
 template<class X, class Y>
-class Either : public std::variant<X, Y> {
+class Either : public std::variant<std::monostate, X, Y> {
 public:
-  using std::variant<X, Y>::variant;
+  using std::variant<std::monostate, X, Y>::variant;
 
   constexpr bool has_left() const {
     return std::holds_alternative<X>(*this);
diff --git a/sunshine/video.cpp b/sunshine/video.cpp
index 261d86bc..7205aa07 100644
--- a/sunshine/video.cpp
+++ b/sunshine/video.cpp
@@ -409,13 +409,11 @@ static encoder_t nvenc {
 #ifdef _WIN32
   AV_HWDEVICE_TYPE_D3D11VA,
   AV_PIX_FMT_D3D11,
-  AV_PIX_FMT_NV12, AV_PIX_FMT_P010,
 #else
   AV_HWDEVICE_TYPE_CUDA,
   AV_PIX_FMT_CUDA,
-  // Fully planar YUV formats are more efficient for sws_scale()
-  AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10,
 #endif
+  AV_PIX_FMT_NV12, AV_PIX_FMT_P010,
   {
     {
       { "forced-idr"s, 1 },
diff --git a/third-party/nv-codec-headers b/third-party/nv-codec-headers
new file mode 160000
index 00000000..b641a195
--- /dev/null
+++ b/third-party/nv-codec-headers
@@ -0,0 +1 @@
+Subproject commit b641a195edbe3ac9788e681e22c2e2fad8aacddb

From f5db0e438b7a20a1b0fbf81964720ded088c43c5 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Tue, 14 Sep 2021 15:07:34 +0200
Subject: [PATCH 02/27] The background is black instead of green

---
 sunshine/platform/linux/cuda.cpp     |  5 ++---
 sunshine/platform/linux/graphics.cpp | 14 ++++++++++++++
 sunshine/platform/linux/graphics.h   |  3 +++
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index 5c79acde..cdcaba57 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -213,9 +213,8 @@ public:
       return -1;
     }
 
-    this->sws = std::move(*sws_opt);
-
-    return 0;
+    sws = std::move(*sws_opt);
+    return sws.blank(fb, 0, 0, frame->width, frame->height);
   }
 
   int convert(platf::img_t &img) override {
diff --git a/sunshine/platform/linux/graphics.cpp b/sunshine/platform/linux/graphics.cpp
index 3e0594b8..8d31e37b 100644
--- a/sunshine/platform/linux/graphics.cpp
+++ b/sunshine/platform/linux/graphics.cpp
@@ -739,6 +739,20 @@ std::optional<sws_t> sws_t::make(int in_width, int in_height, int out_width, int
   return std::move(sws);
 }
 
+int sws_t::blank(gl::frame_buf_t &fb, int offsetX, int offsetY, int width, int height) {
+  auto f = [&]() {
+    std::swap(offsetX, this->offsetX);
+    std::swap(offsetY, this->offsetY);
+    std::swap(width, this->out_width);
+    std::swap(height, this->out_height);
+  };
+
+  f();
+  auto fg = util::fail_guard(f);
+
+  return convert(fb);
+}
+
 std::optional<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_heigth) {
   auto tex = gl::tex_t::make(2);
   gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]);
diff --git a/sunshine/platform/linux/graphics.h b/sunshine/platform/linux/graphics.h
index 5599d9a4..2cc24b01 100644
--- a/sunshine/platform/linux/graphics.h
+++ b/sunshine/platform/linux/graphics.h
@@ -282,6 +282,9 @@ public:
   // Convert the loaded image into the first two framebuffers
   int convert(gl::frame_buf_t &fb);
 
+  // Make an area of the image black
+  int blank(gl::frame_buf_t &fb, int offsetX, int offsetY, int width, int height);
+
   void load_ram(platf::img_t &img);
   void load_vram(img_descriptor_t &img, int offset_x, int offset_y, int texture);
 

From c94d922282e2e6c4044b5c456ffcb9224b80db81 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Tue, 14 Sep 2021 19:16:29 +0200
Subject: [PATCH 03/27] Fix windows build

---
 sunshine/main.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sunshine/main.cpp b/sunshine/main.cpp
index e1f07bf5..b2bed72f 100644
--- a/sunshine/main.cpp
+++ b/sunshine/main.cpp
@@ -26,8 +26,6 @@
 #include "upnp.h"
 #include "video.h"
 
-#include "platform/linux/cuda.h"
-
 #include "platform/common.h"
 extern "C" {
 #include <libavutil/log.h>

From e3f642ac25ac4ad6f47135b2334380ff1caedfc8 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Wed, 15 Sep 2021 12:10:12 +0200
Subject: [PATCH 04/27] Reduce cpu usage with x11grab

---
 sunshine/platform/linux/x11grab.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp
index 24a87900..c8bc70f0 100644
--- a/sunshine/platform/linux/x11grab.cpp
+++ b/sunshine/platform/linux/x11grab.cpp
@@ -458,6 +458,7 @@ struct x11_attr_t : public display_t {
         std::this_thread::sleep_for((next_frame - now) / 3 * 2);
       }
       while(next_frame > now) {
+        std::this_thread::sleep_for(1ns);
         now = std::chrono::steady_clock::now();
       }
       next_frame = now + delay;
@@ -566,6 +567,7 @@ struct shm_attr_t : public x11_attr_t {
         std::this_thread::sleep_for((next_frame - now) / 3 * 2);
       }
       while(next_frame > now) {
+        std::this_thread::sleep_for(1ns);
         now = std::chrono::steady_clock::now();
       }
       next_frame = now + delay;

From fed329568ca44993d9cc0a8fe145e73a332fb111 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sun, 19 Sep 2021 20:40:34 +0200
Subject: [PATCH 05/27] Use an actual cuda kernel to convert RGB to NV12

---
 CMakeLists.txt                   |   39 +-
 sunshine/platform/linux/cuda.cpp |  212 +++-
 sunshine/platform/linux/cuda.cu  |  248 ++++
 sunshine/platform/linux/cuda.h   |   46 +
 sunshine/video.cpp               |   26 +-
 third-party/nvfbc/NvFBC.h        | 2006 ++++++++++++++++++++++++++++++
 third-party/nvfbc/helper_math.h  | 1469 ++++++++++++++++++++++
 7 files changed, 4007 insertions(+), 39 deletions(-)
 create mode 100644 sunshine/platform/linux/cuda.cu
 create mode 100644 third-party/nvfbc/NvFBC.h
 create mode 100644 third-party/nvfbc/helper_math.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b04f96f..df0424d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,15 +4,6 @@ project(Sunshine)
 
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
-add_subdirectory(third-party/Simple-Web-Server)
-
-set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries")
-set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc")
-set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc")
-set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc")
-add_subdirectory(third-party/miniupnp/miniupnpc)
-include_directories(third-party/miniupnp)
-
 if(WIN32)
 	# Ugly hack to compile with #include <qos2.h>
 	add_compile_definitions(
@@ -21,9 +12,20 @@ if(WIN32)
 		QOS_NON_ADAPTIVE_FLOW=2)
 endif()
 add_subdirectory(third-party/moonlight-common-c/enet)
+add_subdirectory(third-party/Simple-Web-Server)
+add_subdirectory(third-party/cbs)
+
+set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries")
+set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc")
+set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc")
+set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc")
+add_subdirectory(third-party/miniupnp/miniupnpc)
+include_directories(third-party/miniupnp)
 
 find_package(Threads REQUIRED)
 find_package(OpenSSL REQUIRED)
+set(Boost_USE_STATIC_LIBS ON)
+find_package(Boost COMPONENTS log filesystem REQUIRED)
 
 list(APPEND SUNSHINE_COMPILE_OPTIONS -fPIC -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare)
 
@@ -106,6 +108,11 @@ else()
 	option(SUNSHINE_ENABLE_X11 "Enable X11 grab if available" ON)
 	option(SUNSHINE_ENABLE_WAYLAND "Enable building wayland specific code" ON)
 
+	if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  	set(CMAKE_CUDA_ARCHITECTURES 75)
+	endif()
+	enable_language(CUDA)
+
 	if(${SUNSHINE_ENABLE_X11})
 		find_package(X11)
 	else()
@@ -188,6 +195,7 @@ else()
 		sunshine/platform/linux/publish.cpp
 		sunshine/platform/linux/vaapi.h
 		sunshine/platform/linux/vaapi.cpp
+		sunshine/platform/linux/cuda.cu
 		sunshine/platform/linux/cuda.cpp
 		sunshine/platform/linux/cuda.h
 		sunshine/platform/linux/graphics.h
@@ -203,7 +211,8 @@ else()
 		third-party/glad/include/EGL/eglplatform.h
 		third-party/glad/include/KHR/khrplatform.h
 		third-party/glad/include/glad/gl.h
-		third-party/glad/include/glad/egl.h)
+		third-party/glad/include/glad/egl.h
+		third-party/nvfbc/NvFBC.h)
 		
 	list(APPEND PLATFORM_LIBRARIES
 		dl
@@ -215,7 +224,8 @@ else()
 	include_directories(
 		/usr/include/libevdev-1.0
 		third-party/nv-codec-headers/include
-		third-party/glad/include)
+		third-party/glad/include
+		third-party/nvfbc)
 
 	if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH)
 		set(SUNSHINE_EXECUTABLE_PATH "sunshine")
@@ -224,11 +234,6 @@ else()
 	configure_file(sunshine.service.in sunshine.service @ONLY)
 endif()
 
-add_subdirectory(third-party/cbs)
-
-set(Boost_USE_STATIC_LIBS ON)
-find_package(Boost COMPONENTS log filesystem REQUIRED)
-
 set(SUNSHINE_TARGET_FILES
 	third-party/moonlight-common-c/reedsolomon/rs.c
 	third-party/moonlight-common-c/reedsolomon/rs.h
@@ -290,7 +295,7 @@ include_directories(
 
 string(TOUPPER "x${CMAKE_BUILD_TYPE}" BUILD_TYPE)
 if("${BUILD_TYPE}" STREQUAL "XDEBUG")
-	list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -pedantic -ggdb3)
+	list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -ggdb3)
 	if(WIN32)
 		set_source_files_properties(sunshine/nvhttp.cpp PROPERTIES COMPILE_FLAGS -O2)
 	endif()
diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index cdcaba57..811293d6 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -1,9 +1,4 @@
-#include "cuda.h"
-#include "graphics.h"
-#include "sunshine/main.h"
-#include "sunshine/utility.h"
-#include "wayland.h"
-#include "x11grab.h"
+#include <NvFBC.h>
 #include <ffnvcodec/dynlink_loader.h>
 
 extern "C" {
@@ -12,6 +7,13 @@ extern "C" {
 #include <libavutil/imgutils.h>
 }
 
+#include "cuda.h"
+#include "graphics.h"
+#include "sunshine/main.h"
+#include "sunshine/utility.h"
+#include "wayland.h"
+#include "x11grab.h"
+
 #define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
 #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
 
@@ -23,6 +25,13 @@ extern "C" {
 
 using namespace std::literals;
 namespace cuda {
+constexpr auto cudaDevAttrMaxThreadsPerBlock          = (CUdevice_attribute)1;
+constexpr auto cudaDevAttrMaxThreadsPerMultiProcessor = (CUdevice_attribute)39;
+
+void pass_error(const std::string_view &sv, const char *name, const char *description) {
+  BOOST_LOG(error) << sv << name << ':' << description;
+}
+
 void cff(CudaFunctions *cf) {
   cuda_free_functions(&cf);
 }
@@ -151,7 +160,7 @@ int init() {
   return 0;
 }
 
-class cuda_t : public platf::hwdevice_t {
+class opengl_t : public platf::hwdevice_t {
 public:
   int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
     if(!cdf) {
@@ -273,16 +282,203 @@ public:
   int width, height;
 };
 
+class cuda_t : public platf::hwdevice_t {
+public:
+  ~cuda_t() override {
+    // sws_t needs to be destroyed while the context is active
+    if(sws) {
+      ctx_t ctx { cuda_ctx };
+
+      sws.reset();
+    }
+  }
+
+  int init(int in_width, int in_height) {
+    if(!cdf) {
+      BOOST_LOG(warning) << "cuda not initialized"sv;
+      return -1;
+    }
+
+    data = (void *)0x1;
+
+    width  = in_width;
+    height = in_height;
+
+    return 0;
+  }
+
+  int set_frame(AVFrame *frame) override {
+    this->hwframe.reset(frame);
+    this->frame = frame;
+
+    if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) {
+      BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
+      return -1;
+    }
+
+    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
+      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
+
+      return -1;
+    }
+
+    cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx;
+
+    ctx_t ctx { cuda_ctx };
+    sws = sws_t::make(width * 4, height, frame->width, frame->height);
+
+    if(!sws) {
+      return -1;
+    }
+
+    return 0;
+  }
+
+  int convert(platf::img_t &img) override {
+    ctx_t ctx { cuda_ctx };
+
+    return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]);
+  }
+
+  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
+    ctx_t ctx { cuda_ctx };
+    sws->set_colorspace(colorspace, color_range);
+  }
+
+  frame_t hwframe;
+
+  std::unique_ptr<sws_t> sws;
+
+  int width, height;
+
+  CUcontext cuda_ctx;
+};
+
 std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
   if(init()) {
     return nullptr;
   }
 
   auto cuda = std::make_shared<cuda_t>();
-  if(cuda->init(width, height, xdisplay)) {
+  if(cuda->init(width, height)) {
     return nullptr;
   }
 
   return cuda;
 }
 } // namespace cuda
+
+namespace platf::nvfbc {
+static PNVFBCCREATEINSTANCE createInstance {};
+static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION };
+
+static void *handle { nullptr };
+int init() {
+  static bool funcs_loaded = false;
+
+  if(funcs_loaded) return 0;
+
+  if(!handle) {
+    handle = dyn::handle({ "libnvidia-fbc.so.1", "libnvidia-fbc.so" });
+    if(!handle) {
+      return -1;
+    }
+  }
+
+  std::vector<std::tuple<dyn::apiproc *, const char *>> funcs {
+    { (dyn::apiproc *)&createInstance, "NvFBCCreateInstance" },
+  };
+
+  if(dyn::load(handle, funcs)) {
+    dlclose(handle);
+    handle = nullptr;
+
+    return -1;
+  }
+
+  funcs_loaded = true;
+  return 0;
+}
+
+class handle_t {
+  KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits<std::uint64_t>::max(), {
+    if(el == std::numeric_limits<std::uint64_t>::max()) {
+      return;
+    }
+    NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
+
+    auto status = func.nvFBCDestroyHandle(el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el);
+    }
+  });
+
+public:
+  static std::optional<handle_t> make() {
+    NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER };
+    session_t session;
+
+    auto status = func.nvFBCCreateHandle(&session.el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el);
+      session.release();
+
+      return std::nullopt;
+    }
+
+    return handle_t { std::move(session) };
+  }
+
+  const char *last_error() {
+    return func.nvFBCGetLastErrorStr(session.el);
+  }
+
+  std::optional<NVFBC_GET_STATUS_PARAMS> status() {
+    NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER };
+
+    auto status = func.nvFBCGetStatus(session.el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to create session: "sv << last_error();
+
+      return std::nullopt;
+    }
+
+    return params;
+  }
+
+  session_t session;
+};
+
+std::vector<std::string> nvfbc_display_names() {
+  if(init()) {
+    return {};
+  }
+
+  std::vector<std::string> display_names;
+
+  auto status = createInstance(&func);
+  if(status) {
+    BOOST_LOG(error) << "Unable to create NvFBC instance"sv;
+    return {};
+  }
+
+  auto handle = handle_t::make();
+  if(!handle) {
+    return {};
+  }
+
+  auto status_params = handle->status();
+  if(!status_params) {
+    return {};
+  }
+
+  if(!status_params->bIsCapturePossible) {
+    BOOST_LOG(error) << "NVidia driver doesn't support NvFBC screencasting"sv;
+  }
+
+  BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv;
+  BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h;
+
+  return display_names;
+}
+} // namespace platf::nvfbc
\ No newline at end of file
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
new file mode 100644
index 00000000..a2ca6508
--- /dev/null
+++ b/sunshine/platform/linux/cuda.cu
@@ -0,0 +1,248 @@
+// #include <algorithm>
+#include <helper_math.h>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string_view>
+
+#include "cuda.h"
+
+using namespace std::literals;
+
+#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
+#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
+
+#define CU_CHECK(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1
+
+#define CU_CHECK_VOID(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return;
+
+#define CU_CHECK_PTR(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr;
+
+#define CU_CHECK_IGNORE(x, y) \
+  check((x), SUNSHINE_STRINGVIEW(y ": "))
+
+using namespace std::literals;
+
+//////////////////// Special desclarations
+/**
+ * NVCC segfaults when including <chrono>
+ * Therefore, some declarations need to be added explicitely
+ */
+namespace platf {
+struct img_t {
+public:
+  std::uint8_t *data {};
+  std::int32_t width {};
+  std::int32_t height {};
+  std::int32_t pixel_pitch {};
+  std::int32_t row_pitch {};
+
+  virtual ~img_t() = default;
+};
+} // namespace platf
+
+namespace video {
+using __float4 = float[4];
+using __float3 = float[3];
+using __float2 = float[2];
+
+struct __attribute__((__aligned__(16))) color_t {
+  float4 color_vec_y;
+  float4 color_vec_u;
+  float4 color_vec_v;
+  float2 range_y;
+  float2 range_uv;
+};
+
+struct __attribute__((__aligned__(16))) color_extern_t {
+  __float4 color_vec_y;
+  __float4 color_vec_u;
+  __float4 color_vec_v;
+  __float2 range_y;
+  __float2 range_uv;
+};
+
+extern color_extern_t colors[4];
+} // namespace video
+
+//////////////////// End special declarations
+
+namespace cuda {
+auto constexpr INVALID_TEXTURE = std::numeric_limits<cudaTextureObject_t>::max();
+
+template<class T>
+inline T div_align(T l, T r) {
+  return (l + r - 1) / r;
+}
+
+void pass_error(const std::string_view &sv, const char *name, const char *description);
+inline static int check(cudaError_t result, const std::string_view &sv) {
+  if(result) {
+    auto name        = cudaGetErrorName(result);
+    auto description = cudaGetErrorString(result);
+
+    pass_error(sv, name, description);
+    return -1;
+  }
+
+  return 0;
+}
+
+__device__ __constant__ video::color_t color;
+
+
+inline __device__ float3 bgra_to_rgb(uchar4 vec) {
+  return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
+}
+
+inline __device__ float2 calcUV(float3 pixel) {
+  float4 vec_u = color.color_vec_u;
+  float4 vec_v = color.color_vec_v;
+
+  float u = dot(pixel, make_float3(vec_u)) + vec_u.w;
+  float v = dot(pixel, make_float3(vec_v)) + vec_v.w;
+
+  u = u * color.range_uv.x + color.range_uv.y;
+  v = (v * color.range_uv.x + color.range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f;
+
+  return make_float2(u, v);
+}
+
+inline __device__ float calcY(float3 pixel) {
+  float4 vec_y = color.color_vec_y;
+
+  return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color.range_y.x + color.range_y.y;
+}
+
+__global__ void RGBA_to_NV12(
+  cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV,
+  std::uint32_t dstPitchY, std::uint32_t dstPitchUV,
+  std::uint32_t width, std::uint32_t height) {
+
+  int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2;
+  int idY = (threadIdx.y + blockDim.y * blockIdx.y);
+
+  if(idX >= width) return;
+  if(idY >= height) return;
+
+  dstY  = dstY + idX + idY * dstPitchY;
+  dstUV = dstUV + idX + (idY / 2 * dstPitchUV);
+
+  float x = (float)idX / (float)width / 4;
+  float y = (float)idY / (float)height;
+
+  float3 rgb_l = bgra_to_rgb(tex2D<uchar4>(srcImage, x, y));
+  float3 rgb_r = bgra_to_rgb(tex2D<uchar4>(srcImage, x + 0.25f / width, y + 1.0f / height));
+
+  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f);
+
+  dstUV[0] = uv.x;
+  dstUV[1] = uv.y;
+  dstY[0]  = calcY(rgb_l);
+  dstY[1]  = calcY(rgb_r);
+}
+
+sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock)
+    : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock } {
+  auto format = cudaCreateChannelDesc<uchar4>();
+
+  CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
+
+  cudaResourceDesc res {};
+  res.resType         = cudaResourceTypeArray;
+  res.res.array.array = array;
+
+  cudaTextureDesc desc {};
+
+  desc.readMode         = cudaReadModeElementType;
+  desc.filterMode       = cudaFilterModePoint;
+  desc.normalizedCoords = true;
+
+  std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
+
+  CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture");
+}
+
+sws_t::~sws_t() {
+  if(texture != INVALID_TEXTURE) {
+    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
+
+    texture = INVALID_TEXTURE;
+  }
+
+  if(array) {
+    CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array");
+
+    array = cudaArray_t {};
+  }
+}
+
+std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height) {
+  cudaDeviceProp props;
+  int device;
+  CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
+  CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
+
+  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2);
+
+  if(sws->texture == INVALID_TEXTURE) {
+    return nullptr;
+  }
+
+  return sws;
+}
+
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) {
+  int threadsX = width / 2;
+  int threadsY = height;
+
+  dim3 block(threadsPerBlock, threadsPerBlock);
+  dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock));
+
+  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, width, height);
+
+  return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
+}
+
+void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) {
+  color_range = 1;
+  colorspace = 5;
+  video::color_extern_t *color_p;
+  switch(colorspace) {
+  case 5: // SWS_CS_SMPTE170M
+    color_p = &video::colors[0];
+    break;
+  case 1: // SWS_CS_ITU709
+    color_p = &video::colors[2];
+    break;
+  case 9: // SWS_CS_BT2020
+  default:
+    color_p = &video::colors[0];
+  };
+
+  if(color_range > 1) {
+    // Full range
+    ++color_p;
+  }
+
+  auto color_matrix = *(video::color_t*)color_p;
+  color_matrix.color_vec_y.w *= 256.0f;
+  color_matrix.color_vec_u.w *= 256.0f;
+  color_matrix.color_vec_v.w *= 256.0f;
+
+  color_matrix.range_y.y *= 256.0f;
+  color_matrix.range_uv.y *= 256.0f;
+
+  static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch");
+
+  CU_CHECK_IGNORE(cudaMemcpyToSymbol(color, &color_matrix, sizeof(video::color_t)), "Couldn't copy color matrix to cuda");
+}
+
+int sws_t::load_ram(platf::img_t &img) {
+  return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array");
+}
+
+} // namespace cuda
\ No newline at end of file
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index a56c961b..41087506 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,6 +1,8 @@
 #ifndef SUNSHINE_PLATFORM_CUDA_H
 #define SUNSHINE_PLATFORM_CUDA_H
 
+#ifndef __NVCC__
+
 #include "sunshine/platform/common.h"
 #include "x11grab.h"
 
@@ -9,4 +11,48 @@ std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x
 int init();
 } // namespace cuda
 
+#else
+namespace platf {
+class img_t;
+}
+#endif
+
+typedef struct cudaArray *cudaArray_t;
+
+#if !defined(__CUDACC__)
+typedef unsigned long long cudaTextureObject_t;
+#else  /* defined(__CUDACC__) */
+typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
+#endif /* !defined(__CUDACC__) */
+
+namespace cuda {
+class sws_t {
+public:
+  ~sws_t();
+  sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock);
+
+  /**
+   * in_width, out_width -- The width and height of the captured image in bytes
+   * out_width, out_height -- the width and height of the NV12 image in pixels
+   * 
+   * cuda_device -- pointer to the cuda device
+   */
+  static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height);
+
+  // Converts loaded image into a CUDevicePtr
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV);
+
+  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
+
+  int load_ram(platf::img_t &img);
+
+  cudaArray_t array;
+  cudaTextureObject_t texture;
+
+  int width, height;
+
+  int threadsPerBlock;
+};
+} // namespace cuda
+
 #endif
\ No newline at end of file
diff --git a/sunshine/video.cpp b/sunshine/video.cpp
index 7205aa07..3b143cbe 100644
--- a/sunshine/video.cpp
+++ b/sunshine/video.cpp
@@ -324,7 +324,7 @@ struct encoder_t {
 class session_t {
 public:
   session_t() = default;
-  session_t(ctx_t &&ctx, util::wrap_ptr<platf::hwdevice_t> &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {}
+  session_t(ctx_t &&ctx, std::shared_ptr<platf::hwdevice_t> &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {}
 
   session_t(session_t &&other) noexcept = default;
 
@@ -342,7 +342,7 @@ public:
   }
 
   ctx_t ctx;
-  util::wrap_ptr<platf::hwdevice_t> device;
+  std::shared_ptr<platf::hwdevice_t> device;
 
   std::vector<packet_raw_t::replace_t> replacements;
 
@@ -369,7 +369,6 @@ struct sync_session_t {
   sync_session_ctx_t *ctx;
 
   platf::img_t *img_tmp;
-  std::shared_ptr<platf::hwdevice_t> hwdevice;
   session_t session;
 };
 
@@ -779,7 +778,7 @@ int encode(int64_t frame_nr, session_t &session, frame_t::pointer frame, safe::m
   return 0;
 }
 
-std::optional<session_t> make_session(const encoder_t &encoder, const config_t &config, int width, int height, platf::hwdevice_t *hwdevice) {
+std::optional<session_t> make_session(const encoder_t &encoder, const config_t &config, int width, int height, std::shared_ptr<platf::hwdevice_t> &&hwdevice) {
   bool hardware = encoder.dev_type != AV_HWDEVICE_TYPE_NONE;
 
   auto &video_format = config.videoFormat == 0 ? encoder.h264 : encoder.hevc;
@@ -886,7 +885,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
   if(hardware) {
     ctx->pix_fmt = encoder.dev_pix_fmt;
 
-    auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice);
+    auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice.get());
     if(buf_or_error.has_right()) {
       return std::nullopt;
     }
@@ -965,7 +964,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
     frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
   }
 
-  util::wrap_ptr<platf::hwdevice_t> device;
+  std::shared_ptr<platf::hwdevice_t> device;
 
   if(!hwdevice->data) {
     auto device_tmp = std::make_unique<swdevice_t>();
@@ -977,7 +976,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
     device = std::move(device_tmp);
   }
   else {
-    device = hwdevice;
+    device = std::move(hwdevice);
   }
 
   if(device->set_frame(frame.release())) {
@@ -1009,12 +1008,12 @@ void encode_run(
   img_event_t images,
   config_t config,
   int width, int height,
-  platf::hwdevice_t *hwdevice,
+  std::shared_ptr<platf::hwdevice_t> &&hwdevice,
   safe::signal_t &reinit_event,
   const encoder_t &encoder,
   void *channel_data) {
 
-  auto session = make_session(encoder, config, width, height, hwdevice);
+  auto session = make_session(encoder, config, width, height, std::move(hwdevice));
   if(!session) {
     return;
   }
@@ -1101,12 +1100,11 @@ std::optional<sync_session_t> make_synced_session(platf::display_t *disp, const
   // absolute mouse coordinates require that the dimensions of the screen are known
   ctx.touch_port_events->raise(make_port(disp, ctx.config));
 
-  auto session = make_session(encoder, ctx.config, img.width, img.height, hwdevice.get());
+  auto session = make_session(encoder, ctx.config, img.width, img.height, std::move(hwdevice));
   if(!session) {
     return std::nullopt;
   }
 
-  encode_session.hwdevice = std::move(hwdevice);
   encode_session.session  = std::move(*session);
 
   return std::move(encode_session);
@@ -1208,7 +1206,7 @@ encode_e encode_run_sync(
           ctx->idr_events->pop();
         }
 
-        if(pos->hwdevice->convert(*img)) {
+        if(pos->session.device->convert(*img)) {
           BOOST_LOG(error) << "Could not convert image"sv;
           ctx->shutdown_event->raise(true);
 
@@ -1356,7 +1354,7 @@ void capture_async(
       frame_nr,
       mail, images,
       config, display->width, display->height,
-      hwdevice.get(),
+      std::move(hwdevice),
       ref->reinit_event, *ref->encoder_p,
       channel_data);
   }
@@ -1409,7 +1407,7 @@ int validate_config(std::shared_ptr<platf::display_t> &disp, const encoder_t &en
     return -1;
   }
 
-  auto session = make_session(encoder, config, disp->width, disp->height, hwdevice.get());
+  auto session = make_session(encoder, config, disp->width, disp->height, std::move(hwdevice));
   if(!session) {
     return -1;
   }
diff --git a/third-party/nvfbc/NvFBC.h b/third-party/nvfbc/NvFBC.h
new file mode 100644
index 00000000..8990eeab
--- /dev/null
+++ b/third-party/nvfbc/NvFBC.h
@@ -0,0 +1,2006 @@
+/*!
+ * \file
+ *
+ * This file contains the interface constants, structure definitions and
+ * function prototypes defining the NvFBC API for Linux.
+ *
+ * Copyright (c) 2013-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _NVFBC_H_
+#define _NVFBC_H_
+
+#include <stdint.h>
+
+/*!
+ * \mainpage NVIDIA Framebuffer Capture (NvFBC) for Linux.
+ *
+ * NvFBC is a high performance, low latency API to capture the framebuffer of
+ * an X server screen.
+ *
+ * The output from NvFBC captures everything that would be visible if we were
+ * directly looking at the monitor.  This includes window manager decoration,
+ * mouse cursor, overlay, etc.
+ *
+ * It is ideally suited to desktop or fullscreen application capture and
+ * remoting.
+ */
+
+/*!
+ * \defgroup FBC_REQ Requirements
+ *
+ * The following requirements are provided by the regular NVIDIA Display Driver
+ * package:
+ *
+ * - OpenGL core >= 4.2:
+ *   Required.  NvFBC relies on OpenGL to perform frame capture and
+ *   post-processing.
+ *
+ * - Vulkan 1.1:
+ *   Required.
+ *
+ * - libcuda.so.1 >= 5.5:
+ *   Optional. Used for capture to video memory with CUDA interop.
+ *
+ * The following requirements must be installed separately depending on the
+ * Linux distribution being used:
+ *
+ * - XRandR extension >= 1.2:
+ *   Optional.  Used for RandR output tracking.
+ *
+ * - libX11-xcb.so.1 >= 1.2:
+ *   Required.  NvFBC uses a mix of Xlib and XCB.  Xlib is needed to use GLX,
+ *   XCB is needed to make NvFBC more resilient against X server terminations
+ *   while a capture session is active.
+ *
+ * - libxcb.so.1 >= 1.3:
+ *   Required.  See above.
+ *
+ * - xorg-server >= 1.3:
+ *   Optional.  Required for push model to work properly.
+ *
+ * Note that all optional dependencies are dlopen()'d at runtime.  Failure to
+ * load an optional library is not fatal.
+ */
+
+/*!
+ * \defgroup FBC_CHANGES ChangeLog
+ *
+ * NvFBC Linux API version 0.1
+ * - Initial BETA release.
+ *
+ * NvFBC Linux API version 0.2
+ * - Added 'bEnableMSE' field to NVFBC_H264_HW_ENC_CONFIG.
+ * - Added 'dwMSE' field to NVFBC_TOH264_GRAB_FRAME_PARAMS.
+ * - Added 'bEnableAQ' field to NVFBC_H264_HW_ENC_CONFIG.
+ * - Added 'NVFBC_H264_PRESET_LOSSLESS_HP' enum to NVFBC_H264_PRESET.
+ * - Added 'NVFBC_BUFFER_FORMAT_YUV444P' enum to NVFBC_BUFFER_FORMAT.
+ * - Added 'eInputBufferFormat' field to NVFBC_H264_HW_ENC_CONFIG.
+ * - Added '0' and '244' values for NVFBC_H264_HW_ENC_CONFIG::dwProfile.
+ *
+ * NvFBC Linux API version 0.3
+ * - Improved multi-threaded support by implementing an API locking mechanism.
+ * - Added 'nvFBCBindContext' API entry point.
+ * - Added 'nvFBCReleaseContext' API entry point.
+ *
+ * NvFBC Linux API version 1.0
+ * - Added codec agnostic interface for HW encoding.
+ * - Deprecated H.264 interface.
+ * - Added support for H.265/HEVC HW encoding.
+ *
+ * NvFBC Linux API version 1.1
+ * - Added 'nvFBCToHwGetCaps' API entry point.
+ * - Added 'dwDiffMapScalingFactor' field to NVFBC_TOSYS_SETUP_PARAMS.
+ *
+ * NvFBC Linux API version 1.2
+ * - Deprecated ToHwEnc interface.
+ * - Added ToGL interface that captures frames to an OpenGL texture in video
+ *   memory.
+ * - Added 'bDisableAutoModesetRecovery' field to
+ *   NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ * - Added 'bExternallyManagedContext' field to NVFBC_CREATE_HANDLE_PARAMS.
+ *
+ * NvFBC Linux API version 1.3
+ * - Added NVFBC_BUFFER_FORMAT_RGBA
+ * - Added 'dwTimeoutMs' field to NVFBC_TOSYS_GRAB_FRAME_PARAMS,
+ *   NVFBC_TOCUDA_GRAB_FRAME_PARAMS, and NVFBC_TOGL_GRAB_FRAME_PARAMS.
+ *
+ * NvFBC Linux API version 1.4
+ * - Clarified that NVFBC_BUFFER_FORMAT_{ARGB,RGB,RGBA} are byte-order formats.
+ * - Renamed NVFBC_BUFFER_FORMAT_YUV420P to NVFBC_BUFFER_FORMAT_NV12.
+ * - Added new requirements.
+ * - Made NvFBC more resilient against the X server terminating during an active
+ *   capture session.  See new comments for ::NVFBC_ERR_X.
+ * - Relaxed requirement that 'frameSize' must have a width being a multiple of
+ *   4 and a height being a multiple of 2.
+ * - Added 'bRoundFrameSize' field to NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ * - Relaxed requirement that the scaling factor for differential maps must be
+ *   a multiple of the size of the frame.
+ * - Added 'diffMapSize' field to NVFBC_TOSYS_SETUP_PARAMS and
+ *   NVFBC_TOGL_SETUP_PARAMS.
+ *
+ * NvFBC Linux API version 1.5
+ * - Added NVFBC_BUFFER_FORMAT_BGRA
+ *
+ * NvFBC Linux API version 1.6
+ * - Added the 'NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY',
+ *   'NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY', and
+ *   'NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY' capture flags.
+ * - Exposed debug and performance logs through the NVFBC_LOG_LEVEL environment
+ *   variable.  Setting it to "1" enables performance logs, setting it to "2"
+ *   enables debugging logs, setting it to "3" enables both.
+ * - Logs are printed to stdout or to the file pointed by the NVFBC_LOG_FILE
+ *   environment variable.
+ * - Added 'ulTimestampUs' to NVFBC_FRAME_GRAB_INFO.
+ * - Added 'dwSamplingRateMs' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ * - Added 'bPushModel' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ *
+ * NvFBC Linux API version 1.7
+ * - Retired the NVFBC_CAPTURE_TO_HW_ENCODER interface.
+ *   This interface has been deprecated since NvFBC 1.2 and has received no
+ *   updates or new features since. We recommend using the NVIDIA Video Codec
+ *   SDK to encode NvFBC frames.
+ *   See: https://developer.nvidia.com/nvidia-video-codec-sdk
+ * - Added a 'Capture Modes' section to those headers.
+ * - Added a 'Post Processing' section to those headers.
+ * - Added an 'Environment Variables' section to those headers.
+ * - Added 'bInModeset' to NVFBC_GET_STATUS_PARAMS.
+ * - Added 'bAllowDirectCapture' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ * - Added 'bDirectCaptured' to NVFBC_FRAME_GRAB_INFO.
+ * - Added 'bRequiredPostProcessing' to NVFBC_FRAME_GRAB_INFO.
+ */
+
+/*!
+ * \defgroup FBC_MODES Capture Modes
+ *
+ * When creating a capture session, NvFBC instantiates a capture subsystem
+ * living in the NVIDIA X driver.
+ *
+ * This subsystem listens for damage events coming from applications then
+ * generates (composites) frames for NvFBC when new content is available.
+ *
+ * This capture server can operate on a timer where it periodically checks if
+ * there are any pending damage events, or it can generate frames as soon as it
+ * receives a new damage event.
+ * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::dwSamplingRateMs,
+ * and NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bPushModel.
+ *
+ * NvFBC can also attach itself to a fullscreen unoccluded application and have
+ * it copy its frames directly into a buffer owned by NvFBC upon present. This
+ * mode bypasses the X server.
+ * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture.
+ *
+ * NvFBC is designed to capture frames with as few copies as possible. The
+ * NVIDIA X driver composites frames directly into the NvFBC buffers, and
+ * direct capture copies frames directly into these buffers as well.
+ *
+ * Depending on the configuration of a capture session, an extra copy (rendering
+ * pass) may be needed. See the 'Post Processing' section.
+ */
+
+/*!
+ * \defgroup FBC_PP Post Processing
+ *
+ * Depending on the configuration of a capture session, NvFBC might require to
+ * do post processing on frames.
+ *
+ * Post processing is required for the following reasons:
+ * - NvFBC needs to do a pixel format conversion.
+ * - Diffmaps are requested.
+ * - Capture to system memory is requested.
+ *
+ * NvFBC needs to do a conversion if the requested pixel format does not match
+ * the native format. The native format is NVFBC_BUFFER_FORMAT_BGRA.
+ *
+ * Note: post processing is *not* required for frame scaling and frame cropping.
+ *
+ * Skipping post processing can reduce capture latency. An application can know
+ * whether post processing was required by checking
+ * NVFBC_FRAME_GRAB_INFO::bRequiredPostProcessing.
+ */
+
+/*!
+ * \defgroup FBC_ENVVAR Environment Variables
+ *
+ * Below are the environment variables supported by NvFBC:
+ *
+ * - NVFBC_LOG_LEVEL
+ *   Bitfield where the first bit enables debug logs and the second bit enables
+ *   performance logs. Both can be enabled by setting this envvar to 3.
+ *
+ * - NVFBC_LOG_FILE
+ *   Write all NvFBC logs to the given file.
+ *
+ * - NVFBC_FORCE_ALLOW_DIRECT_CAPTURE
+ *   Used to override NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture.
+ *
+ * - NVFBC_FORCE_POST_PROCESSING
+ *   Used to force the post processing step, even if it could be skipped.
+ *   See the 'Post Processing' section.
+ */
+
+/*!
+ * \defgroup FBC_STRUCT Structure Definition
+ *
+ * @{
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * Calling convention.
+ */
+#define NVFBCAPI
+
+/*!
+ * NvFBC API major version.
+ */
+#define NVFBC_VERSION_MAJOR 1
+
+/*!
+ * NvFBC API minor version.
+ */
+#define NVFBC_VERSION_MINOR 7
+
+/*!
+ * NvFBC API version.
+ */
+#define NVFBC_VERSION (uint32_t) (NVFBC_VERSION_MINOR | (NVFBC_VERSION_MAJOR << 8))
+
+/*!
+ * Creates a version number for structure parameters.
+ */
+#define NVFBC_STRUCT_VERSION(typeName, ver) \
+    (uint32_t) (sizeof(typeName) | ((ver) << 16) | (NVFBC_VERSION << 24))
+
+/*!
+ * Defines error codes.
+ *
+ * \see NvFBCGetLastErrorStr
+ */
+typedef enum _NVFBCSTATUS
+{
+    /*!
+     * This indicates that the API call returned with no errors.
+     */
+    NVFBC_SUCCESS             = 0,
+    /*!
+     * This indicates that the API version between the client and the library
+     * is not compatible.
+     */
+    NVFBC_ERR_API_VERSION     = 1,
+    /*!
+     * An internal error occurred.
+     */
+    NVFBC_ERR_INTERNAL        = 2,
+    /*!
+     * This indicates that one or more of the parameter passed to the API call
+     * is invalid.
+     */
+    NVFBC_ERR_INVALID_PARAM   = 3,
+    /*!
+     * This indicates that one or more of the pointers passed to the API call
+     * is invalid.
+     */
+    NVFBC_ERR_INVALID_PTR     = 4,
+    /*!
+     * This indicates that the handle passed to the API call to identify the
+     * client is invalid.
+     */
+    NVFBC_ERR_INVALID_HANDLE  = 5,
+    /*!
+     * This indicates that the maximum number of threaded clients of the same
+     * process has been reached.  The limit is 10 threads per process.
+     * There is no limit on the number of process.
+     */
+    NVFBC_ERR_MAX_CLIENTS     = 6,
+    /*!
+     * This indicates that the requested feature is not currently supported
+     * by the library.
+     */
+    NVFBC_ERR_UNSUPPORTED     = 7,
+    /*!
+     * This indicates that the API call failed because it was unable to allocate
+     * enough memory to perform the requested operation.
+     */
+    NVFBC_ERR_OUT_OF_MEMORY   = 8,
+    /*!
+     * This indicates that the API call was not expected.  This happens when
+     * API calls are performed in a wrong order, such as trying to capture
+     * a frame prior to creating a new capture session; or trying to set up
+     * a capture to video memory although a capture session to system memory
+     * was created.
+     */
+    NVFBC_ERR_BAD_REQUEST     = 9,
+    /*!
+     * This indicates an X error, most likely meaning that the X server has
+     * been terminated.  When this error is returned, the only resort is to
+     * create another FBC handle using NvFBCCreateHandle().
+     *
+     * The previous handle should still be freed with NvFBCDestroyHandle(), but
+     * it might leak resources, in particular X, GLX, and GL resources since
+     * it is no longer possible to communicate with an X server to free them
+     * through the driver.
+     *
+     * The best course of action to eliminate this potential leak is to close
+     * the OpenGL driver, close the forked process running the capture, or
+     * restart the application.
+     */
+    NVFBC_ERR_X               = 10,
+    /*!
+     * This indicates a GLX error.
+     */
+    NVFBC_ERR_GLX             = 11,
+    /*!
+     * This indicates an OpenGL error.
+     */
+    NVFBC_ERR_GL              = 12,
+    /*!
+     * This indicates a CUDA error.
+     */
+    NVFBC_ERR_CUDA            = 13,
+    /*!
+     * This indicates a HW encoder error.
+     */
+    NVFBC_ERR_ENCODER         = 14,
+    /*!
+     * This indicates an NvFBC context error.
+     */
+    NVFBC_ERR_CONTEXT         = 15,
+    /*!
+     * This indicates that the application must recreate the capture session.
+     *
+     * This error can be returned if a modeset event occurred while capturing
+     * frames, and NVFBC_CREATE_HANDLE_PARAMS::bDisableAutoModesetRecovery
+     * was set to NVFBC_TRUE.
+     */
+    NVFBC_ERR_MUST_RECREATE   = 16,
+    /*!
+     * This indicates a Vulkan error.
+     */
+    NVFBC_ERR_VULKAN          = 17,
+} NVFBCSTATUS;
+
+/*!
+ * Defines boolean values.
+ */
+typedef enum _NVFBC_BOOL
+{
+    /*!
+     * False value.
+     */
+    NVFBC_FALSE = 0,
+    /*!
+     * True value.
+     */
+    NVFBC_TRUE,
+} NVFBC_BOOL;
+
+/*!
+ * Maximum size in bytes of an error string.
+ */
+#define NVFBC_ERR_STR_LEN 512
+
+/*!
+ * Capture type.
+ */
+typedef enum _NVFBC_CAPTURE_TYPE
+{
+    /*!
+     * Capture frames to a buffer in system memory.
+     */
+    NVFBC_CAPTURE_TO_SYS = 0,
+    /*!
+     * Capture frames to a CUDA device in video memory.
+     *
+     * Specifying this will dlopen() libcuda.so.1 and fail if not available.
+     */
+    NVFBC_CAPTURE_SHARED_CUDA,
+    /*!
+     * Retired. Do not use.
+     */
+    /* NVFBC_CAPTURE_TO_HW_ENCODER, */
+    /*!
+     * Capture frames to an OpenGL buffer in video memory.
+     */
+    NVFBC_CAPTURE_TO_GL = 3,
+} NVFBC_CAPTURE_TYPE;
+
+/*!
+ * Tracking type.
+ *
+ * NvFBC can track a specific region of the framebuffer to capture.
+ *
+ * An X screen corresponds to the entire framebuffer.
+ *
+ * An RandR CRTC is a component of the GPU that reads pixels from a region of
+ * the X screen and sends them through a pipeline to an RandR output.
+ * A physical monitor can be connected to an RandR output.  Tracking an RandR
+ * output captures the region of the X screen that the RandR CRTC is sending to
+ * the RandR output.
+ */
+typedef enum
+{
+    /*!
+     * By default, NvFBC tries to track a connected primary output.  If none is
+     * found, then it tries to track the first connected output.  If none is
+     * found then it tracks the entire X screen.
+     *
+     * If the XRandR extension is not available, this option has the same effect
+     * as ::NVFBC_TRACKING_SCREEN.
+     *
+     * This default behavior might be subject to changes in the future.
+     */
+    NVFBC_TRACKING_DEFAULT = 0,
+    /*!
+     * Track an RandR output specified by its ID in the appropriate field.
+     *
+     * The list of connected outputs can be queried via NvFBCGetStatus().
+     * This list can also be obtained using e.g., xrandr(1).
+     *
+     * If the XRandR extension is not available, setting this option returns an
+     * error.
+     */
+    NVFBC_TRACKING_OUTPUT,
+    /*!
+     * Track the entire X screen.
+     */
+    NVFBC_TRACKING_SCREEN,
+} NVFBC_TRACKING_TYPE;
+
+/*!
+ * Buffer format.
+ */
+typedef enum _NVFBC_BUFFER_FORMAT
+{
+    /*!
+     * Data will be converted to ARGB8888 byte-order format. 32 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_ARGB = 0,
+    /*!
+     * Data will be converted to RGB888 byte-order format. 24 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_RGB,
+    /*!
+     * Data will be converted to NV12 format using HDTV weights
+     * according to ITU-R BT.709.  12 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_NV12,
+    /*!
+     * Data will be converted to YUV 444 planar format using HDTV weights
+     * according to ITU-R BT.709.  24 bpp
+     */
+    NVFBC_BUFFER_FORMAT_YUV444P,
+    /*!
+     * Data will be converted to RGBA8888 byte-order format. 32 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_RGBA,
+    /*!
+     * Native format. No pixel conversion needed.
+     * BGRA8888 byte-order format. 32 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_BGRA,
+} NVFBC_BUFFER_FORMAT;
+
+#define NVFBC_BUFFER_FORMAT_YUV420P NVFBC_BUFFER_FORMAT_NV12
+
+/*!
+ * Handle used to identify an NvFBC session.
+ */
+typedef uint64_t NVFBC_SESSION_HANDLE;
+
+/*!
+ * Box used to describe an area of the tracked region to capture.
+ *
+ * The coordinates are relative to the tracked region.
+ *
+ * E.g., if the size of the X screen is 3520x1200 and the tracked RandR output
+ * scans a region of 1600x1200+1920+0, then setting a capture box of
+ * 800x600+100+50 effectively captures a region of 800x600+2020+50 relative to
+ * the X screen.
+ */
+typedef struct _NVFBC_BOX
+{
+    /*!
+     * [in] X offset of the box.
+     */
+    uint32_t x;
+    /*!
+     * [in] Y offset of the box.
+     */
+    uint32_t y;
+    /*!
+     * [in] Width of the box.
+     */
+    uint32_t w;
+    /*!
+     * [in] Height of the box.
+     */
+    uint32_t h;
+} NVFBC_BOX;
+
+/*!
+ * Size used to describe the size of a frame.
+ */
+typedef struct _NVFBC_SIZE
+{
+    /*!
+     * [in] Width.
+     */
+    uint32_t w;
+    /*!
+     * [in] Height.
+     */
+    uint32_t h;
+} NVFBC_SIZE;
+
+/*!
+ * Describes information about a captured frame.
+ */
+typedef struct _NVFBC_FRAME_GRAB_INFO
+{
+    /*!
+     * [out] Width of the captured frame.
+     */
+    uint32_t dwWidth;
+    /*!
+     * [out] Height of the captured frame.
+     */
+    uint32_t dwHeight;
+    /*!
+     * [out] Size of the frame in bytes.
+     */
+    uint32_t dwByteSize;
+    /*!
+     * [out] Incremental ID of the current frame.
+     *
+     * This can be used to identify a frame.
+     */
+    uint32_t dwCurrentFrame;
+    /*!
+     * [out] Whether the captured frame is a new frame.
+     *
+     * When using non blocking calls it is possible to capture a frame
+     * that was already captured before if the display server did not
+     * render a new frame in the meantime.  In that case, this flag
+     * will be set to NVFBC_FALSE.
+     *
+     * When using blocking calls each captured frame will have
+     * this flag set to NVFBC_TRUE since the blocking mechanism waits for
+     * the display server to render a new frame.
+     *
+     * Note that this flag does not guarantee that the content of
+     * the frame will be different compared to the previous captured frame.
+     *
+     * In particular, some compositing managers report the entire
+     * framebuffer as damaged when an application refreshes its content.
+     *
+     * Consider a single X screen spanned across physical displays A and B
+     * and an NvFBC application tracking display A.  Depending on the
+     * compositing manager, it is possible that an application refreshing
+     * itself on display B will trigger a frame capture on display A.
+     *
+     * Workarounds include:
+     * - Using separate X screens
+     * - Disabling the composite extension
+     * - Using a compositing manager that properly reports what regions
+     *   are damaged
+     * - Using NvFBC's diffmaps to find out if the frame changed
+     */
+    NVFBC_BOOL bIsNewFrame;
+    /*!
+     * [out] Frame timestamp
+     *
+     * Time in micro seconds when the display server started rendering the
+     * frame.
+     *
+     * This does not account for when the frame was captured.  If capturing an
+     * old frame (e.g., bIsNewFrame is NVFBC_FALSE) the reported timestamp
+     * will reflect the time when the old frame was rendered by the display
+     * server.
+     */
+    uint64_t ulTimestampUs;
+    /*
+     * [out] Number of frames generated since the last capture.
+     *
+     * This can help applications tell whether they missed frames or there
+     * were no frames generated by the server since the last capture.
+     */
+    uint32_t dwMissedFrames;
+    /*
+     * [out] Whether the captured frame required post processing.
+     *
+     * See the 'Post Processing' section.
+     */
+    NVFBC_BOOL bRequiredPostProcessing;
+    /*
+     * [out] Whether this frame was obtained via direct capture.
+     *
+     * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture.
+     */
+    NVFBC_BOOL bDirectCapture;
+} NVFBC_FRAME_GRAB_INFO;
+
+/*!
+ * Defines parameters for the CreateHandle() API call.
+ */
+typedef struct _NVFBC_CREATE_HANDLE_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_CREATE_HANDLE_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Application specific private information passed to the NvFBC
+     * session.
+     */
+    const void *privateData;
+    /*!
+     * [in] Size of the application specific private information passed to the
+     * NvFBC session.
+     */
+    uint32_t privateDataSize;
+    /*!
+     * [in] Whether NvFBC should not create and manage its own graphics context
+     *
+     * NvFBC internally uses OpenGL to perfom graphics operations on the
+     * captured frames.  By default, NvFBC will create and manage (e.g., make
+     * current, detect new threads, etc.) its own OpenGL context.
+     *
+     * If set to NVFBC_TRUE, NvFBC will use the application's context.  It will
+     * be the application's responsibility to make sure that a context is
+     * current on the thread calling into the NvFBC API.
+     */
+    NVFBC_BOOL bExternallyManagedContext;
+    /*!
+     * [in] GLX context
+     *
+     * GLX context that NvFBC should use internally to create pixmaps and
+     * make them current when creating a new capture session.
+     *
+     * Note: NvFBC expects a context created against a GLX_RGBA_TYPE render
+     * type.
+     */
+    void *glxCtx;
+    /*!
+     * [in] GLX framebuffer configuration
+     *
+     * Framebuffer configuration that was used to create the GLX context, and
+     * that will be used to create pixmaps internally.
+     *
+     * Note: NvFBC expects a configuration having at least the following
+     * attributes:
+     *  GLX_DRAWABLE_TYPE, GLX_PIXMAP_BIT
+     *  GLX_BIND_TO_TEXTURE_RGBA_EXT, 1
+     *  GLX_BIND_TO_TEXTURE_TARGETS_EXT, GLX_TEXTURE_2D_BIT_EXT
+     */
+    void *glxFBConfig;
+} NVFBC_CREATE_HANDLE_PARAMS;
+
+/*!
+ * NVFBC_CREATE_HANDLE_PARAMS structure version.
+ */
+#define NVFBC_CREATE_HANDLE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CREATE_HANDLE_PARAMS, 2)
+
+/*!
+ * Defines parameters for the ::NvFBCDestroyHandle() API call.
+ */
+typedef struct _NVFBC_DESTROY_HANDLE_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_DESTROY_HANDLE_PARAMS_VER
+     */
+    uint32_t dwVersion;
+} NVFBC_DESTROY_HANDLE_PARAMS;
+
+/*!
+ * NVFBC_DESTROY_HANDLE_PARAMS structure version.
+ */
+#define NVFBC_DESTROY_HANDLE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_DESTROY_HANDLE_PARAMS, 1)
+
+/*!
+ * Maximum number of connected RandR outputs to an X screen.
+ */
+#define NVFBC_OUTPUT_MAX 5
+
+/*!
+ * Maximum size in bytes of an RandR output name.
+ */
+#define NVFBC_OUTPUT_NAME_LEN 128
+
+/*!
+ * Describes an RandR output.
+ *
+ * Filling this structure relies on the XRandR extension.  This feature cannot
+ * be used if the extension is missing or its version is below the requirements.
+ *
+ * \see Requirements
+ */
+typedef struct _NVFBC_OUTPUT
+{
+    /*!
+     * Identifier of the RandR output.
+     */
+    uint32_t dwId;
+    /*!
+     * Name of the RandR output, as reported by tools such as xrandr(1).
+     *
+     * Example: "DVI-I-0"
+     */
+    char name[NVFBC_OUTPUT_NAME_LEN];
+    /*!
+     * Region of the X screen tracked by the RandR CRTC driving this RandR
+     * output.
+     */
+    NVFBC_BOX trackedBox;
+} NVFBC_RANDR_OUTPUT_INFO;
+
+/*!
+ * Defines parameters for the ::NvFBCGetStatus() API call.
+ */
+typedef struct _NVFBC_GET_STATUS_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_GET_STATUS_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [out] Whether or not framebuffer capture is supported by the graphics
+     * driver.
+     */
+    NVFBC_BOOL bIsCapturePossible;
+    /*!
+     * [out] Whether or not there is already a capture session on this system.
+     */
+    NVFBC_BOOL bCurrentlyCapturing;
+    /*!
+     * [out] Whether or not it is possible to create a capture session on this
+     * system.
+     */
+    NVFBC_BOOL bCanCreateNow;
+    /*!
+     * [out] Size of the X screen (framebuffer).
+     */
+    NVFBC_SIZE screenSize;
+    /*!
+     * [out] Whether the XRandR extension is available.
+     *
+     * If this extension is not available then it is not possible to have
+     * information about RandR outputs.
+     */
+    NVFBC_BOOL bXRandRAvailable;
+    /*!
+     * [out] Array of outputs connected to the X screen.
+     *
+     * An application can track a specific output by specifying its ID when
+     * creating a capture session.
+     *
+     * Only if XRandR is available.
+     */
+    NVFBC_RANDR_OUTPUT_INFO outputs[NVFBC_OUTPUT_MAX];
+    /*!
+     * [out] Number of outputs connected to the X screen.
+     *
+     * This must be used to parse the array of connected outputs.
+     *
+     * Only if XRandR is available.
+     */
+    uint32_t dwOutputNum;
+    /*!
+     * [out] Version of the NvFBC library running on this system.
+     */
+    uint32_t dwNvFBCVersion;
+    /*!
+     * [out] Whether the X server is currently in modeset.
+     *
+     * When the X server is in modeset, it must give up all its video
+     * memory allocations. It is not possible to create a capture
+     * session until the modeset is over.
+     *
+     * Note that VT-switches are considered modesets.
+     */
+    NVFBC_BOOL bInModeset;
+} NVFBC_GET_STATUS_PARAMS;
+
+/*!
+ * NVFBC_GET_STATUS_PARAMS structure version.
+ */
+#define NVFBC_GET_STATUS_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_GET_STATUS_PARAMS, 2)
+
+/*!
+ * Defines parameters for the ::NvFBCCreateCaptureSession() API call.
+ */
+typedef struct _NVFBC_CREATE_CAPTURE_SESSION_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Desired capture type.
+     *
+     * Note that when specyfing ::NVFBC_CAPTURE_SHARED_CUDA NvFBC will try to
+     * dlopen() the corresponding libraries.  This means that NvFBC can run on
+     * a system without the CUDA library since it does not link against them.
+     */
+    NVFBC_CAPTURE_TYPE eCaptureType;
+    /*!
+     * [in] What region of the framebuffer should be tracked.
+     */
+    NVFBC_TRACKING_TYPE eTrackingType;
+    /*!
+     * [in] ID of the output to track if eTrackingType is set to
+     * ::NVFBC_TRACKING_OUTPUT.
+     */
+    uint32_t dwOutputId;
+    /*!
+     * [in] Crop the tracked region.
+     *
+     * The coordinates are relative to the tracked region.
+     *
+     * It can be set to 0 to capture the entire tracked region.
+     */
+    NVFBC_BOX captureBox;
+    /*!
+     * [in] Desired size of the captured frame.
+     *
+     * This parameter allow to scale the captured frame.
+     *
+     * It can be set to 0 to disable frame resizing.
+     */
+    NVFBC_SIZE frameSize;
+    /*!
+     * [in] Whether the mouse cursor should be composited to the frame.
+     *
+     * Disabling the cursor will not generate new frames when only the cursor
+     * is moved.
+     */
+    NVFBC_BOOL bWithCursor;
+    /*!
+     * [in] Whether NvFBC should not attempt to recover from modesets.
+     *
+     * NvFBC is able to detect when a modeset event occured and can automatically
+     * re-create a capture session with the same settings as before, then resume
+     * its frame capture session transparently.
+     *
+     * This option allows to disable this behavior.  NVFBC_ERR_MUST_RECREATE
+     * will be returned in that case.
+     *
+     * It can be useful in the cases when an application needs to do some work
+     * between setting up a capture and grabbing the first frame.
+     *
+     * For example: an application using the ToGL interface needs to register
+     * resources with EncodeAPI prior to encoding frames.
+     *
+     * Note that during modeset recovery, NvFBC will try to re-create the
+     * capture session every second until it succeeds.
+     */
+    NVFBC_BOOL bDisableAutoModesetRecovery;
+    /*!
+     * [in] Whether NvFBC should round the requested frameSize.
+     *
+     * When disabled, NvFBC will not attempt to round the requested resolution.
+     *
+     * However, some pixel formats have resolution requirements.  E.g., YUV/NV
+     * formats must have a width being a multiple of 4, and a height being a
+     * multiple of 2.  RGB formats don't have such requirements.
+     *
+     * If the resolution doesn't meet the requirements of the format, then NvFBC
+     * will fail at setup time.
+     *
+     * When enabled, NvFBC will round the requested width to the next multiple
+     * of 4 and the requested height to the next multiple of 2.
+     *
+     * In this case, requesting any resolution will always work with every
+     * format.  However, an NvFBC client must be prepared to handle the case
+     * where the requested resolution is different than the captured resolution.
+     *
+     * NVFBC_FRAME_GRAB_INFO::dwWidth and NVFBC_FRAME_GRAB_INFO::dwHeight should
+     * always be used for getting information about captured frames.
+     */
+    NVFBC_BOOL bRoundFrameSize;
+    /*!
+     * [in] Rate in ms at which the display server generates new frames
+     *
+     * This controls the frequency at which the display server will generate
+     * new frames if new content is available.  This effectively controls the
+     * capture rate when using blocking calls.
+     *
+     * Note that lower values will increase the CPU and GPU loads.
+     *
+     * The default value is 16ms (~ 60 Hz).
+     */
+    uint32_t dwSamplingRateMs;
+    /*!
+     * [in] Enable push model for frame capture
+     *
+     * When set to NVFBC_TRUE, the display server will generate frames whenever
+     * it receives a damage event from applications.
+     *
+     * Setting this to NVFBC_TRUE will ignore ::dwSamplingRateMs.
+     *
+     * Using push model with the NVFBC_*_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY
+     * capture flag should guarantee the shortest amount of time between an
+     * application rendering a frame and an NvFBC client capturing it, provided
+     * that the NvFBC client is able to process the frames quickly enough.
+     *
+     * Note that applications running at high frame rates will increase CPU and
+     * GPU loads.
+     */
+    NVFBC_BOOL bPushModel;
+    /*!
+     * [in] Allow direct capture
+     *
+     * Direct capture allows NvFBC to attach itself to a fullscreen graphics
+     * application. Whenever that application presents a frame, it makes a copy
+     * of it directly into a buffer owned by NvFBC thus bypassing the X server.
+     *
+     * When direct capture is *not* enabled, the NVIDIA X driver generates a
+     * frame for NvFBC when it receives a damage event from an application if push
+     * model is enabled, or periodically checks if there are any pending damage
+     * events otherwise (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::dwSamplingRateMs).
+     *
+     * Direct capture is possible under the following conditions:
+     * - Direct capture is allowed
+     * - Push model is enabled (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bPushModel)
+     * - The mouse cursor is not composited (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bWithCursor)
+     * - No viewport transformation is required. This happens when the remote
+     *   desktop is e.g. rotated.
+     *
+     * When direct capture is possible, NvFBC will automatically attach itself
+     * to a fullscreen unoccluded application, if such exists.
+     *
+     * Notes:
+     * - This includes compositing desktops such as GNOME (e.g., gnome-shell
+     *   is the fullscreen unoccluded application).
+     * - There can be only one fullscreen unoccluded application at a time.
+     * - The NVIDIA X driver monitors which application qualifies or no
+     *   longer qualifies.
+     *
+     * For example, if a fullscreen application is launched in GNOME, NvFBC will
+     * detach from gnome-shell and attach to that application.
+     *
+     * Attaching and detaching happens automatically from the perspective of an
+     * NvFBC client. When detaching from an application, the X driver will
+     * transparently resume generating frames for NvFBC.
+     *
+     * An application can know whether a given frame was obtained through
+     * direct capture by checking NVFBC_FRAME_GRAB_INFO::bDirectCapture.
+     */
+    NVFBC_BOOL bAllowDirectCapture;
+} NVFBC_CREATE_CAPTURE_SESSION_PARAMS;
+
+/*!
+ * NVFBC_CREATE_CAPTURE_SESSION_PARAMS structure version.
+ */
+#define NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CREATE_CAPTURE_SESSION_PARAMS, 6)
+
+/*!
+ * Defines parameters for the ::NvFBCDestroyCaptureSession() API call.
+ */
+typedef struct _NVFBC_DESTROY_CAPTURE_SESSION_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER
+     */
+    uint32_t dwVersion;
+} NVFBC_DESTROY_CAPTURE_SESSION_PARAMS;
+
+/*!
+ * NVFBC_DESTROY_CAPTURE_SESSION_PARAMS structure version.
+ */
+#define NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_DESTROY_CAPTURE_SESSION_PARAMS, 1)
+
+/*!
+ * Defines parameters for the ::NvFBCBindContext() API call.
+ */
+typedef struct _NVFBC_BIND_CONTEXT_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_BIND_CONTEXT_PARAMS_VER
+     */
+    uint32_t dwVersion;
+} NVFBC_BIND_CONTEXT_PARAMS;
+
+/*!
+ * NVFBC_BIND_CONTEXT_PARAMS structure version.
+ */
+#define NVFBC_BIND_CONTEXT_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_BIND_CONTEXT_PARAMS, 1)
+
+/*!
+ * Defines parameters for the ::NvFBCReleaseContext() API call.
+ */
+typedef struct _NVFBC_RELEASE_CONTEXT_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_RELEASE_CONTEXT_PARAMS_VER
+     */
+    uint32_t dwVersion;
+} NVFBC_RELEASE_CONTEXT_PARAMS;
+
+/*!
+ * NVFBC_RELEASE_CONTEXT_PARAMS structure version.
+ */
+#define NVFBC_RELEASE_CONTEXT_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_RELEASE_CONTEXT_PARAMS, 1)
+
+/*!
+ * Defines flags that can be used when capturing to system memory.
+ */
+typedef enum
+{
+    /*!
+     * Default, capturing waits for a new frame or mouse move.
+     *
+     * The default behavior of blocking grabs is to wait for a new frame until
+     * after the call was made.  But it's possible that there is a frame already
+     * ready that the client hasn't seen.
+     * \see NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY
+     */
+    NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS       = 0,
+    /*!
+     * Capturing does not wait for a new frame nor a mouse move.
+     *
+     * It is therefore possible to capture the same frame multiple times.
+     * When this occurs, the dwCurrentFrame parameter of the
+     * NVFBC_FRAME_GRAB_INFO structure is not incremented.
+     */
+    NVFBC_TOSYS_GRAB_FLAGS_NOWAIT        = (1 << 0),
+    /*!
+     * Forces the destination buffer to be refreshed even if the frame has not
+     * changed since previous capture.
+     *
+     * By default, if the captured frame is identical to the previous one, NvFBC
+     * will omit one copy and not update the destination buffer.
+     *
+     * Setting that flag will prevent this behavior.  This can be useful e.g.,
+     * if the application has modified the buffer in the meantime.
+     */
+    NVFBC_TOSYS_GRAB_FLAGS_FORCE_REFRESH = (1 << 1),
+    /*!
+     * Similar to NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS, except that the capture will
+     * not wait if there is already a frame available that the client has
+     * never seen yet.
+     */
+    NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2),
+} NVFBC_TOSYS_GRAB_FLAGS;
+
+/*!
+ * Defines parameters for the ::NvFBCToSysSetUp() API call.
+ */
+typedef struct _NVFBC_TOSYS_SETUP_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOSYS_SETUP_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Desired buffer format.
+     */
+    NVFBC_BUFFER_FORMAT eBufferFormat;
+    /*!
+     * [out] Pointer to a pointer to a buffer in system memory.
+     *
+     * This buffer contains the pixel value of the requested format.  Refer to
+     * the description of the buffer formats to understand the memory layout.
+     *
+     * The application does not need to allocate memory for this buffer.  It
+     * should not free this buffer either.  This buffer is automatically
+     * re-allocated when needed (e.g., when the resolution changes).
+     *
+     * This buffer is allocated by the NvFBC library to the proper size.  This
+     * size is returned in the dwByteSize field of the
+     * ::NVFBC_FRAME_GRAB_INFO structure.
+     */
+    void **ppBuffer;
+    /*!
+     * [in] Whether differential maps should be generated.
+     */
+    NVFBC_BOOL bWithDiffMap;
+    /*!
+     * [out] Pointer to a pointer to a buffer in system memory.
+     *
+     * This buffer contains the differential map of two frames.  It must be read
+     * as an array of unsigned char.  Each unsigned char is either 0 or
+     * non-zero.  0 means that the pixel value at the given location has not
+     * changed since the previous captured frame.  Non-zero means that the pixel
+     * value has changed.
+     *
+     * The application does not need to allocate memory for this buffer.  It
+     * should not free this buffer either.  This buffer is automatically
+     * re-allocated when needed (e.g., when the resolution changes).
+     *
+     * This buffer is allocated by the NvFBC library to the proper size.  The
+     * size of the differential map is returned in ::diffMapSize.
+     *
+     * This option is not compatible with the ::NVFBC_BUFFER_FORMAT_YUV420P and
+     * ::NVFBC_BUFFER_FORMAT_YUV444P buffer formats.
+     */
+    void **ppDiffMap;
+    /*!
+     * [in] Scaling factor of the differential maps.
+     *
+     * For example, a scaling factor of 16 means that one pixel of the diffmap
+     * will represent 16x16 pixels of the original frames.
+     *
+     * If any of these 16x16 pixels is different between the current and the
+     * previous frame, then the corresponding pixel in the diffmap will be set
+     * to non-zero.
+     *
+     * The default scaling factor is 1.  A dwDiffMapScalingFactor of 0 will be
+     * set to 1.
+     */
+    uint32_t dwDiffMapScalingFactor;
+    /*!
+     * [out] Size of the differential map.
+     *
+     * Only set if bWithDiffMap is set to NVFBC_TRUE.
+     */
+    NVFBC_SIZE diffMapSize;
+} NVFBC_TOSYS_SETUP_PARAMS;
+
+/*!
+ * NVFBC_TOSYS_SETUP_PARAMS structure version.
+ */
+#define NVFBC_TOSYS_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOSYS_SETUP_PARAMS, 3)
+
+/*!
+ * Defines parameters for the ::NvFBCToSysGrabFrame() API call.
+ */
+typedef struct _NVFBC_TOSYS_GRAB_FRAME_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Flags defining the behavior of this frame capture.
+     */
+    uint32_t dwFlags;
+    /*!
+     * [out] Information about the captured frame.
+     *
+     * Can be NULL.
+     */
+    NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo;
+    /*!
+     * [in] Wait timeout in milliseconds.
+     *
+     * When capturing frames with the NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS or
+     * NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags,
+     * NvFBC will wait for a new frame or mouse move until the below timer
+     * expires.
+     *
+     * When timing out, the last captured frame will be returned.  Note that as
+     * long as the NVFBC_TOSYS_GRAB_FLAGS_FORCE_REFRESH flag is not set,
+     * returning an old frame will incur no performance penalty.
+     *
+     * NvFBC clients can use the return value of the grab frame operation to
+     * find out whether a new frame was captured, or the timer expired.
+     *
+     * Note that the behavior of blocking calls is to wait for a new frame
+     * *after* the call has been made.  When using timeouts, it is possible
+     * that NvFBC will return a new frame (e.g., it has never been captured
+     * before) even though no new frame was generated after the grab call.
+     *
+     * For the precise definition of what constitutes a new frame, see
+     * ::bIsNewFrame.
+     *
+     * Set to 0 to disable timeouts.
+     */
+    uint32_t dwTimeoutMs;
+} NVFBC_TOSYS_GRAB_FRAME_PARAMS;
+
+/*!
+ * NVFBC_TOSYS_GRAB_FRAME_PARAMS structure version.
+ */
+#define NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOSYS_GRAB_FRAME_PARAMS, 2)
+
+/*!
+ * Defines flags that can be used when capturing to a CUDA buffer in video memory.
+ */
+typedef enum
+{
+    /*!
+     * Default, capturing waits for a new frame or mouse move.
+     *
+     * The default behavior of blocking grabs is to wait for a new frame until
+     * after the call was made.  But it's possible that there is a frame already
+     * ready that the client hasn't seen.
+     * \see NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY
+     */
+    NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS      = 0,
+    /*!
+     * Capturing does not wait for a new frame nor a mouse move.
+     *
+     * It is therefore possible to capture the same frame multiple times.
+     * When this occurs, the dwCurrentFrame parameter of the
+     * NVFBC_FRAME_GRAB_INFO structure is not incremented.
+     */
+    NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT       = (1 << 0),
+    /*!
+     * [in] Forces the destination buffer to be refreshed even if the frame
+     * has not changed since previous capture.
+     *
+     * By default, if the captured frame is identical to the previous one, NvFBC
+     * will omit one copy and not update the destination buffer.
+     *
+     * Setting that flag will prevent this behavior.  This can be useful e.g.,
+     * if the application has modified the buffer in the meantime.
+     */
+    NVFBC_TOCUDA_GRAB_FLAGS_FORCE_REFRESH = (1 << 1),
+    /*!
+     * Similar to NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS, except that the capture will
+     * not wait if there is already a frame available that the client has
+     * never seen yet.
+     */
+    NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2),
+} NVFBC_TOCUDA_FLAGS;
+
+/*!
+ * Defines parameters for the ::NvFBCToCudaSetUp() API call.
+ */
+typedef struct _NVFBC_TOCUDA_SETUP_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOCUDA_SETUP_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Desired buffer format.
+     */
+    NVFBC_BUFFER_FORMAT eBufferFormat;
+} NVFBC_TOCUDA_SETUP_PARAMS;
+
+/*!
+ * NVFBC_TOCUDA_SETUP_PARAMS structure version.
+ */
+#define NVFBC_TOCUDA_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOCUDA_SETUP_PARAMS, 1)
+
+/*!
+ * Defines parameters for the ::NvFBCToCudaGrabFrame() API call.
+ */
+typedef struct _NVFBC_TOCUDA_GRAB_FRAME_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER.
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Flags defining the behavior of this frame capture.
+     */
+    uint32_t dwFlags;
+    /*!
+     * [out] Pointer to a ::CUdeviceptr
+     *
+     * The application does not need to allocate memory for this CUDA device.
+     *
+     * The application does need to create its own CUDA context to use this
+     * CUDA device.
+     *
+     * This ::CUdeviceptr will be mapped to a segment in video memory containing
+     * the frame.  It is not possible to process a CUDA device while capturing
+     * a new frame.  If the application wants to do so, it must copy the CUDA
+     * device using ::cuMemcpyDtoD or ::cuMemcpyDtoH beforehand.
+     */
+    void *pCUDADeviceBuffer;
+    /*!
+     * [out] Information about the captured frame.
+     *
+     * Can be NULL.
+     */
+    NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo;
+    /*!
+     * [in] Wait timeout in milliseconds.
+     *
+     * When capturing frames with the NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS or
+     * NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags,
+     * NvFBC will wait for a new frame or mouse move until the below timer
+     * expires.
+     *
+     * When timing out, the last captured frame will be returned.  Note that as
+     * long as the NVFBC_TOCUDA_GRAB_FLAGS_FORCE_REFRESH flag is not set,
+     * returning an old frame will incur no performance penalty.
+     *
+     * NvFBC clients can use the return value of the grab frame operation to
+     * find out whether a new frame was captured, or the timer expired.
+     *
+     * Note that the behavior of blocking calls is to wait for a new frame
+     * *after* the call has been made.  When using timeouts, it is possible
+     * that NvFBC will return a new frame (e.g., it has never been captured
+     * before) even though no new frame was generated after the grab call.
+     *
+     * For the precise definition of what constitutes a new frame, see
+     * ::bIsNewFrame.
+     *
+     * Set to 0 to disable timeouts.
+     */
+    uint32_t dwTimeoutMs;
+} NVFBC_TOCUDA_GRAB_FRAME_PARAMS;
+
+/*!
+ * NVFBC_TOCUDA_GRAB_FRAME_PARAMS structure version.
+ */
+#define NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOCUDA_GRAB_FRAME_PARAMS, 2)
+
+/*!
+ * Defines flags that can be used when capturing to an OpenGL buffer in video memory.
+ */
+typedef enum
+{
+    /*!
+     * Default, capturing waits for a new frame or mouse move.
+     *
+     * The default behavior of blocking grabs is to wait for a new frame until
+     * after the call was made.  But it's possible that there is a frame already
+     * ready that the client hasn't seen.
+     * \see NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY
+     */
+    NVFBC_TOGL_GRAB_FLAGS_NOFLAGS      = 0,
+    /*!
+     * Capturing does not wait for a new frame nor a mouse move.
+     *
+     * It is therefore possible to capture the same frame multiple times.
+     * When this occurs, the dwCurrentFrame parameter of the
+     * NVFBC_FRAME_GRAB_INFO structure is not incremented.
+     */
+    NVFBC_TOGL_GRAB_FLAGS_NOWAIT       = (1 << 0),
+    /*!
+     * [in] Forces the destination buffer to be refreshed even if the frame
+     * has not changed since previous capture.
+     *
+     * By default, if the captured frame is identical to the previous one, NvFBC
+     * will omit one copy and not update the destination buffer.
+     *
+     * Setting that flag will prevent this behavior.  This can be useful e.g.,
+     * if the application has modified the buffer in the meantime.
+     */
+    NVFBC_TOGL_GRAB_FLAGS_FORCE_REFRESH = (1 << 1),
+    /*!
+     * Similar to NVFBC_TOGL_GRAB_FLAGS_NOFLAGS, except that the capture will
+     * not wait if there is already a frame available that the client has
+     * never seen yet.
+     */
+    NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2),
+} NVFBC_TOGL_FLAGS;
+
+/*!
+ * Maximum number of GL textures that can be used to store frames.
+ */
+#define NVFBC_TOGL_TEXTURES_MAX 2
+
+/*!
+ * Defines parameters for the ::NvFBCToGLSetUp() API call.
+ */
+typedef struct _NVFBC_TOGL_SETUP_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOGL_SETUP_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Desired buffer format.
+     */
+    NVFBC_BUFFER_FORMAT eBufferFormat;
+    /*!
+     * [in] Whether differential maps should be generated.
+     */
+    NVFBC_BOOL bWithDiffMap;
+    /*!
+     * [out] Pointer to a pointer to a buffer in system memory.
+     *
+     * \see NVFBC_TOSYS_SETUP_PARAMS::ppDiffMap
+     */
+    void **ppDiffMap;
+    /*!
+     * [in] Scaling factor of the differential maps.
+     *
+     * \see NVFBC_TOSYS_SETUP_PARAMS::dwDiffMapScalingFactor
+     */
+    uint32_t dwDiffMapScalingFactor;
+    /*!
+     * [out] List of GL textures that will store the captured frames.
+     *
+     * This array is 0 terminated.  The number of textures varies depending on
+     * the capture settings (such as whether diffmaps are enabled).
+     *
+     * An application wishing to interop with, for example, EncodeAPI will need
+     * to register these textures prior to start encoding frames.
+     *
+     * After each frame capture, the texture holding the current frame will be
+     * returned in NVFBC_TOGL_GRAB_FRAME_PARAMS::dwTexture.
+     */
+    uint32_t dwTextures[NVFBC_TOGL_TEXTURES_MAX];
+    /*!
+     * [out] GL target to which the texture should be bound.
+     */
+    uint32_t dwTexTarget;
+    /*!
+     * [out] GL format of the textures.
+     */
+    uint32_t dwTexFormat;
+    /*!
+     * [out] GL type of the textures.
+     */
+    uint32_t dwTexType;
+    /*!
+     * [out] Size of the differential map.
+     *
+     * Only set if bWithDiffMap is set to NVFBC_TRUE.
+     */
+    NVFBC_SIZE diffMapSize;
+} NVFBC_TOGL_SETUP_PARAMS;
+
+/*!
+ * NVFBC_TOGL_SETUP_PARAMS structure version.
+ */
+#define NVFBC_TOGL_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOGL_SETUP_PARAMS, 2)
+
+/*!
+ * Defines parameters for the ::NvFBCToGLGrabFrame() API call.
+ */
+typedef struct _NVFBC_TOGL_GRAB_FRAME_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOGL_GRAB_FRAME_PARAMS_VER.
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Flags defining the behavior of this frame capture.
+     */
+    uint32_t dwFlags;
+    /*!
+     * [out] Index of the texture storing the current frame.
+     *
+     * This is an index in the NVFBC_TOGL_SETUP_PARAMS::dwTextures array.
+     */
+    uint32_t dwTextureIndex;
+    /*!
+     * [out] Information about the captured frame.
+     *
+     * Can be NULL.
+     */
+    NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo;
+    /*!
+     * [in] Wait timeout in milliseconds.
+     *
+     * When capturing frames with the NVFBC_TOGL_GRAB_FLAGS_NOFLAGS or
+     * NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags,
+     * NvFBC will wait for a new frame or mouse move until the below timer
+     * expires.
+     *
+     * When timing out, the last captured frame will be returned.  Note that as
+     * long as the NVFBC_TOGL_GRAB_FLAGS_FORCE_REFRESH flag is not set,
+     * returning an old frame will incur no performance penalty.
+     *
+     * NvFBC clients can use the return value of the grab frame operation to
+     * find out whether a new frame was captured, or the timer expired.
+     *
+     * Note that the behavior of blocking calls is to wait for a new frame
+     * *after* the call has been made.  When using timeouts, it is possible
+     * that NvFBC will return a new frame (e.g., it has never been captured
+     * before) even though no new frame was generated after the grab call.
+     *
+     * For the precise definition of what constitutes a new frame, see
+     * ::bIsNewFrame.
+     *
+     * Set to 0 to disable timeouts.
+     */
+    uint32_t dwTimeoutMs;
+} NVFBC_TOGL_GRAB_FRAME_PARAMS;
+
+/*!
+ * NVFBC_TOGL_GRAB_FRAME_PARAMS structure version.
+ */
+#define NVFBC_TOGL_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOGL_GRAB_FRAME_PARAMS, 2)
+
+/*! @} FBC_STRUCT */
+
+/*!
+ * \defgroup FBC_FUNC API Entry Points
+ *
+ * Entry points are thread-safe and can be called concurrently.
+ *
+ * The locking model includes a global lock that protects session handle
+ * management (\see NvFBCCreateHandle, \see NvFBCDestroyHandle).
+ *
+ * Each NvFBC session uses a local lock to protect other entry points.  Note
+ * that in certain cases, a thread can hold the local lock for an undefined
+ * amount of time, such as grabbing a frame using a blocking call.
+ *
+ * Note that a context is associated with each session.  NvFBC clients wishing
+ * to share a session between different threads are expected to release and
+ * bind the context appropriately (\see NvFBCBindContext,
+ * \see NvFBCReleaseContext).  This is not required when each thread uses its
+ * own NvFBC session.
+ *
+ * @{
+ */
+
+/*!
+ * Gets the last error message that got recorded for a client.
+ *
+ * When NvFBC returns an error, it will save an error message that can be
+ * queried through this API call.  Only the last message is saved.
+ * The message and the return code should give enough information about
+ * what went wrong.
+ *
+ * \param [in] sessionHandle
+ *   Handle to the NvFBC client.
+ * \return
+ *   A NULL terminated error message, or an empty string.  Its maximum length
+ *   is NVFBC_ERROR_STR_LEN.
+ */
+const char* NVFBCAPI NvFBCGetLastErrorStr(const NVFBC_SESSION_HANDLE sessionHandle);
+
+/*!
+ * \brief Allocates a new handle for an NvFBC client.
+ *
+ * This function allocates a session handle used to identify an FBC client.
+ *
+ * This function implicitly calls NvFBCBindContext().
+ *
+ * \param [out] pSessionHandle
+ *   Pointer that will hold the allocated session handle.
+ * \param [in] pParams
+ *   ::NVFBC_CREATE_HANDLE_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_OUT_OF_MEMORY \n
+ *   ::NVFBC_ERR_MAX_CLIENTS \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_GLX \n
+ *   ::NVFBC_ERR_GL
+ *
+ */
+NVFBCSTATUS NVFBCAPI NvFBCCreateHandle(NVFBC_SESSION_HANDLE *pSessionHandle, NVFBC_CREATE_HANDLE_PARAMS *pParams);
+
+/*!
+ * \brief Destroys the handle of an NvFBC client.
+ *
+ * This function uninitializes an FBC client.
+ *
+ * This function implicitly calls NvFBCReleaseContext().
+ *
+ * After this fucntion returns, it is not possible to use this session handle
+ * for any further API call.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_DESTROY_HANDLE_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCDestroyHandle(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_HANDLE_PARAMS *pParams);
+
+/*!
+ * \brief Gets the current status of the display driver.
+ *
+ * This function queries the display driver for various information.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_GET_STATUS_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCGetStatus(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_GET_STATUS_PARAMS *pParams);
+
+/*!
+ * \brief Binds the FBC context to the calling thread.
+ *
+ * The NvFBC library internally relies on objects that must be bound to a
+ * thread.  Such objects are OpenGL contexts and CUDA contexts.
+ *
+ * This function binds these objects to the calling thread.
+ *
+ * The FBC context must be bound to the calling thread for most NvFBC entry
+ * points, otherwise ::NVFBC_ERR_CONTEXT is returned.
+ *
+ * If the FBC context is already bound to a different thread,
+ * ::NVFBC_ERR_CONTEXT is returned.  The other thread must release the context
+ * first by calling the ReleaseContext() entry point.
+ *
+ * If the FBC context is already bound to the current thread, this function has
+ * no effects.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_DESTROY_CAPTURE_SESSION_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCBindContext(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_BIND_CONTEXT_PARAMS *pParams);
+
+/*!
+ * \brief Releases the FBC context from the calling thread.
+ *
+ * If the FBC context is bound to a different thread, ::NVFBC_ERR_CONTEXT is
+ * returned.
+ *
+ * If the FBC context is already released, this functino has no effects.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCReleaseContext(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_RELEASE_CONTEXT_PARAMS *pParams);
+
+/*!
+ * \brief Creates a capture session for an FBC client.
+ *
+ * This function starts a capture session of the desired type (system memory,
+ * video memory with CUDA interop, or H.264 compressed frames in system memory).
+ *
+ * Not all types are supported on all systems.  Also, it is possible to use
+ * NvFBC without having the CUDA library.  In this case, requesting a capture
+ * session of the concerned type will return an error.
+ *
+ * After this function returns, the display driver will start generating frames
+ * that can be captured using the corresponding API call.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_CREATE_CAPTURE_SESSION_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INVALID_PARAM \n
+ *   ::NVFBC_ERR_OUT_OF_MEMORY \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_GLX \n
+ *   ::NVFBC_ERR_GL \n
+ *   ::NVFBC_ERR_CUDA \n
+ *   ::NVFBC_ERR_MUST_RECREATE \n
+ *   ::NVFBC_ERR_INTERNAL
+ */
+NVFBCSTATUS NVFBCAPI NvFBCCreateCaptureSession(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_CREATE_CAPTURE_SESSION_PARAMS *pParams);
+
+/*!
+ * \brief Destroys a capture session for an FBC client.
+ *
+ * This function stops a capture session and frees allocated objects.
+ *
+ * After this function returns, it is possible to create another capture
+ * session using the corresponding API call.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_DESTROY_CAPTURE_SESSION_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCDestroyCaptureSession(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_CAPTURE_SESSION_PARAMS *pParams);
+
+/*!
+ * \brief Sets up a capture to system memory session.
+ *
+ * This function configures how the capture to system memory should behave. It
+ * can be called anytime and several times after the capture session has been
+ * created.  However, it must be called at least once prior to start capturing
+ * frames.
+ *
+ * This function allocates the buffer that will contain the captured frame.
+ * The application does not need to free this buffer.  The size of this buffer
+ * is returned in the ::NVFBC_FRAME_GRAB_INFO structure.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_TOSYS_SETUP_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_UNSUPPORTED \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_INVALID_PARAM \n
+ *   ::NVFBC_ERR_OUT_OF_MEMORY \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToSysSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_SETUP_PARAMS *pParams);
+
+/*!
+ * \brief Captures a frame to a buffer in system memory.
+ *
+ * This function triggers a frame capture to a buffer in system memory that was
+ * registered with the ToSysSetUp() API call.
+ *
+ * Note that it is possible that the resolution of the desktop changes while
+ * capturing frames. This should be transparent for the application.
+ *
+ * When the resolution changes, the capture session is recreated using the same
+ * parameters, and necessary buffers are re-allocated. The frame counter is not
+ * reset.
+ *
+ * An application can detect that the resolution changed by comparing the
+ * dwByteSize member of the ::NVFBC_FRAME_GRAB_INFO against a previous
+ * frame and/or dwWidth and dwHeight.
+ *
+ * During a change of resolution the capture is paused even in asynchronous
+ * mode.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_TOSYS_GRAB_FRAME_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_MUST_RECREATE \n
+ *   \see NvFBCCreateCaptureSession \n
+ *   \see NvFBCToSysSetUp
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToSysGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParams);
+
+/*!
+ * \brief Sets up a capture to video memory session.
+ *
+ * This function configures how the capture to video memory with CUDA interop
+ * should behave.  It can be called anytime and several times after the capture
+ * session has been created.  However, it must be called at least once prior
+ * to start capturing frames.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ *
+ * \param [in] pParams
+ *   ::NVFBC_TOCUDA_SETUP_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_UNSUPPORTED \n
+ *   ::NVFBC_ERR_GL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToCudaSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_SETUP_PARAMS *pParams);
+
+/*!
+ * \brief Captures a frame to a CUDA device in video memory.
+ *
+ * This function triggers a frame capture to a CUDA device in video memory.
+ *
+ * Note about changes of resolution: \see NvFBCToSysGrabFrame
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ *
+ * \param [in] pParams
+ *   ::NVFBC_TOCUDA_GRAB_FRAME_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_CUDA \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_MUST_RECREATE \n
+ *   \see NvFBCCreateCaptureSession \n
+ *   \see NvFBCToCudaSetUp
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToCudaGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_GRAB_FRAME_PARAMS *pParams);
+
+/*!
+ * \brief Sets up a capture to OpenGL buffer in video memory session.
+ *
+ * This function configures how the capture to video memory should behave.
+ * It can be called anytime and several times after the capture session has been
+ * created.  However, it must be called at least once prior to start capturing
+ * frames.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ *
+ * \param [in] pParams
+ *   ::NVFBC_TOGL_SETUP_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_UNSUPPORTED \n
+ *   ::NVFBC_ERR_GL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToGLSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_SETUP_PARAMS *pParams);
+
+/*!
+ * \brief Captures a frame to an OpenGL buffer in video memory.
+ *
+ * This function triggers a frame capture to a selected resource in video memory.
+ *
+ * Note about changes of resolution: \see NvFBCToSysGrabFrame
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ *
+ * \param [in] pParams
+ *   ::NVFBC_TOGL_GRAB_FRAME_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_MUST_RECREATE \n
+ *   \see NvFBCCreateCaptureSession \n
+ *   \see NvFBCToCudaSetUp
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToGLGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_GRAB_FRAME_PARAMS *pParams);
+
+/*!
+ * \cond FBC_PFN
+ *
+ * Defines API function pointers
+ */
+typedef const char* (NVFBCAPI* PNVFBCGETLASTERRORSTR)(const NVFBC_SESSION_HANDLE sessionHandle);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATEHANDLE)(NVFBC_SESSION_HANDLE *pSessionHandle, NVFBC_CREATE_HANDLE_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCDESTROYHANDLE)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_HANDLE_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCBINDCONTEXT)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_BIND_CONTEXT_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCRELEASECONTEXT)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_RELEASE_CONTEXT_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCGETSTATUS)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_GET_STATUS_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATECAPTURESESSION)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_CREATE_CAPTURE_SESSION_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCDESTROYCAPTURESESSION)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_CAPTURE_SESSION_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOSYSSETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_SETUP_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOSYSGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOCUDASETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_SETUP_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOCUDAGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_GRAB_FRAME_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOGLSETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_SETUP_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOGLGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_GRAB_FRAME_PARAMS *pParams);
+
+/// \endcond
+
+/*! @} FBC_FUNC */
+
+/*!
+ * \ingroup FBC_STRUCT
+ *
+ * Structure populated with API function pointers.
+ */
+typedef struct
+{
+    uint32_t                                  dwVersion;                  //!< [in] Must be set to NVFBC_VERSION.
+    PNVFBCGETLASTERRORSTR                     nvFBCGetLastErrorStr;       //!< [out] Pointer to ::NvFBCGetLastErrorStr().
+    PNVFBCCREATEHANDLE                        nvFBCCreateHandle;          //!< [out] Pointer to ::NvFBCCreateHandle().
+    PNVFBCDESTROYHANDLE                       nvFBCDestroyHandle;         //!< [out] Pointer to ::NvFBCDestroyHandle().
+    PNVFBCGETSTATUS                           nvFBCGetStatus;             //!< [out] Pointer to ::NvFBCGetStatus().
+    PNVFBCCREATECAPTURESESSION                nvFBCCreateCaptureSession;  //!< [out] Pointer to ::NvFBCCreateCaptureSession().
+    PNVFBCDESTROYCAPTURESESSION               nvFBCDestroyCaptureSession; //!< [out] Pointer to ::NvFBCDestroyCaptureSession().
+    PNVFBCTOSYSSETUP                          nvFBCToSysSetUp;            //!< [out] Pointer to ::NvFBCToSysSetUp().
+    PNVFBCTOSYSGRABFRAME                      nvFBCToSysGrabFrame;        //!< [out] Pointer to ::NvFBCToSysGrabFrame().
+    PNVFBCTOCUDASETUP                         nvFBCToCudaSetUp;           //!< [out] Pointer to ::NvFBCToCudaSetUp().
+    PNVFBCTOCUDAGRABFRAME                     nvFBCToCudaGrabFrame;       //!< [out] Pointer to ::NvFBCToCudaGrabFrame().
+    void*                                     pad1;                       //!< [out] Retired. Do not use.
+    void*                                     pad2;                       //!< [out] Retired. Do not use.
+    void*                                     pad3;                       //!< [out] Retired. Do not use.
+    PNVFBCBINDCONTEXT                         nvFBCBindContext;           //!< [out] Pointer to ::NvFBCBindContext().
+    PNVFBCRELEASECONTEXT                      nvFBCReleaseContext;        //!< [out] Pointer to ::NvFBCReleaseContext().
+    void*                                     pad4;                       //!< [out] Retired. Do not use.
+    void*                                     pad5;                       //!< [out] Retired. Do not use.
+    void*                                     pad6;                       //!< [out] Retired. Do not use.
+    void*                                     pad7;                       //!< [out] Retired. Do not use.
+    PNVFBCTOGLSETUP                           nvFBCToGLSetUp;             //!< [out] Pointer to ::nvFBCToGLSetup().
+    PNVFBCTOGLGRABFRAME                       nvFBCToGLGrabFrame;         //!< [out] Pointer to ::nvFBCToGLGrabFrame().
+} NVFBC_API_FUNCTION_LIST;
+
+/*!
+ * \ingroup FBC_FUNC
+ *
+ * \brief Entry Points to the NvFBC interface.
+ *
+ * Creates an instance of the NvFBC interface, and populates the
+ * pFunctionList with function pointers to the API routines implemented by
+ * the NvFBC interface.
+ *
+ * \param [out] pFunctionList
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_API_VERSION
+ */
+NVFBCSTATUS NVFBCAPI NvFBCCreateInstance(NVFBC_API_FUNCTION_LIST *pFunctionList);
+/*!
+ * \ingroup FBC_FUNC
+ *
+ * Defines function pointer for the ::NvFBCCreateInstance() API call.
+ */
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATEINSTANCE)(NVFBC_API_FUNCTION_LIST *pFunctionList);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _NVFBC_H_
diff --git a/third-party/nvfbc/helper_math.h b/third-party/nvfbc/helper_math.h
new file mode 100644
index 00000000..d17b024e
--- /dev/null
+++ b/third-party/nvfbc/helper_math.h
@@ -0,0 +1,1469 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ *  This file implements common mathematical operations on vector types
+ *  (float3, float4 etc.) since these are not provided as standard by CUDA.
+ *
+ *  The syntax is modeled on the Cg standard library.
+ *
+ *  This is part of the Helper library includes
+ *
+ *    Thanks to Linh Hah for additions and fixes.
+ */
+
+#ifndef HELPER_MATH_H
+#define HELPER_MATH_H
+
+#include "cuda_runtime.h"
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#ifndef __CUDACC__
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline float fminf(float a, float b)
+{
+    return a < b ? a : b;
+}
+
+inline float fmaxf(float a, float b)
+{
+    return a > b ? a : b;
+}
+
+inline int max(int a, int b)
+{
+    return a > b ? a : b;
+}
+
+inline int min(int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline float rsqrtf(float x)
+{
+    return 1.0f / sqrtf(x);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 make_float2(float s)
+{
+    return make_float2(s, s);
+}
+inline __host__ __device__ float2 make_float2(float3 a)
+{
+    return make_float2(a.x, a.y);
+}
+inline __host__ __device__ float2 make_float2(int2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+inline __host__ __device__ float2 make_float2(uint2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+
+inline __host__ __device__ int2 make_int2(int s)
+{
+    return make_int2(s, s);
+}
+inline __host__ __device__ int2 make_int2(int3 a)
+{
+    return make_int2(a.x, a.y);
+}
+inline __host__ __device__ int2 make_int2(uint2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+inline __host__ __device__ int2 make_int2(float2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+
+inline __host__ __device__ uint2 make_uint2(uint s)
+{
+    return make_uint2(s, s);
+}
+inline __host__ __device__ uint2 make_uint2(uint3 a)
+{
+    return make_uint2(a.x, a.y);
+}
+inline __host__ __device__ uint2 make_uint2(int2 a)
+{
+    return make_uint2(uint(a.x), uint(a.y));
+}
+
+inline __host__ __device__ float3 make_float3(float s)
+{
+    return make_float3(s, s, s);
+}
+inline __host__ __device__ float3 make_float3(float2 a)
+{
+    return make_float3(a.x, a.y, 0.0f);
+}
+inline __host__ __device__ float3 make_float3(float2 a, float s)
+{
+    return make_float3(a.x, a.y, s);
+}
+inline __host__ __device__ float3 make_float3(float4 a)
+{
+    return make_float3(a.x, a.y, a.z);
+}
+inline __host__ __device__ float3 make_float3(int3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+inline __host__ __device__ float3 make_float3(uint3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+
+inline __host__ __device__ int3 make_int3(int s)
+{
+    return make_int3(s, s, s);
+}
+inline __host__ __device__ int3 make_int3(int2 a)
+{
+    return make_int3(a.x, a.y, 0);
+}
+inline __host__ __device__ int3 make_int3(int2 a, int s)
+{
+    return make_int3(a.x, a.y, s);
+}
+inline __host__ __device__ int3 make_int3(uint3 a)
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+inline __host__ __device__ int3 make_int3(float3 a)
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+
+inline __host__ __device__ uint3 make_uint3(uint s)
+{
+    return make_uint3(s, s, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a)
+{
+    return make_uint3(a.x, a.y, 0);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
+{
+    return make_uint3(a.x, a.y, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint4 a)
+{
+    return make_uint3(a.x, a.y, a.z);
+}
+inline __host__ __device__ uint3 make_uint3(int3 a)
+{
+    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
+}
+
+inline __host__ __device__ float4 make_float4(float s)
+{
+    return make_float4(s, s, s, s);
+}
+inline __host__ __device__ float4 make_float4(float3 a)
+{
+    return make_float4(a.x, a.y, a.z, 0.0f);
+}
+inline __host__ __device__ float4 make_float4(float3 a, float w)
+{
+    return make_float4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ float4 make_float4(int4 a)
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+inline __host__ __device__ float4 make_float4(uint4 a)
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+
+inline __host__ __device__ int4 make_int4(int s)
+{
+    return make_int4(s, s, s, s);
+}
+inline __host__ __device__ int4 make_int4(int3 a)
+{
+    return make_int4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ int4 make_int4(int3 a, int w)
+{
+    return make_int4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ int4 make_int4(uint4 a)
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+inline __host__ __device__ int4 make_int4(float4 a)
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+
+
+inline __host__ __device__ uint4 make_uint4(uint s)
+{
+    return make_uint4(s, s, s, s);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a)
+{
+    return make_uint4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
+{
+    return make_uint4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ uint4 make_uint4(int4 a)
+{
+    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// negate
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator-(float2 &a)
+{
+    return make_float2(-a.x, -a.y);
+}
+inline __host__ __device__ int2 operator-(int2 &a)
+{
+    return make_int2(-a.x, -a.y);
+}
+inline __host__ __device__ float3 operator-(float3 &a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ int3 operator-(int3 &a)
+{
+    return make_int3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ float4 operator-(float4 &a)
+{
+    return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __host__ __device__ int4 operator-(int4 &a)
+{
+    return make_int4(-a.x, -a.y, -a.z, -a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// addition
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+    return make_float2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(float2 &a, float2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ float2 operator+(float2 a, float b)
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ float2 operator+(float b, float2 a)
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(float2 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+inline __host__ __device__ int2 operator+(int2 a, int2 b)
+{
+    return make_int2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(int2 &a, int2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ int2 operator+(int2 a, int b)
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ int2 operator+(int b, int2 a)
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(int2 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{
+    return make_uint2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ uint2 operator+(uint2 a, uint b)
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ uint2 operator+(uint b, uint2 a)
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+
+inline __host__ __device__ float3 operator+(float3 a, float3 b)
+{
+    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(float3 &a, float3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ float3 operator+(float3 a, float b)
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(float3 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int3 a, int3 b)
+{
+    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(int3 &a, int3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ int3 operator+(int3 a, int b)
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(int3 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
+{
+    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ uint3 operator+(uint3 a, uint b)
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int b, int3 a)
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ uint3 operator+(uint b, uint3 a)
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ float3 operator+(float b, float3 a)
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+
+inline __host__ __device__ float4 operator+(float4 a, float4 b)
+{
+    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(float4 &a, float4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ float4 operator+(float4 a, float b)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ float4 operator+(float b, float4 a)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(float4 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+inline __host__ __device__ int4 operator+(int4 a, int4 b)
+{
+    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(int4 &a, int4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ int4 operator+(int4 a, int b)
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ int4 operator+(int b, int4 a)
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(int4 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
+{
+    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ uint4 operator+(uint4 a, uint b)
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ uint4 operator+(uint b, uint4 a)
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// subtract
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+    return make_float2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ float2 operator-(float2 a, float b)
+{
+    return make_float2(a.x - b, a.y - b);
+}
+inline __host__ __device__ float2 operator-(float b, float2 a)
+{
+    return make_float2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ int2 operator-(int2 a, int2 b)
+{
+    return make_int2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ int2 operator-(int2 a, int b)
+{
+    return make_int2(a.x - b, a.y - b);
+}
+inline __host__ __device__ int2 operator-(int b, int2 a)
+{
+    return make_int2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
+{
+    return make_uint2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ uint2 operator-(uint2 a, uint b)
+{
+    return make_uint2(a.x - b, a.y - b);
+}
+inline __host__ __device__ uint2 operator-(uint b, uint2 a)
+{
+    return make_uint2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ float3 operator-(float3 a, float3 b)
+{
+    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ float3 operator-(float3 a, float b)
+{
+    return make_float3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ float3 operator-(float b, float3 a)
+{
+    return make_float3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ int3 operator-(int3 a, int3 b)
+{
+    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ int3 operator-(int3 a, int b)
+{
+    return make_int3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ int3 operator-(int b, int3 a)
+{
+    return make_int3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
+{
+    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ uint3 operator-(uint3 a, uint b)
+{
+    return make_uint3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ uint3 operator-(uint b, uint3 a)
+{
+    return make_uint3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ float4 operator-(float4 a, float4 b)
+{
+    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(float4 &a, float4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ float4 operator-(float4 a, float b)
+{
+    return make_float4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ void operator-=(float4 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+inline __host__ __device__ int4 operator-(int4 a, int4 b)
+{
+    return make_int4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ int4 operator-(int4 a, int b)
+{
+    return make_int4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ int4 operator-(int b, int4 a)
+{
+    return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
+{
+    return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ uint4 operator-(uint4 a, uint b)
+{
+    return make_uint4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ uint4 operator-(uint b, uint4 a)
+{
+    return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// multiply
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator*(float2 a, float2 b)
+{
+    return make_float2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(float2 &a, float2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ float2 operator*(float2 a, float b)
+{
+    return make_float2(a.x * b, a.y * b);
+}
+inline __host__ __device__ float2 operator*(float b, float2 a)
+{
+    return make_float2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(float2 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ int2 operator*(int2 a, int2 b)
+{
+    return make_int2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ int2 operator*(int2 a, int b)
+{
+    return make_int2(a.x * b, a.y * b);
+}
+inline __host__ __device__ int2 operator*(int b, int2 a)
+{
+    return make_int2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
+{
+    return make_uint2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ uint2 operator*(uint2 a, uint b)
+{
+    return make_uint2(a.x * b, a.y * b);
+}
+inline __host__ __device__ uint2 operator*(uint b, uint2 a)
+{
+    return make_uint2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ float3 operator*(float3 a, float3 b)
+{
+    return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(float3 &a, float3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ float3 operator*(float3 a, float b)
+{
+    return make_float3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ float3 operator*(float b, float3 a)
+{
+    return make_float3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(float3 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ int3 operator*(int3 a, int3 b)
+{
+    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ int3 operator*(int3 a, int b)
+{
+    return make_int3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ int3 operator*(int b, int3 a)
+{
+    return make_int3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
+{
+    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ uint3 operator*(uint3 a, uint b)
+{
+    return make_uint3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ uint3 operator*(uint b, uint3 a)
+{
+    return make_uint3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ float4 operator*(float4 a, float4 b)
+{
+    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(float4 &a, float4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ float4 operator*(float4 a, float b)
+{
+    return make_float4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ float4 operator*(float b, float4 a)
+{
+    return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(float4 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+inline __host__ __device__ int4 operator*(int4 a, int4 b)
+{
+    return make_int4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ int4 operator*(int4 a, int b)
+{
+    return make_int4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ int4 operator*(int b, int4 a)
+{
+    return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
+{
+    return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ uint4 operator*(uint4 a, uint b)
+{
+    return make_uint4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ uint4 operator*(uint b, uint4 a)
+{
+    return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// divide
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator/(float2 a, float2 b)
+{
+    return make_float2(a.x / b.x, a.y / b.y);
+}
+inline __host__ __device__ void operator/=(float2 &a, float2 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+}
+inline __host__ __device__ float2 operator/(float2 a, float b)
+{
+    return make_float2(a.x / b, a.y / b);
+}
+inline __host__ __device__ void operator/=(float2 &a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+}
+inline __host__ __device__ float2 operator/(float b, float2 a)
+{
+    return make_float2(b / a.x, b / a.y);
+}
+
+inline __host__ __device__ float3 operator/(float3 a, float3 b)
+{
+    return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+inline __host__ __device__ void operator/=(float3 &a, float3 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
+}
+inline __host__ __device__ float3 operator/(float3 a, float b)
+{
+    return make_float3(a.x / b, a.y / b, a.z / b);
+}
+inline __host__ __device__ void operator/=(float3 &a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+}
+inline __host__ __device__ float3 operator/(float b, float3 a)
+{
+    return make_float3(b / a.x, b / a.y, b / a.z);
+}
+
+inline __host__ __device__ float4 operator/(float4 a, float4 b)
+{
+    return make_float4(a.x / b.x, a.y / b.y, a.z / b.z,  a.w / b.w);
+}
+inline __host__ __device__ void operator/=(float4 &a, float4 b)
+{
+    a.x /= b.x;
+    a.y /= b.y;
+    a.z /= b.z;
+    a.w /= b.w;
+}
+inline __host__ __device__ float4 operator/(float4 a, float b)
+{
+    return make_float4(a.x / b, a.y / b, a.z / b,  a.w / b);
+}
+inline __host__ __device__ void operator/=(float4 &a, float b)
+{
+    a.x /= b;
+    a.y /= b;
+    a.z /= b;
+    a.w /= b;
+}
+inline __host__ __device__ float4 operator/(float b, float4 a)
+{
+    return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// min
+////////////////////////////////////////////////////////////////////////////////
+
+inline  __host__ __device__ float2 fminf(float2 a, float2 b)
+{
+    return make_float2(fminf(a.x,b.x), fminf(a.y,b.y));
+}
+inline __host__ __device__ float3 fminf(float3 a, float3 b)
+{
+    return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
+}
+inline  __host__ __device__ float4 fminf(float4 a, float4 b)
+{
+    return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
+}
+
+inline __host__ __device__ int2 min(int2 a, int2 b)
+{
+    return make_int2(min(a.x,b.x), min(a.y,b.y));
+}
+inline __host__ __device__ int3 min(int3 a, int3 b)
+{
+    return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+inline __host__ __device__ int4 min(int4 a, int4 b)
+{
+    return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
+}
+
+inline __host__ __device__ uint2 min(uint2 a, uint2 b)
+{
+    return make_uint2(min(a.x,b.x), min(a.y,b.y));
+}
+inline __host__ __device__ uint3 min(uint3 a, uint3 b)
+{
+    return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+inline __host__ __device__ uint4 min(uint4 a, uint4 b)
+{
+    return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// max
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
+{
+    return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y));
+}
+inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
+{
+    return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
+}
+inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
+{
+    return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
+}
+
+inline __host__ __device__ int2 max(int2 a, int2 b)
+{
+    return make_int2(max(a.x,b.x), max(a.y,b.y));
+}
+inline __host__ __device__ int3 max(int3 a, int3 b)
+{
+    return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+inline __host__ __device__ int4 max(int4 a, int4 b)
+{
+    return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
+}
+
+inline __host__ __device__ uint2 max(uint2 a, uint2 b)
+{
+    return make_uint2(max(a.x,b.x), max(a.y,b.y));
+}
+inline __host__ __device__ uint3 max(uint3 a, uint3 b)
+{
+    return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+inline __host__ __device__ uint4 max(uint4 a, uint4 b)
+{
+    return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float lerp(float a, float b, float t)
+{
+    return a + t*(b-a);
+}
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
+{
+    return a + t*(b-a);
+}
+inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
+{
+    return a + t*(b-a);
+}
+inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
+{
+    return a + t*(b-a);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+    return fmaxf(a, fminf(f, b));
+}
+inline __device__ __host__ int clamp(int f, int a, int b)
+{
+    return max(a, min(f, b));
+}
+inline __device__ __host__ uint clamp(uint f, uint a, uint b)
+{
+    return max(a, min(f, b));
+}
+
+inline __device__ __host__ float2 clamp(float2 v, float a, float b)
+{
+    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
+{
+    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ float4 clamp(float4 v, float a, float b)
+{
+    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
+{
+    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+inline __device__ __host__ int2 clamp(int2 v, int a, int b)
+{
+    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
+{
+    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ int3 clamp(int3 v, int a, int b)
+{
+    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
+{
+    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ int4 clamp(int4 v, int a, int b)
+{
+    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
+{
+    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
+{
+    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
+{
+    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
+{
+    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
+{
+    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
+{
+    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
+{
+    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float dot(float2 a, float2 b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ float dot(float3 a, float3 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ float dot(float4 a, float4 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ int dot(int2 a, int2 b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ int dot(int3 a, int3 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ int dot(int4 a, int4 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ uint dot(uint2 a, uint2 b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ uint dot(uint3 a, uint3 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ uint dot(uint4 a, uint4 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// length
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float length(float2 v)
+{
+    return sqrtf(dot(v, v));
+}
+inline __host__ __device__ float length(float3 v)
+{
+    return sqrtf(dot(v, v));
+}
+inline __host__ __device__ float length(float4 v)
+{
+    return sqrtf(dot(v, v));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 normalize(float2 v)
+{
+    float invLen = rsqrtf(dot(v, v));
+    return v * invLen;
+}
+inline __host__ __device__ float3 normalize(float3 v)
+{
+    float invLen = rsqrtf(dot(v, v));
+    return v * invLen;
+}
+inline __host__ __device__ float4 normalize(float4 v)
+{
+    float invLen = rsqrtf(dot(v, v));
+    return v * invLen;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// floor
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 floorf(float2 v)
+{
+    return make_float2(floorf(v.x), floorf(v.y));
+}
+inline __host__ __device__ float3 floorf(float3 v)
+{
+    return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
+}
+inline __host__ __device__ float4 floorf(float4 v)
+{
+    return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// frac - returns the fractional portion of a scalar or each vector component
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float fracf(float v)
+{
+    return v - floorf(v);
+}
+inline __host__ __device__ float2 fracf(float2 v)
+{
+    return make_float2(fracf(v.x), fracf(v.y));
+}
+inline __host__ __device__ float3 fracf(float3 v)
+{
+    return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
+}
+inline __host__ __device__ float4 fracf(float4 v)
+{
+    return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fmodf(float2 a, float2 b)
+{
+    return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
+}
+inline __host__ __device__ float3 fmodf(float3 a, float3 b)
+{
+    return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
+}
+inline __host__ __device__ float4 fmodf(float4 a, float4 b)
+{
+    return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fabs(float2 v)
+{
+    return make_float2(fabs(v.x), fabs(v.y));
+}
+inline __host__ __device__ float3 fabs(float3 v)
+{
+    return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
+}
+inline __host__ __device__ float4 fabs(float4 v)
+{
+    return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
+}
+
+inline __host__ __device__ int2 abs(int2 v)
+{
+    return make_int2(abs(v.x), abs(v.y));
+}
+inline __host__ __device__ int3 abs(int3 v)
+{
+    return make_int3(abs(v.x), abs(v.y), abs(v.z));
+}
+inline __host__ __device__ int4 abs(int4 v)
+{
+    return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 reflect(float3 i, float3 n)
+{
+    return i - 2.0f * n * dot(n,i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 cross(float3 a, float3 b)
+{
+    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// smoothstep
+// - returns 0 if x < a
+// - returns 1 if x > b
+// - otherwise returns smooth interpolation between 0 and 1 based on x
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float smoothstep(float a, float b, float x)
+{
+    float y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (y*y*(3.0f - (2.0f*y)));
+}
+inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
+{
+    float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y)));
+}
+inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
+{
+    float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y)));
+}
+inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
+{
+    float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y)));
+}
+
+#endif

From 4d1689d6e94584f616f9f960c51c1ea97ef825ec Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sun, 19 Sep 2021 21:27:31 +0200
Subject: [PATCH 06/27] Each cuda::sws_t has it's own color matrix

---
 sunshine/platform/linux/cuda.cu | 64 ++++++++++++++++++++-------------
 sunshine/platform/linux/cuda.h  | 13 ++++++-
 2 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index a2ca6508..8b7d76b7 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -65,7 +65,9 @@ struct __attribute__((__aligned__(16))) color_extern_t {
   __float2 range_uv;
 };
 
-extern color_extern_t colors[4];
+static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch");
+
+extern color_t colors[4];
 } // namespace video
 
 //////////////////// End special declarations
@@ -91,36 +93,47 @@ inline static int check(cudaError_t result, const std::string_view &sv) {
   return 0;
 }
 
-__device__ __constant__ video::color_t color;
+template<class T>
+ptr_t make_ptr() {
+  void *p;
+  CU_CHECK_PTR(cudaMalloc(&p, sizeof(T)), "Couldn't allocate color matrix");
 
+  ptr_t ptr { p };
+
+  return ptr;
+}
+
+void freeCudaPtr_t::operator()(void *ptr) {
+  CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer");
+}
 
 inline __device__ float3 bgra_to_rgb(uchar4 vec) {
   return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
 }
 
-inline __device__ float2 calcUV(float3 pixel) {
-  float4 vec_u = color.color_vec_u;
-  float4 vec_v = color.color_vec_v;
+inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_matrix) {
+  float4 vec_u = color_matrix->color_vec_u;
+  float4 vec_v = color_matrix->color_vec_v;
 
   float u = dot(pixel, make_float3(vec_u)) + vec_u.w;
   float v = dot(pixel, make_float3(vec_v)) + vec_v.w;
 
-  u = u * color.range_uv.x + color.range_uv.y;
-  v = (v * color.range_uv.x + color.range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f;
+  u = u * color_matrix->range_uv.x + color_matrix->range_uv.y;
+  v = (v * color_matrix->range_uv.x + color_matrix->range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f;
 
   return make_float2(u, v);
 }
 
-inline __device__ float calcY(float3 pixel) {
-  float4 vec_y = color.color_vec_y;
+inline __device__ float calcY(float3 pixel, const video::color_t *const color_matrix) {
+  float4 vec_y = color_matrix->color_vec_y;
 
-  return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color.range_y.x + color.range_y.y;
+  return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color_matrix->range_y.x + color_matrix->range_y.y;
 }
 
 __global__ void RGBA_to_NV12(
   cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV,
   std::uint32_t dstPitchY, std::uint32_t dstPitchUV,
-  std::uint32_t width, std::uint32_t height) {
+  std::uint32_t width, std::uint32_t height, const video::color_t *const color_matrix) {
 
   int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2;
   int idY = (threadIdx.y + blockDim.y * blockIdx.y);
@@ -137,16 +150,16 @@ __global__ void RGBA_to_NV12(
   float3 rgb_l = bgra_to_rgb(tex2D<uchar4>(srcImage, x, y));
   float3 rgb_r = bgra_to_rgb(tex2D<uchar4>(srcImage, x + 0.25f / width, y + 1.0f / height));
 
-  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f);
+  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix);
 
   dstUV[0] = uv.x;
   dstUV[1] = uv.y;
-  dstY[0]  = calcY(rgb_l);
-  dstY[1]  = calcY(rgb_r);
+  dstY[0]  = calcY(rgb_l, color_matrix);
+  dstY[1]  = calcY(rgb_r, color_matrix);
 }
 
-sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock)
-    : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock } {
+sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock, ptr_t &&color_matrix)
+    : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } {
   auto format = cudaCreateChannelDesc<uchar4>();
 
   CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
@@ -186,7 +199,12 @@ std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, i
   CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
   CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
 
-  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2);
+  auto ptr = make_ptr<video::color_t>();
+  if(!ptr) {
+    return nullptr;
+  }
+
+  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr));
 
   if(sws->texture == INVALID_TEXTURE) {
     return nullptr;
@@ -202,15 +220,13 @@ int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std:
   dim3 block(threadsPerBlock, threadsPerBlock);
   dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock));
 
-  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, width, height);
+  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, width, height, (video::color_t*)color_matrix.get());
 
   return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
 }
 
 void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) {
-  color_range = 1;
-  colorspace = 5;
-  video::color_extern_t *color_p;
+  video::color_t *color_p;
   switch(colorspace) {
   case 5: // SWS_CS_SMPTE170M
     color_p = &video::colors[0];
@@ -228,7 +244,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range)
     ++color_p;
   }
 
-  auto color_matrix = *(video::color_t*)color_p;
+  auto color_matrix = *color_p;
   color_matrix.color_vec_y.w *= 256.0f;
   color_matrix.color_vec_u.w *= 256.0f;
   color_matrix.color_vec_v.w *= 256.0f;
@@ -236,9 +252,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range)
   color_matrix.range_y.y *= 256.0f;
   color_matrix.range_uv.y *= 256.0f;
 
-  static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch");
-
-  CU_CHECK_IGNORE(cudaMemcpyToSymbol(color, &color_matrix, sizeof(video::color_t)), "Couldn't copy color matrix to cuda");
+  CU_CHECK_IGNORE(cudaMemcpy(this->color_matrix.get(), &color_matrix, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda");
 }
 
 int sws_t::load_ram(platf::img_t &img) {
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index 41087506..0260dad4 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,6 +1,8 @@
 #ifndef SUNSHINE_PLATFORM_CUDA_H
 #define SUNSHINE_PLATFORM_CUDA_H
 
+#include <memory>
+
 #ifndef __NVCC__
 
 #include "sunshine/platform/common.h"
@@ -26,10 +28,18 @@ typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
 #endif /* !defined(__CUDACC__) */
 
 namespace cuda {
+
+class freeCudaPtr_t {
+public:
+  void operator()(void *ptr);
+};
+
+using ptr_t = std::unique_ptr<void, freeCudaPtr_t>;
+
 class sws_t {
 public:
   ~sws_t();
-  sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock);
+  sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock, ptr_t &&color_matrix);
 
   /**
    * in_width, out_width -- The width and height of the captured image in bytes
@@ -46,6 +56,7 @@ public:
 
   int load_ram(platf::img_t &img);
 
+  ptr_t color_matrix;
   cudaArray_t array;
   cudaTextureObject_t texture;
 

From a963b31c1dfbf329a01889d2ea94b5d985ba13ba Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sun, 19 Sep 2021 23:00:42 +0200
Subject: [PATCH 07/27] Ensure the background color is black

---
 sunshine/platform/linux/cuda.cpp | 24 ++++++++++++++-
 sunshine/platform/linux/cuda.cu  | 53 +++++++++++++++++++++++---------
 sunshine/platform/linux/cuda.h   | 18 +++++++----
 3 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index 811293d6..e57f914d 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -325,7 +325,7 @@ public:
     cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx;
 
     ctx_t ctx { cuda_ctx };
-    sws = sws_t::make(width * 4, height, frame->width, frame->height);
+    sws = sws_t::make(width, height, frame->width, frame->height, width * 4);
 
     if(!sws) {
       return -1;
@@ -343,6 +343,28 @@ public:
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
     ctx_t ctx { cuda_ctx };
     sws->set_colorspace(colorspace, color_range);
+
+
+    // The default green color is ugly.
+    // Update the background color
+    platf::img_t img;
+    img.width = frame->width;
+    img.height = frame->height;
+    img.pixel_pitch = 4;
+    img.row_pitch = img.width * img.pixel_pitch;
+
+    std::vector<std::uint8_t> image_data;
+    image_data.resize(img.row_pitch * img.height);
+
+    img.data = image_data.data();
+
+    if(sws->load_ram(img)) {
+      return;
+    }
+
+    sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], {
+      frame->width, frame->height, 0, 0
+    });
   }
 
   frame_t hwframe;
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index 8b7d76b7..1dac5dd5 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -133,22 +133,25 @@ inline __device__ float calcY(float3 pixel, const video::color_t *const color_ma
 __global__ void RGBA_to_NV12(
   cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV,
   std::uint32_t dstPitchY, std::uint32_t dstPitchUV,
-  std::uint32_t width, std::uint32_t height, const video::color_t *const color_matrix) {
+  const viewport_t viewport, const video::color_t *const color_matrix) {
 
   int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2;
   int idY = (threadIdx.y + blockDim.y * blockIdx.y);
 
-  if(idX >= width) return;
-  if(idY >= height) return;
+  if(idX >= viewport.width) return;
+  if(idY >= viewport.height) return;
+
+  float x = (float)idX / (float)viewport.width / 4;
+  float y = (float)idY / (float)viewport.height;
+
+  idX += viewport.offsetX;
+  idY += viewport.offsetY;
 
   dstY  = dstY + idX + idY * dstPitchY;
   dstUV = dstUV + idX + (idY / 2 * dstPitchUV);
 
-  float x = (float)idX / (float)width / 4;
-  float y = (float)idY / (float)height;
-
   float3 rgb_l = bgra_to_rgb(tex2D<uchar4>(srcImage, x, y));
-  float3 rgb_r = bgra_to_rgb(tex2D<uchar4>(srcImage, x + 0.25f / width, y + 1.0f / height));
+  float3 rgb_r = bgra_to_rgb(tex2D<uchar4>(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height));
 
   float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix);
 
@@ -158,11 +161,11 @@ __global__ void RGBA_to_NV12(
   dstY[1]  = calcY(rgb_r, color_matrix);
 }
 
-sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock, ptr_t &&color_matrix)
-    : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } {
+sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix)
+    : array {}, texture { INVALID_TEXTURE }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } {
   auto format = cudaCreateChannelDesc<uchar4>();
 
-  CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
+  CU_CHECK_VOID(cudaMallocArray(&array, &format, pitch, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
 
   cudaResourceDesc res {};
   res.resType         = cudaResourceTypeArray;
@@ -177,6 +180,22 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int thr
   std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
 
   CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture");
+
+
+  // Ensure aspect ratio is maintained
+  auto scalar       = std::fminf(out_width / (float)in_width, out_height / (float)in_height);
+  auto out_width_f  = in_width * scalar;
+  auto out_height_f = in_height * scalar;
+
+  // result is always positive
+  auto offsetX_f = (out_width - out_width_f) / 2;
+  auto offsetY_f = (out_height - out_height_f) / 2;
+
+  viewport.width  = out_width_f;
+  viewport.height = out_height_f;
+
+  viewport.offsetX = offsetX_f;
+  viewport.offsetY = offsetY_f;
 }
 
 sws_t::~sws_t() {
@@ -193,7 +212,7 @@ sws_t::~sws_t() {
   }
 }
 
-std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height) {
+std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) {
   cudaDeviceProp props;
   int device;
   CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
@@ -204,7 +223,7 @@ std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, i
     return nullptr;
   }
 
-  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr));
+  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr));
 
   if(sws->texture == INVALID_TEXTURE) {
     return nullptr;
@@ -214,13 +233,17 @@ std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, i
 }
 
 int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) {
-  int threadsX = width / 2;
-  int threadsY = height;
+  return convert(Y, UV, pitchY, pitchUV, viewport);
+}
+
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport) {
+  int threadsX = viewport.width / 2;
+  int threadsY = viewport.height;
 
   dim3 block(threadsPerBlock, threadsPerBlock);
   dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock));
 
-  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, width, height, (video::color_t*)color_matrix.get());
+  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t*)color_matrix.get());
 
   return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
 }
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index 0260dad4..27dede07 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -36,21 +36,27 @@ public:
 
 using ptr_t = std::unique_ptr<void, freeCudaPtr_t>;
 
+struct viewport_t {
+  int width, height;
+  int offsetX, offsetY;
+};
+
 class sws_t {
 public:
   ~sws_t();
-  sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock, ptr_t &&color_matrix);
+  sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix);
 
   /**
-   * in_width, out_width -- The width and height of the captured image in bytes
+   * in_width, in_height -- The width and height of the captured image in pixels
    * out_width, out_height -- the width and height of the NV12 image in pixels
    * 
-   * cuda_device -- pointer to the cuda device
+   * pitch -- The size of a single row of pixels in bytes
    */
-  static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height);
+  static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
 
   // Converts loaded image into a CUDevicePtr
   int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport);
 
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
 
@@ -60,9 +66,9 @@ public:
   cudaArray_t array;
   cudaTextureObject_t texture;
 
-  int width, height;
-
   int threadsPerBlock;
+
+  viewport_t viewport;
 };
 } // namespace cuda
 

From e3cc25f23febc0487fc6daed6e53804460d15864 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Mon, 20 Sep 2021 00:03:33 +0200
Subject: [PATCH 08/27] Use linear interpolation with the cuda kernel

---
 sunshine/platform/linux/cuda.cu | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index 1dac5dd5..63e40341 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -111,6 +111,10 @@ inline __device__ float3 bgra_to_rgb(uchar4 vec) {
   return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
 }
 
+inline __device__ float3 bgra_to_rgb(float4 vec) {
+  return make_float3(vec.z, vec.y, vec.x) * 255.0f;;
+}
+
 inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_matrix) {
   float4 vec_u = color_matrix->color_vec_u;
   float4 vec_v = color_matrix->color_vec_v;
@@ -150,8 +154,8 @@ __global__ void RGBA_to_NV12(
   dstY  = dstY + idX + idY * dstPitchY;
   dstUV = dstUV + idX + (idY / 2 * dstPitchUV);
 
-  float3 rgb_l = bgra_to_rgb(tex2D<uchar4>(srcImage, x, y));
-  float3 rgb_r = bgra_to_rgb(tex2D<uchar4>(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height));
+  float3 rgb_l = bgra_to_rgb(tex2D<float4>(srcImage, x, y));
+  float3 rgb_r = bgra_to_rgb(tex2D<float4>(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height));
 
   float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix);
 
@@ -173,8 +177,8 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit
 
   cudaTextureDesc desc {};
 
-  desc.readMode         = cudaReadModeElementType;
-  desc.filterMode       = cudaFilterModePoint;
+  desc.readMode         = cudaReadModeNormalizedFloat;
+  desc.filterMode       = cudaFilterModeLinear;
   desc.normalizedCoords = true;
 
   std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);

From 196f1f74715d0cf3cf91a5a6d110df4a7e9b7d0b Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Mon, 20 Sep 2021 00:21:54 +0200
Subject: [PATCH 09/27] Make changes in brightness of the color more visible

---
 sunshine/platform/linux/cuda.cu | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index 63e40341..e310964c 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -112,7 +112,7 @@ inline __device__ float3 bgra_to_rgb(uchar4 vec) {
 }
 
 inline __device__ float3 bgra_to_rgb(float4 vec) {
-  return make_float3(vec.z, vec.y, vec.x) * 255.0f;;
+  return make_float3(vec.z, vec.y, vec.x);
 }
 
 inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_matrix) {
@@ -123,7 +123,7 @@ inline __device__ float2 calcUV(float3 pixel, const video::color_t *const color_
   float v = dot(pixel, make_float3(vec_v)) + vec_v.w;
 
   u = u * color_matrix->range_uv.x + color_matrix->range_uv.y;
-  v = (v * color_matrix->range_uv.x + color_matrix->range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f;
+  v = (v * color_matrix->range_uv.x + color_matrix->range_uv.y) * 224.0f / 256.0f + 0.0625f;
 
   return make_float2(u, v);
 }
@@ -157,12 +157,12 @@ __global__ void RGBA_to_NV12(
   float3 rgb_l = bgra_to_rgb(tex2D<float4>(srcImage, x, y));
   float3 rgb_r = bgra_to_rgb(tex2D<float4>(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height));
 
-  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix);
+  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix) * 255.0f;
 
   dstUV[0] = uv.x;
   dstUV[1] = uv.y;
-  dstY[0]  = calcY(rgb_l, color_matrix);
-  dstY[1]  = calcY(rgb_r, color_matrix);
+  dstY[0]  = calcY(rgb_l, color_matrix) * 245.0f; // 245.0f is a magic number to ensure slight changes in luminosity are more visisble
+  dstY[1]  = calcY(rgb_r, color_matrix) * 245.0f; // 245.0f is a magic number to ensure slight changes in luminosity are more visisble
 }
 
 sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix)
@@ -271,15 +271,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range)
     ++color_p;
   }
 
-  auto color_matrix = *color_p;
-  color_matrix.color_vec_y.w *= 256.0f;
-  color_matrix.color_vec_u.w *= 256.0f;
-  color_matrix.color_vec_v.w *= 256.0f;
-
-  color_matrix.range_y.y *= 256.0f;
-  color_matrix.range_uv.y *= 256.0f;
-
-  CU_CHECK_IGNORE(cudaMemcpy(this->color_matrix.get(), &color_matrix, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda");
+  CU_CHECK_IGNORE(cudaMemcpy(color_matrix.get(), color_p, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda");
 }
 
 int sws_t::load_ram(platf::img_t &img) {

From bb912786bdbc65b08aeb5ae508f2d481ce4b2bf6 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Wed, 22 Sep 2021 11:36:59 +0200
Subject: [PATCH 10/27] Added NvFBC on Linux

---
 sunshine/platform/common.h          |   7 +
 sunshine/platform/linux/cuda.cpp    | 703 +++++++++++++++++-----------
 sunshine/platform/linux/cuda.cu     |  92 ++--
 sunshine/platform/linux/cuda.h      |  36 +-
 sunshine/platform/linux/misc.cpp    |  29 +-
 sunshine/platform/linux/x11grab.cpp |   4 +-
 sunshine/video.cpp                  |   2 +-
 7 files changed, 546 insertions(+), 327 deletions(-)

diff --git a/sunshine/platform/common.h b/sunshine/platform/common.h
index 7482f0ea..4b702a9b 100644
--- a/sunshine/platform/common.h
+++ b/sunshine/platform/common.h
@@ -141,6 +141,13 @@ public:
 
 struct img_t {
 public:
+  img_t() = default;
+
+  img_t(img_t &&)      = delete;
+  img_t(const img_t &) = delete;
+  img_t &operator=(img_t &&) = delete;
+  img_t &operator=(const img_t &) = delete;
+
   std::uint8_t *data {};
   std::int32_t width {};
   std::int32_t height {};
diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index e57f914d..25d88518 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -1,3 +1,5 @@
+#include <bitset>
+
 #include <NvFBC.h>
 #include <ffnvcodec/dynlink_loader.h>
 
@@ -55,96 +57,9 @@ inline static int check(CUresult result, const std::string_view &sv) {
   return 0;
 }
 
-class ctx_t {
+class img_t : public platf::img_t {
 public:
-  ctx_t(CUcontext ctx) {
-    CU_CHECK_IGNORE(cdf->cuCtxPushCurrent(ctx), "Couldn't push cuda context");
-  }
-
-  ~ctx_t() {
-    CUcontext dummy;
-
-    CU_CHECK_IGNORE(cdf->cuCtxPopCurrent(&dummy), "Couldn't pop cuda context");
-  }
-};
-
-void free_res(CUgraphicsResource res) {
-  cdf->cuGraphicsUnregisterResource(res);
-}
-
-using res_internal_t = util::safe_ptr<CUgraphicsResource_st, free_res>;
-
-template<std::size_t N>
-class res_t {
-public:
-  res_t() : resources {}, mapped { false } {}
-  res_t(res_t &&other) noexcept : resources { other.resources }, array_p { other.array_p }, ctx { other.ctx }, stream { other.stream } {
-    other.resources = std::array<res_internal_t::pointer, N> {};
-  }
-
-  res_t &operator=(res_t &&other) {
-    for(auto x = 0; x < N; ++x) {
-      std::swap(resources[x], other.resources[x]);
-      std::swap(array_p[x], other.array_p[x]);
-    }
-
-    std::swap(ctx, other.ctx);
-    std::swap(stream, other.stream);
-    std::swap(mapped, other.mapped);
-
-    return *this;
-  }
-
-  res_t(CUcontext ctx, CUstream stream) : resources {}, ctx { ctx }, stream { stream }, mapped { false } {}
-
-  int bind(gl::tex_t &tex) {
-    ctx_t ctx { this->ctx };
-
-    CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[0], tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y image");
-    CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[1], tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register uv image");
-
-    return 0;
-  }
-
-  int map() {
-    ctx_t ctx { this->ctx };
-
-    CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream), "Coudn't map cuda resources");
-
-    mapped = true;
-
-    CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[0], resources[0], 0, 0), "Couldn't get mapped subresource [0]");
-    CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[1], resources[1], 0, 0), "Couldn't get mapped subresource [1]");
-
-    return 0;
-  }
-
-  void unmap() {
-    // Either all or none are mapped
-    if(mapped) {
-      ctx_t ctx { this->ctx };
-
-      CU_CHECK_IGNORE(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream), "Couldn't unmap cuda resources");
-
-      mapped = false;
-    }
-  }
-
-  inline CUarray &operator[](std::size_t index) {
-    return array_p[index];
-  }
-
-  ~res_t() {
-    unmap();
-  }
-
-  std::array<res_internal_t::pointer, N> resources;
-  std::array<CUarray, N> array_p;
-
-  CUcontext ctx;
-  CUstream stream;
-
-  bool mapped;
+  tex_t tex;
 };
 
 int init() {
@@ -160,139 +75,8 @@ int init() {
   return 0;
 }
 
-class opengl_t : public platf::hwdevice_t {
-public:
-  int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
-    if(!cdf) {
-      BOOST_LOG(warning) << "cuda not initialized"sv;
-      return -1;
-    }
-
-    this->data = (void *)0x1;
-
-    display = egl::make_display(xdisplay);
-    if(!display) {
-      return -1;
-    }
-
-    auto ctx_opt = egl::make_ctx(display.get());
-    if(!ctx_opt) {
-      return -1;
-    }
-
-    ctx = std::move(*ctx_opt);
-
-    width  = in_width;
-    height = in_height;
-
-    return 0;
-  }
-
-  int set_frame(AVFrame *frame) override {
-    auto cuda_ctx = (AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx;
-
-    tex = gl::tex_t::make(2);
-    fb  = gl::frame_buf_t::make(2);
-
-    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]);
-    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RED, frame->width, frame->height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
-    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[1]);
-    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RG, frame->width / 2, frame->height / 2, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
-    gl::ctx.BindTexture(GL_TEXTURE_2D, 0);
-
-    fb.bind(std::begin(tex), std::end(tex));
-
-    res = res_t<2> { cuda_ctx->cuda_ctx, cuda_ctx->stream };
-
-    if(res.bind(tex)) {
-      return -1;
-    }
-
-    this->hwframe.reset(frame);
-    this->frame = frame;
-
-    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
-      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
-
-      return -1;
-    }
-
-    auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height);
-    if(!sws_opt) {
-      return -1;
-    }
-
-    sws = std::move(*sws_opt);
-    return sws.blank(fb, 0, 0, frame->width, frame->height);
-  }
-
-  int convert(platf::img_t &img) override {
-    sws.load_ram(img);
-
-    if(sws.convert(fb)) {
-      return -1;
-    }
-
-    if(res.map()) {
-      return -1;
-    }
-
-    // Push and pop cuda context
-    ctx_t ctx { res.ctx };
-    for(auto x = 0; x < 2; ++x) {
-      CUDA_MEMCPY2D desc {};
-
-      auto shift = x;
-
-      desc.srcPitch     = frame->width;
-      desc.dstPitch     = frame->linesize[x];
-      desc.Height       = frame->height >> shift;
-      desc.WidthInBytes = std::min(desc.srcPitch, desc.dstPitch);
-
-      desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
-      desc.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-
-      desc.srcArray  = res[x];
-      desc.dstDevice = (CUdeviceptr)frame->data[x];
-
-      CU_CHECK(cdf->cuMemcpy2DAsync(&desc, res.stream), "Couldn't copy from OpenGL to cuda");
-    }
-
-    res.unmap();
-
-    return 0;
-  }
-
-  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
-    sws.set_colorspace(colorspace, color_range);
-  }
-
-  frame_t hwframe;
-
-  egl::display_t display;
-  egl::ctx_t ctx;
-
-  egl::sws_t sws;
-
-  gl::tex_t tex;
-  gl::frame_buf_t fb;
-
-  res_t<2> res;
-
-  int width, height;
-};
-
 class cuda_t : public platf::hwdevice_t {
 public:
-  ~cuda_t() override {
-    // sws_t needs to be destroyed while the context is active
-    if(sws) {
-      ctx_t ctx { cuda_ctx };
-
-      sws.reset();
-    }
-  }
-
   int init(int in_width, int in_height) {
     if(!cdf) {
       BOOST_LOG(warning) << "cuda not initialized"sv;
@@ -322,78 +106,111 @@ public:
       return -1;
     }
 
-    cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx;
-
-    ctx_t ctx { cuda_ctx };
-    sws = sws_t::make(width, height, frame->width, frame->height, width * 4);
-
-    if(!sws) {
+    auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4);
+    if(!sws_opt) {
       return -1;
     }
 
+    sws = std::move(*sws_opt);
+
     return 0;
   }
 
-  int convert(platf::img_t &img) override {
-    ctx_t ctx { cuda_ctx };
-
-    return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]);
-  }
-
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
-    ctx_t ctx { cuda_ctx };
-    sws->set_colorspace(colorspace, color_range);
+    sws.set_colorspace(colorspace, color_range);
 
+    auto tex = tex_t::make(height, width * 4);
+    if(!tex) {
+      return;
+    }
 
     // The default green color is ugly.
     // Update the background color
     platf::img_t img;
-    img.width = frame->width;
-    img.height = frame->height;
+    img.width       = width;
+    img.height      = height;
     img.pixel_pitch = 4;
-    img.row_pitch = img.width * img.pixel_pitch;
+    img.row_pitch   = img.width * img.pixel_pitch;
 
     std::vector<std::uint8_t> image_data;
     image_data.resize(img.row_pitch * img.height);
 
     img.data = image_data.data();
 
-    if(sws->load_ram(img)) {
+    if(sws.load_ram(img, tex->array)) {
       return;
     }
 
-    sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], {
-      frame->width, frame->height, 0, 0
-    });
+    sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture, { frame->width, frame->height, 0, 0 });
   }
 
   frame_t hwframe;
 
-  std::unique_ptr<sws_t> sws;
-
   int width, height;
 
-  CUcontext cuda_ctx;
+  sws_t sws;
 };
 
-std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
+class cuda_ram_t : public cuda_t {
+public:
+  int convert(platf::img_t &img) override {
+    return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex.texture);
+  }
+
+  int set_frame(AVFrame *frame) {
+    if(cuda_t::set_frame(frame)) {
+      return -1;
+    }
+
+    auto tex_opt = tex_t::make(height, width * 4);
+    if(!tex_opt) {
+      return -1;
+    }
+
+    tex = std::move(*tex_opt);
+
+    return 0;
+  }
+
+  tex_t tex;
+};
+
+class cuda_vram_t : public cuda_t {
+public:
+  int convert(platf::img_t &img) override {
+    return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], (cudaTextureObject_t)img.data);
+  }
+};
+
+std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, bool vram) {
   if(init()) {
     return nullptr;
   }
 
-  auto cuda = std::make_shared<cuda_t>();
+  std::shared_ptr<cuda_t> cuda;
+
+  if(vram) {
+    cuda = std::make_shared<cuda_vram_t>();
+  }
+  else {
+    cuda = std::make_shared<cuda_ram_t>();
+  }
+
   if(cuda->init(width, height)) {
     return nullptr;
   }
 
   return cuda;
 }
-} // namespace cuda
 
-namespace platf::nvfbc {
+namespace nvfbc {
 static PNVFBCCREATEINSTANCE createInstance {};
 static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION };
 
+static constexpr inline NVFBC_BOOL nv_bool(bool b) {
+  return b ? NVFBC_TRUE : NVFBC_FALSE;
+}
+
 static void *handle { nullptr };
 int init() {
   static bool funcs_loaded = false;
@@ -418,49 +235,65 @@ int init() {
     return -1;
   }
 
+  auto status = cuda::nvfbc::createInstance(&cuda::nvfbc::func);
+  if(status) {
+    BOOST_LOG(error) << "Unable to create NvFBC instance"sv;
+
+    dlclose(handle);
+    handle = nullptr;
+    return -1;
+  }
+
   funcs_loaded = true;
   return 0;
 }
 
 class handle_t {
-  KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits<std::uint64_t>::max(), {
-    if(el == std::numeric_limits<std::uint64_t>::max()) {
-      return;
-    }
-    NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
-
-    auto status = func.nvFBCDestroyHandle(el, &params);
-    if(status) {
-      BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el);
-    }
-  });
+  enum flag_e {
+    SESSION_HANDLE,
+    SESSION_CAPTURE,
+    MAX_FLAGS,
+  };
 
 public:
+  handle_t() = default;
+  handle_t(handle_t &&other) : handle_flags { other.handle_flags }, handle { other.handle } {
+    other.handle_flags.reset();
+  }
+
+  handle_t &operator=(handle_t &&other) {
+    std::swap(handle_flags, other.handle_flags);
+    std::swap(handle, other.handle);
+
+    return *this;
+  }
+
   static std::optional<handle_t> make() {
     NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER };
-    session_t session;
 
-    auto status = func.nvFBCCreateHandle(&session.el, &params);
+    handle_t handle;
+    auto status = func.nvFBCCreateHandle(&handle.handle, &params);
     if(status) {
-      BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el);
-      session.release();
+      BOOST_LOG(error) << "Failed to create session: "sv << handle.last_error();
 
       return std::nullopt;
     }
 
-    return handle_t { std::move(session) };
+    handle.handle_flags[SESSION_HANDLE] = true;
+
+    return std::move(handle);
   }
 
   const char *last_error() {
-    return func.nvFBCGetLastErrorStr(session.el);
+    return func.nvFBCGetLastErrorStr(handle);
   }
 
   std::optional<NVFBC_GET_STATUS_PARAMS> status() {
     NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER };
 
-    auto status = func.nvFBCGetStatus(session.el, &params);
+    auto status = func.nvFBCGetStatus(handle, &params);
     if(status) {
-      BOOST_LOG(error) << "Failed to create session: "sv << last_error();
+      BOOST_LOG(error) << "Failed to get NvFBC status: "sv << last_error();
 
       return std::nullopt;
     }
@@ -468,23 +301,328 @@ public:
     return params;
   }
 
-  session_t session;
+  int capture(NVFBC_CREATE_CAPTURE_SESSION_PARAMS &capture_params) {
+    if(func.nvFBCCreateCaptureSession(handle, &capture_params)) {
+      BOOST_LOG(error) << "Failed to start capture session: "sv << last_error();
+      return -1;
+    }
+
+    handle_flags[SESSION_CAPTURE] = true;
+
+    NVFBC_TOCUDA_SETUP_PARAMS setup_params {
+      NVFBC_TOCUDA_SETUP_PARAMS_VER,
+      NVFBC_BUFFER_FORMAT_BGRA,
+    };
+
+    if(func.nvFBCToCudaSetUp(handle, &setup_params)) {
+      BOOST_LOG(error) << "Failed to setup cuda interop with nvFBC: "sv << last_error();
+      return -1;
+    }
+    return 0;
+  }
+
+  int stop() {
+    if(!handle_flags[SESSION_CAPTURE]) {
+      return 0;
+    }
+
+    NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER };
+
+    if(func.nvFBCDestroyCaptureSession(handle, &params)) {
+      BOOST_LOG(error) << "Couldn't destroy capture session: "sv << last_error();
+
+      return -1;
+    }
+
+    handle_flags[SESSION_CAPTURE] = false;
+
+    return 0;
+  }
+
+  ~handle_t() {
+    if(!handle_flags[SESSION_HANDLE]) {
+      return;
+    }
+
+    if(handle_flags[SESSION_CAPTURE]) {
+      NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER };
+
+      if(func.nvFBCDestroyCaptureSession(handle, &params)) {
+        BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle);
+      }
+    }
+
+    NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
+
+    if(func.nvFBCDestroyHandle(handle, &params)) {
+      BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle);
+    }
+  }
+
+  std::bitset<MAX_FLAGS> handle_flags;
+
+  NVFBC_SESSION_HANDLE handle;
 };
 
+class display_t : public platf::display_t {
+public:
+  int init(const std::string_view &display_name, int framerate) {
+    auto handle = handle_t::make();
+    if(!handle) {
+      return -1;
+    }
+
+    auto status_params = handle->status();
+    if(!status_params) {
+      return -1;
+    }
+
+    int streamedMonitor = -1;
+    if(!display_name.empty()) {
+      if(status_params->bXRandRAvailable) {
+        auto monitor_nr = util::from_view(display_name);
+
+        if(monitor_nr < 0 || monitor_nr >= status_params->dwOutputNum) {
+          BOOST_LOG(warning) << "Can't stream monitor ["sv << monitor_nr << "], it needs to be between [0] and ["sv << status_params->dwOutputNum - 1 << "], defaulting to virtual desktop"sv;
+        }
+        else {
+          streamedMonitor = monitor_nr;
+        }
+      }
+      else {
+        BOOST_LOG(warning) << "XrandR not available, streaming entire virtual desktop"sv;
+      }
+    }
+
+    capture_params = NVFBC_CREATE_CAPTURE_SESSION_PARAMS { NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER };
+
+    capture_params.eCaptureType                = NVFBC_CAPTURE_SHARED_CUDA;
+    capture_params.bDisableAutoModesetRecovery = nv_bool(true);
+
+    capture_params.dwSamplingRateMs = 1000 /* ms */ / framerate;
+
+    if(streamedMonitor != -1) {
+      auto &output = status_params->outputs[streamedMonitor];
+
+      width    = output.trackedBox.w;
+      height   = output.trackedBox.h;
+      offset_x = output.trackedBox.x;
+      offset_y = output.trackedBox.y;
+
+      capture_params.eTrackingType = NVFBC_TRACKING_OUTPUT;
+      capture_params.dwOutputId    = output.dwId;
+    }
+    else {
+      capture_params.eTrackingType = NVFBC_TRACKING_SCREEN;
+
+      width  = status_params->screenSize.w;
+      height = status_params->screenSize.h;
+    }
+
+    env_width  = status_params->screenSize.w;
+    env_height = status_params->screenSize.h;
+
+    this->handle = std::move(*handle);
+    return 0;
+  }
+
+  platf::capture_e capture(snapshot_cb_t &&snapshot_cb, std::shared_ptr<platf::img_t> img, bool *cursor) override {
+    // Force display_t::capture to initialize handle_t::capture
+    cursor_visible = !*cursor;
+
+    auto fg = util::fail_guard([&]() {
+      handle.stop();
+    });
+
+    while(img) {
+      auto status = snapshot(img.get(), 500ms, *cursor);
+      switch(status) {
+      case platf::capture_e::reinit:
+      case platf::capture_e::error:
+        return status;
+      case platf::capture_e::timeout:
+        std::this_thread::sleep_for(1ms);
+        continue;
+      case platf::capture_e::ok:
+        img = snapshot_cb(img);
+        break;
+      default:
+        BOOST_LOG(error) << "Unrecognized capture status ["sv << (int)status << ']';
+        return status;
+      }
+    }
+
+    return platf::capture_e::ok;
+  }
+
+  // Reinitialize the capture session.
+  platf::capture_e reinit(bool cursor) {
+    if(handle.stop()) {
+      return platf::capture_e::error;
+    }
+
+    cursor_visible = cursor;
+    if(false && cursor) {
+      capture_params.bPushModel          = nv_bool(false);
+      capture_params.bWithCursor         = nv_bool(true);
+      capture_params.bAllowDirectCapture = nv_bool(false);
+    }
+    else {
+      capture_params.bPushModel          = nv_bool(true);
+      capture_params.bWithCursor         = nv_bool(false);
+      capture_params.bAllowDirectCapture = nv_bool(true);
+    }
+
+    if(handle.capture(capture_params)) {
+      return platf::capture_e::error;
+    }
+
+    // If trying to capture directly, test if it actually does.
+    if(capture_params.bAllowDirectCapture) {
+      CUdeviceptr device_ptr;
+      NVFBC_FRAME_GRAB_INFO info;
+
+      NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab {
+        NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER,
+        NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY,
+        &device_ptr,
+        &info,
+        0,
+      };
+
+      // Direct Capture may fail the first few times, even if it's possible
+      for(int x = 0; x < 3; ++x) {
+        if(auto status = func.nvFBCToCudaGrabFrame(handle.handle, &grab)) {
+          if(status == NVFBC_ERR_MUST_RECREATE) {
+            return platf::capture_e::reinit;
+          }
+
+          BOOST_LOG(error) << "Couldn't capture nvFramebuffer: "sv << handle.last_error();
+
+          return platf::capture_e::error;
+        }
+
+        if(info.bDirectCapture) {
+          break;
+        }
+
+        BOOST_LOG(debug) << "Direct capture failed attempt ["sv << x << ']';
+      }
+
+      if(!info.bDirectCapture) {
+        BOOST_LOG(debug) << "Direct capture failed, trying the extra copy method"sv;
+        // Direct capture failed
+        capture_params.bPushModel          = nv_bool(false);
+        capture_params.bWithCursor         = nv_bool(false);
+        capture_params.bAllowDirectCapture = nv_bool(false);
+
+        if(handle.stop() || handle.capture(capture_params)) {
+          return platf::capture_e::error;
+        }
+      }
+    }
+
+    return platf::capture_e::ok;
+  }
+
+  platf::capture_e snapshot(platf::img_t *img, std::chrono::milliseconds timeout, bool cursor) {
+    if(cursor != cursor_visible) {
+      auto status = reinit(cursor);
+      if(status != platf::capture_e::ok) {
+        return status;
+      }
+    }
+
+    CUdeviceptr device_ptr;
+    NVFBC_FRAME_GRAB_INFO info;
+
+    NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab {
+      NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER,
+      NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY,
+      &device_ptr,
+      &info,
+      (std::uint32_t)timeout.count(),
+    };
+
+    if(auto status = func.nvFBCToCudaGrabFrame(handle.handle, &grab)) {
+      if(status == NVFBC_ERR_MUST_RECREATE) {
+        return platf::capture_e::reinit;
+      }
+
+      BOOST_LOG(error) << "Couldn't capture nvFramebuffer: "sv << handle.last_error();
+      return platf::capture_e::error;
+    }
+
+    if(!info.bIsNewFrame) {
+      return platf::capture_e::timeout;
+    }
+
+    if(((img_t *)img)->tex.copy((std::uint8_t *)device_ptr, img->height, img->row_pitch)) {
+      return platf::capture_e::error;
+    }
+
+    return platf::capture_e::ok;
+  }
+
+  std::shared_ptr<platf::hwdevice_t> make_hwdevice(platf::pix_fmt_e pix_fmt) override {
+    return ::cuda::make_hwdevice(width, height, true);
+  }
+
+  std::shared_ptr<platf::img_t> alloc_img() override {
+    auto img = std::make_shared<cuda::img_t>();
+
+    img->width       = width;
+    img->height      = height;
+    img->pixel_pitch = 4;
+    img->row_pitch   = img->width * img->pixel_pitch;
+
+    auto tex_opt = tex_t::make(height, width * img->pixel_pitch);
+    if(!tex_opt) {
+      return nullptr;
+    }
+
+    img->tex  = std::move(*tex_opt);
+    img->data = (std::uint8_t *)img->tex.texture;
+
+    return img;
+  };
+
+  int dummy_img(platf::img_t *) override {
+    return 0;
+  }
+
+  bool cursor_visible;
+  handle_t handle;
+
+  NVFBC_CREATE_CAPTURE_SESSION_PARAMS capture_params;
+};
+} // namespace nvfbc
+} // namespace cuda
+
+namespace platf {
+std::shared_ptr<display_t> nvfbc_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) {
+  if(hwdevice_type != mem_type_e::cuda) {
+    BOOST_LOG(error) << "Could not initialize nvfbc display with the given hw device type"sv;
+    return nullptr;
+  }
+
+  auto display = std::make_shared<cuda::nvfbc::display_t>();
+
+  if(display->init(display_name, framerate)) {
+    return nullptr;
+  }
+
+  return display;
+}
+
 std::vector<std::string> nvfbc_display_names() {
-  if(init()) {
+  if(cuda::init() || cuda::nvfbc::init()) {
     return {};
   }
 
   std::vector<std::string> display_names;
 
-  auto status = createInstance(&func);
-  if(status) {
-    BOOST_LOG(error) << "Unable to create NvFBC instance"sv;
-    return {};
-  }
-
-  auto handle = handle_t::make();
+  auto handle = cuda::nvfbc::handle_t::make();
   if(!handle) {
     return {};
   }
@@ -500,7 +638,18 @@ std::vector<std::string> nvfbc_display_names() {
 
   BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv;
   BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h;
+  BOOST_LOG(info) << "XrandR: "sv << (status_params->bXRandRAvailable ? "available"sv : "unavailable"sv);
+
+  for(auto x = 0; x < status_params->dwOutputNum; ++x) {
+    auto &output = status_params->outputs[x];
+    BOOST_LOG(info) << "-- Output --"sv;
+    BOOST_LOG(debug) << "  ID: "sv << output.dwId;
+    BOOST_LOG(debug) << "  Name: "sv << output.name;
+    BOOST_LOG(info) << "  Resolution: "sv << output.trackedBox.w << 'x' << output.trackedBox.h;
+    BOOST_LOG(info) << "  Offset: "sv << output.trackedBox.x << 'x' << output.trackedBox.y;
+    display_names.emplace_back(std::to_string(x));
+  }
 
   return display_names;
 }
-} // namespace platf::nvfbc
\ No newline at end of file
+} // namespace platf
\ No newline at end of file
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index e310964c..68368095 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -21,6 +21,9 @@ using namespace std::literals;
 #define CU_CHECK_PTR(x, y) \
   if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr;
 
+#define CU_CHECK_OPT(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return std::nullopt;
+
 #define CU_CHECK_IGNORE(x, y) \
   check((x), SUNSHINE_STRINGVIEW(y ": "))
 
@@ -165,15 +168,21 @@ __global__ void RGBA_to_NV12(
   dstY[1]  = calcY(rgb_r, color_matrix) * 245.0f; // 245.0f is a magic number to ensure slight changes in luminosity are more visisble
 }
 
-sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix)
-    : array {}, texture { INVALID_TEXTURE }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } {
-  auto format = cudaCreateChannelDesc<uchar4>();
+int tex_t::copy(std::uint8_t *src, int height, int pitch) {
+  CU_CHECK(cudaMemcpy2DToArray(array, 0, 0, src, pitch, pitch, height, cudaMemcpyDeviceToDevice), "Couldn't copy to cuda array from deviceptr");
 
-  CU_CHECK_VOID(cudaMallocArray(&array, &format, pitch, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
+  return 0;
+}
+
+std::optional<tex_t> tex_t::make(int height, int pitch) {
+  tex_t tex;
+
+  auto format = cudaCreateChannelDesc<uchar4>();
+  CU_CHECK_OPT(cudaMallocArray(&tex.array, &format, pitch, height, cudaArrayDefault), "Couldn't allocate cuda array");
 
   cudaResourceDesc res {};
   res.resType         = cudaResourceTypeArray;
-  res.res.array.array = array;
+  res.res.array.array = tex.array;
 
   cudaTextureDesc desc {};
 
@@ -183,9 +192,40 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit
 
   std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
 
-  CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture");
+  CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture, &res, &desc, nullptr), "Couldn't create cuda texture");
 
+  return std::move(tex);
+}
 
+tex_t::tex_t() : array { }, texture { INVALID_TEXTURE } {}
+tex_t::tex_t(tex_t &&other) : array { other.array }, texture { other.texture } {
+  other.array = 0;
+  other.texture = INVALID_TEXTURE;
+}
+
+tex_t &tex_t::operator=(tex_t &&other) {
+  std::swap(array, other.array);
+  std::swap(texture, other.texture);
+
+  return *this;
+}
+
+tex_t::~tex_t() {
+  if(texture != INVALID_TEXTURE) {
+    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
+
+    texture = INVALID_TEXTURE;
+  }
+
+  if(array) {
+    CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array");
+
+    array = cudaArray_t {};
+  }
+}
+
+sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix)
+    : threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } {
   // Ensure aspect ratio is maintained
   auto scalar       = std::fminf(out_width / (float)in_width, out_height / (float)in_height);
   auto out_width_f  = in_width * scalar;
@@ -202,52 +242,32 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit
   viewport.offsetY = offsetY_f;
 }
 
-sws_t::~sws_t() {
-  if(texture != INVALID_TEXTURE) {
-    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
-
-    texture = INVALID_TEXTURE;
-  }
-
-  if(array) {
-    CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array");
-
-    array = cudaArray_t {};
-  }
-}
-
-std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) {
+std::optional<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) {
   cudaDeviceProp props;
   int device;
-  CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
-  CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
+  CU_CHECK_OPT(cudaGetDevice(&device), "Couldn't get cuda device");
+  CU_CHECK_OPT(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
 
   auto ptr = make_ptr<video::color_t>();
   if(!ptr) {
-    return nullptr;
+    return std::nullopt;
   }
 
-  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr));
-
-  if(sws->texture == INVALID_TEXTURE) {
-    return nullptr;
-  }
-
-  return sws;
+  return std::make_optional<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr));
 }
 
-int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) {
-  return convert(Y, UV, pitchY, pitchUV, viewport);
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
+  return convert(Y, UV, pitchY, pitchUV, texture, viewport);
 }
 
-int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport) {
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) {
   int threadsX = viewport.width / 2;
   int threadsY = viewport.height;
 
   dim3 block(threadsPerBlock, threadsPerBlock);
   dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock));
 
-  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t*)color_matrix.get());
+  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get());
 
   return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
 }
@@ -274,7 +294,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range)
   CU_CHECK_IGNORE(cudaMemcpy(color_matrix.get(), color_p, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda");
 }
 
-int sws_t::load_ram(platf::img_t &img) {
+int sws_t::load_ram(platf::img_t &img, cudaArray_t array) {
   return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array");
 }
 
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index 27dede07..08cb0156 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,15 +1,16 @@
 #ifndef SUNSHINE_PLATFORM_CUDA_H
 #define SUNSHINE_PLATFORM_CUDA_H
 
-#include <memory>
-
 #ifndef __NVCC__
 
 #include "sunshine/platform/common.h"
 #include "x11grab.h"
 
 namespace cuda {
-std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay);
+namespace nvfbc {
+std::vector<std::string> display_names();
+}
+std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, bool vram);
 int init();
 } // namespace cuda
 
@@ -41,9 +42,26 @@ struct viewport_t {
   int offsetX, offsetY;
 };
 
+class tex_t {
+public:
+  static std::optional<tex_t> make(int height, int pitch);
+
+  tex_t();
+  tex_t(tex_t &&);
+
+  tex_t &operator=(tex_t &&other);
+
+  ~tex_t();
+
+  int copy(std::uint8_t *src, int height, int pitch);
+
+  cudaArray_t array;
+  cudaTextureObject_t texture;
+};
+
 class sws_t {
 public:
-  ~sws_t();
+  sws_t() = default;
   sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix);
 
   /**
@@ -52,19 +70,17 @@ public:
    * 
    * pitch -- The size of a single row of pixels in bytes
    */
-  static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
+  static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
 
   // Converts loaded image into a CUDevicePtr
-  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV);
-  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport);
 
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
 
-  int load_ram(platf::img_t &img);
+  int load_ram(platf::img_t &img, cudaArray_t array);
 
   ptr_t color_matrix;
-  cudaArray_t array;
-  cudaTextureObject_t texture;
 
   int threadsPerBlock;
 
diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp
index ad5afc5a..e10242ff 100644
--- a/sunshine/platform/linux/misc.cpp
+++ b/sunshine/platform/linux/misc.cpp
@@ -141,6 +141,9 @@ std::string get_mac_address(const std::string_view &address) {
 }
 
 enum class source_e {
+  // #ifdef SUNSHINE_BUILD_CUDA
+  NVFBC,
+// #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   WAYLAND,
 #endif
@@ -153,6 +156,15 @@ enum class source_e {
 };
 static source_e source;
 
+// #ifdef SUNSHINE_BUILD_CUDA
+std::vector<std::string> nvfbc_display_names();
+std::shared_ptr<display_t> nvfbc_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate);
+
+bool verify_nvfbc() {
+  return !nvfbc_display_names().empty();
+}
+// #endif
+
 #ifdef SUNSHINE_BUILD_WAYLAND
 std::vector<std::string> wl_display_names();
 std::shared_ptr<display_t> wl_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate);
@@ -182,6 +194,10 @@ bool verify_x11() {
 
 std::vector<std::string> display_names() {
   switch(source) {
+  // #ifdef SUNSHINE_BUILD_CUDA
+  case source_e::NVFBC:
+    return nvfbc_display_names();
+    // #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   case source_e::WAYLAND:
     return wl_display_names();
@@ -201,6 +217,10 @@ std::vector<std::string> display_names() {
 
 std::shared_ptr<display_t> display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) {
   switch(source) {
+    // #ifdef SUNSHINE_BUILD_CUDA
+  case source_e::NVFBC:
+    return nvfbc_display(hwdevice_type, display_name, framerate);
+    // #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   case source_e::WAYLAND:
     return wl_display(hwdevice_type, display_name, framerate);
@@ -229,7 +249,7 @@ std::unique_ptr<deinit_t> init() {
     window_system = window_system_e::WAYLAND;
   }
 #endif
-#ifdef SUNSHINE_BUILD_X11
+#if defined(SUNSHINE_BUILD_X11) // || defined(SUNSHINE_BUILD_CUDA)
   if(std::getenv("DISPLAY") && window_system != window_system_e::WAYLAND) {
     if(std::getenv("WAYLAND_DISPLAY")) {
       BOOST_LOG(warning) << "Wayland detected, yet sunshine will use X11 for screencasting, screencasting will only work on XWayland applications"sv;
@@ -238,6 +258,13 @@ std::unique_ptr<deinit_t> init() {
     window_system = window_system_e::X11;
   }
 #endif
+// #ifdef SUNSHINE_BUILD_CUDA
+  if(verify_nvfbc()) {
+    BOOST_LOG(info) << "Using nvFBC for screencasting"sv;
+    source = source_e::NVFBC;
+    goto found_source;
+  }
+// #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   if(verify_wl()) {
     BOOST_LOG(info) << "Using Wayland for screencasting"sv;
diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp
index c8bc70f0..decf24aa 100644
--- a/sunshine/platform/linux/x11grab.cpp
+++ b/sunshine/platform/linux/x11grab.cpp
@@ -518,7 +518,7 @@ struct x11_attr_t : public display_t {
     }
 
     if(mem_type == mem_type_e::cuda) {
-      return cuda::make_hwdevice(width, height, xdisplay.get());
+      return cuda::make_hwdevice(width, height, false);
     }
 
     return std::make_shared<hwdevice_t>();
@@ -678,7 +678,7 @@ struct shm_attr_t : public x11_attr_t {
 
 std::shared_ptr<display_t> x11_display(platf::mem_type_e hwdevice_type, const std::string &display_name, int framerate) {
   if(hwdevice_type != platf::mem_type_e::system && hwdevice_type != platf::mem_type_e::vaapi && hwdevice_type != platf::mem_type_e::cuda) {
-    BOOST_LOG(error) << "Could not initialize display with the given hw device type."sv;
+    BOOST_LOG(error) << "Could not initialize x11 display with the given hw device type"sv;
     return nullptr;
   }
 
diff --git a/sunshine/video.cpp b/sunshine/video.cpp
index 3b143cbe..6faf82ee 100644
--- a/sunshine/video.cpp
+++ b/sunshine/video.cpp
@@ -1699,7 +1699,7 @@ util::Either<buffer_t, int> vaapi_make_hwdevice_ctx(platf::hwdevice_t *base) {
 util::Either<buffer_t, int> cuda_make_hwdevice_ctx(platf::hwdevice_t *base) {
   buffer_t hw_device_buf;
 
-  auto status = av_hwdevice_ctx_create(&hw_device_buf, AV_HWDEVICE_TYPE_CUDA, nullptr, nullptr, 0);
+  auto status = av_hwdevice_ctx_create(&hw_device_buf, AV_HWDEVICE_TYPE_CUDA, nullptr, nullptr, 1 /* AV_CUDA_USE_PRIMARY_CONTEXT */);
   if(status < 0) {
     char string[AV_ERROR_MAX_STRING_SIZE];
     BOOST_LOG(error) << "Failed to create a CUDA device: "sv << av_make_error_string(string, AV_ERROR_MAX_STRING_SIZE, status);

From d0529fb2347b878221196fb2cc8fd7c2690ee6ef Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Wed, 22 Sep 2021 14:17:08 +0200
Subject: [PATCH 11/27] Make dependency on cuda development files optional

---
 CMakeLists.txt                      | 44 ++++++++++++++++++++---------
 sunshine/platform/linux/cuda.cpp    |  1 -
 sunshine/platform/linux/cuda.h      | 18 ++++++------
 sunshine/platform/linux/misc.cpp    | 22 +++++++--------
 sunshine/platform/linux/x11grab.cpp |  4 ++-
 5 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df0424d5..d35512f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -107,17 +107,28 @@ else()
 	option(SUNSHINE_ENABLE_DRM "Enable KMS grab if available" ON)
 	option(SUNSHINE_ENABLE_X11 "Enable X11 grab if available" ON)
 	option(SUNSHINE_ENABLE_WAYLAND "Enable building wayland specific code" ON)
-
-	if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  	set(CMAKE_CUDA_ARCHITECTURES 75)
-	endif()
-	enable_language(CUDA)
+	option(SUNSHINE_ENABLE_CUDA "Enable cuda specific code" ON)
 
 	if(${SUNSHINE_ENABLE_X11})
 		find_package(X11)
 	else()
 		set(X11_FOUND OFF)
 	endif()
+
+	set(CUDA_FOUND OFF)
+	if(${SUNSHINE_ENABLE_CUDA})
+		include(CheckLanguage)
+		check_language(CUDA)
+
+		if(CMAKE_CUDA_COMPILER)
+			if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+				set(CMAKE_CUDA_ARCHITECTURES 75)
+			endif()
+
+			set(CUDA_FOUND ON)
+			enable_language(CUDA)
+		endif()
+	endif()
 	if(${SUNSHINE_ENABLE_DRM})
 		find_package(LIBDRM)
 		find_package(LIBCAP)
@@ -138,6 +149,17 @@ else()
 		include_directories(${X11_INCLUDE_DIR})
 		list(APPEND PLATFORM_TARGET_FILES sunshine/platform/linux/x11grab.cpp)
 	endif()
+
+	if(CUDA_FOUND)
+		include_directories(third-party/nvfbc)
+		list(APPEND PLATFORM_TARGET_FILES
+			sunshine/platform/linux/cuda.cu
+			sunshine/platform/linux/cuda.cpp
+			third-party/nvfbc/NvFBC.h)
+
+			add_compile_definitions(SUNSHINE_BUILD_CUDA)
+	endif()
+
 	if(LIBDRM_FOUND AND LIBCAP_FOUND)
 		add_compile_definitions(SUNSHINE_BUILD_DRM)
 		include_directories(${LIBDRM_INCLUDE_DIRS} ${LIBCAP_INCLUDE_DIRS})
@@ -187,16 +209,14 @@ else()
 			sunshine/platform/linux/wlgrab.cpp
 			sunshine/platform/linux/wayland.cpp)
 	endif()
-	if(NOT ${X11_FOUND} AND NOT (${LIBDRM_FOUND} AND ${LIBCAP_FOUND}) AND NOT ${WAYLAND_FOUND})
-		message(FATAL_ERROR "Couldn't find either x11, wayland or (libdrm and libcap)")
+	if(NOT ${X11_FOUND} AND NOT (${LIBDRM_FOUND} AND ${LIBCAP_FOUND}) AND NOT ${WAYLAND_FOUND} AND NOT ${})
+		message(FATAL_ERROR "Couldn't find either x11, wayland, cuda or (libdrm and libcap)")
 	endif()
 
 	list(APPEND PLATFORM_TARGET_FILES
 		sunshine/platform/linux/publish.cpp
 		sunshine/platform/linux/vaapi.h
 		sunshine/platform/linux/vaapi.cpp
-		sunshine/platform/linux/cuda.cu
-		sunshine/platform/linux/cuda.cpp
 		sunshine/platform/linux/cuda.h
 		sunshine/platform/linux/graphics.h
 		sunshine/platform/linux/graphics.cpp
@@ -211,8 +231,7 @@ else()
 		third-party/glad/include/EGL/eglplatform.h
 		third-party/glad/include/KHR/khrplatform.h
 		third-party/glad/include/glad/gl.h
-		third-party/glad/include/glad/egl.h
-		third-party/nvfbc/NvFBC.h)
+		third-party/glad/include/glad/egl.h)
 		
 	list(APPEND PLATFORM_LIBRARIES
 		dl
@@ -224,8 +243,7 @@ else()
 	include_directories(
 		/usr/include/libevdev-1.0
 		third-party/nv-codec-headers/include
-		third-party/glad/include
-		third-party/nvfbc)
+		third-party/glad/include)
 
 	if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH)
 		set(SUNSHINE_EXECUTABLE_PATH "sunshine")
diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index 25d88518..65bd07ee 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -14,7 +14,6 @@ extern "C" {
 #include "sunshine/main.h"
 #include "sunshine/utility.h"
 #include "wayland.h"
-#include "x11grab.h"
 
 #define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
 #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index 08cb0156..7e377f20 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,10 +1,14 @@
-#ifndef SUNSHINE_PLATFORM_CUDA_H
+#if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA)
 #define SUNSHINE_PLATFORM_CUDA_H
 
-#ifndef __NVCC__
+#include <vector>
+#include <memory>
+#include <optional>
 
-#include "sunshine/platform/common.h"
-#include "x11grab.h"
+namespace platf {
+  class hwdevice_t;
+  class img_t;
+}
 
 namespace cuda {
 namespace nvfbc {
@@ -14,12 +18,6 @@ std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, bool vra
 int init();
 } // namespace cuda
 
-#else
-namespace platf {
-class img_t;
-}
-#endif
-
 typedef struct cudaArray *cudaArray_t;
 
 #if !defined(__CUDACC__)
diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp
index e10242ff..bf6201b3 100644
--- a/sunshine/platform/linux/misc.cpp
+++ b/sunshine/platform/linux/misc.cpp
@@ -141,9 +141,9 @@ std::string get_mac_address(const std::string_view &address) {
 }
 
 enum class source_e {
-  // #ifdef SUNSHINE_BUILD_CUDA
+#ifdef SUNSHINE_BUILD_CUDA
   NVFBC,
-// #endif
+#endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   WAYLAND,
 #endif
@@ -156,14 +156,14 @@ enum class source_e {
 };
 static source_e source;
 
-// #ifdef SUNSHINE_BUILD_CUDA
+#ifdef SUNSHINE_BUILD_CUDA
 std::vector<std::string> nvfbc_display_names();
 std::shared_ptr<display_t> nvfbc_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate);
 
 bool verify_nvfbc() {
   return !nvfbc_display_names().empty();
 }
-// #endif
+#endif
 
 #ifdef SUNSHINE_BUILD_WAYLAND
 std::vector<std::string> wl_display_names();
@@ -194,10 +194,10 @@ bool verify_x11() {
 
 std::vector<std::string> display_names() {
   switch(source) {
-  // #ifdef SUNSHINE_BUILD_CUDA
+#ifdef SUNSHINE_BUILD_CUDA
   case source_e::NVFBC:
     return nvfbc_display_names();
-    // #endif
+#endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   case source_e::WAYLAND:
     return wl_display_names();
@@ -217,10 +217,10 @@ std::vector<std::string> display_names() {
 
 std::shared_ptr<display_t> display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) {
   switch(source) {
-    // #ifdef SUNSHINE_BUILD_CUDA
+#ifdef SUNSHINE_BUILD_CUDA
   case source_e::NVFBC:
     return nvfbc_display(hwdevice_type, display_name, framerate);
-    // #endif
+#endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   case source_e::WAYLAND:
     return wl_display(hwdevice_type, display_name, framerate);
@@ -249,7 +249,7 @@ std::unique_ptr<deinit_t> init() {
     window_system = window_system_e::WAYLAND;
   }
 #endif
-#if defined(SUNSHINE_BUILD_X11) // || defined(SUNSHINE_BUILD_CUDA)
+#if defined(SUNSHINE_BUILD_X11) || defined(SUNSHINE_BUILD_CUDA)
   if(std::getenv("DISPLAY") && window_system != window_system_e::WAYLAND) {
     if(std::getenv("WAYLAND_DISPLAY")) {
       BOOST_LOG(warning) << "Wayland detected, yet sunshine will use X11 for screencasting, screencasting will only work on XWayland applications"sv;
@@ -258,13 +258,13 @@ std::unique_ptr<deinit_t> init() {
     window_system = window_system_e::X11;
   }
 #endif
-// #ifdef SUNSHINE_BUILD_CUDA
+#ifdef SUNSHINE_BUILD_CUDA
   if(verify_nvfbc()) {
     BOOST_LOG(info) << "Using nvFBC for screencasting"sv;
     source = source_e::NVFBC;
     goto found_source;
   }
-// #endif
+#endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   if(verify_wl()) {
     BOOST_LOG(info) << "Using Wayland for screencasting"sv;
diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp
index decf24aa..eb303d46 100644
--- a/sunshine/platform/linux/x11grab.cpp
+++ b/sunshine/platform/linux/x11grab.cpp
@@ -20,11 +20,11 @@
 #include "sunshine/main.h"
 #include "sunshine/task_pool.h"
 
+#include "cuda.h"
 #include "graphics.h"
 #include "misc.h"
 #include "vaapi.h"
 #include "x11grab.h"
-#include "cuda.h"
 
 using namespace std::literals;
 
@@ -517,9 +517,11 @@ struct x11_attr_t : public display_t {
       return va::make_hwdevice(width, height, false);
     }
 
+#ifdef SUNSHINE_BUILD_CUDA
     if(mem_type == mem_type_e::cuda) {
       return cuda::make_hwdevice(width, height, false);
     }
+#endif
 
     return std::make_shared<hwdevice_t>();
   }

From b3304a059d9c8827482eb9b9c55185e372945208 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Wed, 22 Sep 2021 14:49:49 +0200
Subject: [PATCH 12/27] Target older cuda architecture for compatibility

---
 CMakeLists.txt                   | 2 +-
 sunshine/platform/linux/cuda.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d35512f4..2eefcdac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -122,7 +122,7 @@ else()
 
 		if(CMAKE_CUDA_COMPILER)
 			if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-				set(CMAKE_CUDA_ARCHITECTURES 75)
+				set(CMAKE_CUDA_ARCHITECTURES 35)
 			endif()
 
 			set(CUDA_FOUND ON)
diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index 65bd07ee..b08b832a 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -461,7 +461,7 @@ public:
     }
 
     cursor_visible = cursor;
-    if(false && cursor) {
+    if(cursor) {
       capture_params.bPushModel          = nv_bool(false);
       capture_params.bWithCursor         = nv_bool(true);
       capture_params.bAllowDirectCapture = nv_bool(false);

From bd7294e6728dfbac57b6155a73ba196af5e64cbd Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Wed, 22 Sep 2021 19:12:20 +0200
Subject: [PATCH 13/27] Fix cuda kernel launch when encoding in 4K

---
 sunshine/platform/linux/cuda.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index 68368095..74f99c09 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -264,10 +264,10 @@ int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std:
   int threadsX = viewport.width / 2;
   int threadsY = viewport.height;
 
-  dim3 block(threadsPerBlock, threadsPerBlock);
-  dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock));
+  dim3 block(threadsPerBlock);
+  dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
 
-  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get());
+  RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get());
 
   return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
 }

From f78a9e2ccf48c722bae9117728d88a06345a8053 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sat, 25 Sep 2021 12:44:45 +0200
Subject: [PATCH 14/27] Fix downscaling image when using cuda

---
 sunshine/platform/linux/cuda.cu | 20 +++++++++++---------
 sunshine/platform/linux/cuda.h  |  2 ++
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index 74f99c09..7bfb5eab 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -140,7 +140,7 @@ inline __device__ float calcY(float3 pixel, const video::color_t *const color_ma
 __global__ void RGBA_to_NV12(
   cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV,
   std::uint32_t dstPitchY, std::uint32_t dstPitchUV,
-  const viewport_t viewport, const video::color_t *const color_matrix) {
+  float scale, const viewport_t viewport, const video::color_t *const color_matrix) {
 
   int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2;
   int idY = (threadIdx.y + blockDim.y * blockIdx.y);
@@ -148,8 +148,8 @@ __global__ void RGBA_to_NV12(
   if(idX >= viewport.width) return;
   if(idY >= viewport.height) return;
 
-  float x = (float)idX / (float)viewport.width / 4;
-  float y = (float)idY / (float)viewport.height;
+  float x = idX * scale;
+  float y = idY * scale;
 
   idX += viewport.offsetX;
   idY += viewport.offsetY;
@@ -158,7 +158,7 @@ __global__ void RGBA_to_NV12(
   dstUV = dstUV + idX + (idY / 2 * dstPitchUV);
 
   float3 rgb_l = bgra_to_rgb(tex2D<float4>(srcImage, x, y));
-  float3 rgb_r = bgra_to_rgb(tex2D<float4>(srcImage, x + 0.25f / viewport.width, y + 1.0f / viewport.height));
+  float3 rgb_r = bgra_to_rgb(tex2D<float4>(srcImage, x + scale, y));
 
   float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix) * 255.0f;
 
@@ -188,7 +188,7 @@ std::optional<tex_t> tex_t::make(int height, int pitch) {
 
   desc.readMode         = cudaReadModeNormalizedFloat;
   desc.filterMode       = cudaFilterModeLinear;
-  desc.normalizedCoords = true;
+  desc.normalizedCoords = false;
 
   std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
 
@@ -197,9 +197,9 @@ std::optional<tex_t> tex_t::make(int height, int pitch) {
   return std::move(tex);
 }
 
-tex_t::tex_t() : array { }, texture { INVALID_TEXTURE } {}
+tex_t::tex_t() : array {}, texture { INVALID_TEXTURE } {}
 tex_t::tex_t(tex_t &&other) : array { other.array }, texture { other.texture } {
-  other.array = 0;
+  other.array   = 0;
   other.texture = INVALID_TEXTURE;
 }
 
@@ -240,6 +240,8 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit
 
   viewport.offsetX = offsetX_f;
   viewport.offsetY = offsetY_f;
+
+  scale = 1.0f / scalar;
 }
 
 std::optional<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) {
@@ -253,7 +255,7 @@ std::optional<sws_t> sws_t::make(int in_width, int in_height, int out_width, int
     return std::nullopt;
   }
 
-  return std::make_optional<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr));
+  return std::make_optional<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
 }
 
 int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
@@ -267,7 +269,7 @@ int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std:
   dim3 block(threadsPerBlock);
   dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
 
-  RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get());
+  RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
 
   return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
 }
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index 7e377f20..d55ab8d0 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -83,6 +83,8 @@ public:
   int threadsPerBlock;
 
   viewport_t viewport;
+
+  float scale;
 };
 } // namespace cuda
 

From e287404992822cb22bd81cca6d3d12dcb9b1bdb5 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sat, 25 Sep 2021 14:44:38 +0200
Subject: [PATCH 15/27] Handle acquiring display names based on encoder

---
 sunshine/platform/common.h                 |  4 +-
 sunshine/platform/linux/misc.cpp           | 67 +++++++++++-----------
 sunshine/platform/windows/display_base.cpp |  2 +-
 sunshine/video.cpp                         | 38 ++++++------
 4 files changed, 54 insertions(+), 57 deletions(-)

diff --git a/sunshine/platform/common.h b/sunshine/platform/common.h
index 4b702a9b..9d465749 100644
--- a/sunshine/platform/common.h
+++ b/sunshine/platform/common.h
@@ -286,8 +286,8 @@ std::unique_ptr<audio_control_t> audio_control();
  */
 std::shared_ptr<display_t> display(mem_type_e hwdevice_type, const std::string &display_name, int framerate);
 
-// A list of names of displays accepted as display_name
-std::vector<std::string> display_names();
+// A list of names of displays accepted as display_name with the mem_type_e
+std::vector<std::string> display_names(mem_type_e hwdevice_type);
 
 input_t input();
 void move_mouse(input_t &input, int deltaX, int deltaY);
diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp
index bf6201b3..dd114ec4 100644
--- a/sunshine/platform/linux/misc.cpp
+++ b/sunshine/platform/linux/misc.cpp
@@ -140,7 +140,8 @@ std::string get_mac_address(const std::string_view &address) {
   return "00:00:00:00:00:00"s;
 }
 
-enum class source_e {
+namespace source {
+enum source_e : std::size_t {
 #ifdef SUNSHINE_BUILD_CUDA
   NVFBC,
 #endif
@@ -153,8 +154,11 @@ enum class source_e {
 #ifdef SUNSHINE_BUILD_X11
   X11,
 #endif
+  MAX_FLAGS
 };
-static source_e source;
+} // namespace source
+
+static std::bitset<source::MAX_FLAGS> sources;
 
 #ifdef SUNSHINE_BUILD_CUDA
 std::vector<std::string> nvfbc_display_names();
@@ -192,48 +196,48 @@ bool verify_x11() {
 }
 #endif
 
-std::vector<std::string> display_names() {
-  switch(source) {
+std::vector<std::string> display_names(mem_type_e hwdevice_type) {
 #ifdef SUNSHINE_BUILD_CUDA
-  case source_e::NVFBC:
-    return nvfbc_display_names();
+  // display using NvFBC only supports mem_type_e::cuda
+  if(sources[source::NVFBC] && hwdevice_type == mem_type_e::cuda) return nvfbc_display_names();
 #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
-  case source_e::WAYLAND:
-    return wl_display_names();
+  if(sources[source::WAYLAND]) return wl_display_names();
 #endif
 #ifdef SUNSHINE_BUILD_DRM
-  case source_e::KMS:
-    return kms_display_names();
+  if(sources[source::KMS]) return kms_display_names();
 #endif
 #ifdef SUNSHINE_BUILD_X11
-  case source_e::X11:
-    return x11_display_names();
+  if(sources[source::X11]) return x11_display_names();
 #endif
-  }
-
   return {};
 }
 
 std::shared_ptr<display_t> display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) {
-  switch(source) {
 #ifdef SUNSHINE_BUILD_CUDA
-  case source_e::NVFBC:
+  if(sources[source::NVFBC] && hwdevice_type == mem_type_e::cuda) {
+    BOOST_LOG(info) << "Screencasting with NvFBC"sv;
     return nvfbc_display(hwdevice_type, display_name, framerate);
+  }
 #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
-  case source_e::WAYLAND:
+  if(sources[source::WAYLAND]) {
+    BOOST_LOG(info) << "Screencasting with Wayland's protocol"sv;
     return wl_display(hwdevice_type, display_name, framerate);
+  }
 #endif
 #ifdef SUNSHINE_BUILD_DRM
-  case source_e::KMS:
+  if(sources[source::KMS]) {
+    BOOST_LOG(info) << "Screencasting with KMS"sv;
     return kms_display(hwdevice_type, display_name, framerate);
+  }
 #endif
 #ifdef SUNSHINE_BUILD_X11
-  case source_e::X11:
+  if(sources[source::X11]) {
+    BOOST_LOG(info) << "Screencasting with X11"sv;
     return x11_display(hwdevice_type, display_name, framerate);
-#endif
   }
+#endif
 
   return nullptr;
 }
@@ -260,16 +264,14 @@ std::unique_ptr<deinit_t> init() {
 #endif
 #ifdef SUNSHINE_BUILD_CUDA
   if(verify_nvfbc()) {
-    BOOST_LOG(info) << "Using nvFBC for screencasting"sv;
-    source = source_e::NVFBC;
-    goto found_source;
+    BOOST_LOG(info) << "Using NvFBC for screencasting"sv;
+    sources[source::NVFBC] = true;
   }
 #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   if(verify_wl()) {
     BOOST_LOG(info) << "Using Wayland for screencasting"sv;
-    source = source_e::WAYLAND;
-    goto found_source;
+    sources[source::WAYLAND] = true;
   }
 #endif
 #ifdef SUNSHINE_BUILD_DRM
@@ -281,23 +283,20 @@ std::unique_ptr<deinit_t> init() {
     }
 
     BOOST_LOG(info) << "Using KMS for screencasting"sv;
-    source = source_e::KMS;
-    goto found_source;
+    sources[source::KMS] = true;
   }
 #endif
 #ifdef SUNSHINE_BUILD_X11
   if(verify_x11()) {
     BOOST_LOG(info) << "Using X11 for screencasting"sv;
-    source = source_e::X11;
-    goto found_source;
+    sources[source::X11] = true;
   }
 #endif
-  // Did not find a source
-  return nullptr;
 
-// Normally, I would simply use if-else statements to achieve this result,
-// but due to the macro's, (*spits on ground*), it would be too messy
-found_source:
+  if(sources.none()) {
+    return nullptr;
+  }
+
   if(!gladLoaderLoadEGL(EGL_NO_DISPLAY) || !eglGetPlatformDisplay) {
     BOOST_LOG(warning) << "Couldn't load EGL library"sv;
   }
diff --git a/sunshine/platform/windows/display_base.cpp b/sunshine/platform/windows/display_base.cpp
index f816ad88..02b08c37 100644
--- a/sunshine/platform/windows/display_base.cpp
+++ b/sunshine/platform/windows/display_base.cpp
@@ -452,7 +452,7 @@ std::shared_ptr<display_t> display(mem_type_e hwdevice_type, const std::string &
   return nullptr;
 }
 
-std::vector<std::string> display_names() {
+std::vector<std::string> display_names(mem_type_e) {
   std::vector<std::string> display_names;
 
   HRESULT status;
diff --git a/sunshine/video.cpp b/sunshine/video.cpp
index 6faf82ee..f4a4c523 100644
--- a/sunshine/video.cpp
+++ b/sunshine/video.cpp
@@ -585,7 +585,7 @@ void captureThread(
 
   // Get all the monitor names now, rather than at boot, to
   // get the most up-to-date list available monitors
-  auto display_names = platf::display_names();
+  auto display_names = platf::display_names(map_dev_type(encoder.dev_type));
   int display_p      = 0;
 
   if(display_names.empty()) {
@@ -1105,17 +1105,30 @@ std::optional<sync_session_t> make_synced_session(platf::display_t *disp, const
     return std::nullopt;
   }
 
-  encode_session.session  = std::move(*session);
+  encode_session.session = std::move(*session);
 
   return std::move(encode_session);
 }
 
 encode_e encode_run_sync(
   std::vector<std::unique_ptr<sync_session_ctx_t>> &synced_session_ctxs,
-  encode_session_ctx_queue_t &encode_session_ctx_queue,
-  int &display_p, const std::vector<std::string> &display_names) {
+  encode_session_ctx_queue_t &encode_session_ctx_queue) {
 
   const auto &encoder = encoders.front();
+  auto display_names  = platf::display_names(map_dev_type(encoder.dev_type));
+  int display_p       = 0;
+
+  if(display_names.empty()) {
+    display_names.emplace_back(config::video.output_name);
+  }
+
+  for(int x = 0; x < display_names.size(); ++x) {
+    if(display_names[x] == config::video.output_name) {
+      display_p = x;
+
+      break;
+    }
+  }
 
   std::shared_ptr<platf::display_t> disp;
 
@@ -1269,22 +1282,7 @@ void captureThreadSync() {
     }
   });
 
-  auto display_names = platf::display_names();
-  int display_p      = 0;
-
-  if(display_names.empty()) {
-    display_names.emplace_back(config::video.output_name);
-  }
-
-  for(int x = 0; x < display_names.size(); ++x) {
-    if(display_names[x] == config::video.output_name) {
-      display_p = x;
-
-      break;
-    }
-  }
-
-  while(encode_run_sync(synced_session_ctxs, ctx, display_p, display_names) == encode_e::reinit) {}
+  while(encode_run_sync(synced_session_ctxs, ctx) == encode_e::reinit) {}
 }
 
 void capture_async(

From d7cb71f877f88dae6fdc2a87a12f5d6970ff99d2 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sat, 25 Sep 2021 15:35:44 +0200
Subject: [PATCH 16/27] Update README

---
 README.md | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 69bf2c47..8be5438c 100644
--- a/README.md
+++ b/README.md
@@ -17,27 +17,35 @@ Sunshine is a Gamestream host for Moonlight
 
 Ubuntu 20.04:
 Install the following:
-#### X11 Only
+
+#### Common
 ```
-sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev
+sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev
+```
+#### X11
+```
+sudo apt install libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev
 ```
 
-#### X11 + KMS (Requires additional setup)
-KMS allows Sunshine to grab the monitor with lower latency then through X11
-
+#### KMS
+This requires additional [setup](README.md#Setup).
 ```
-sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libdrm-dev libcap-dev
+sudo apt install libdrm-dev libcap-dev
+```
+
+#### Wayland
+This is for wlroots based compositores, such as Sway
+```
+sudo apt install libwayland-dev
+```
+
+#### Cuda + NvFBC
+This requires proprietary software
+```
+sudo apt install nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
 ### Compilation:
-
-#### X11 Only
-- `git clone https://github.com/loki-47-6F-64/sunshine.git --recurse-submodules`
-- `cd sunshine && mkdir build && cd build`
-- `cmake -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 -DSUNSHINE_ENABLE_DRM=OFF ..`
-- `make -j ${nproc}`
-
-#### X11 + KMS
 - `git clone https://github.com/loki-47-6F-64/sunshine.git --recurse-submodules`
 - `cd sunshine && mkdir build && cd build`
 - `cmake -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 ..`

From 50bd3094b4fe3ebb34c5102cd99d348ccb895a19 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sat, 25 Sep 2021 17:46:26 +0200
Subject: [PATCH 17/27] More accurate capture rate for NvFBC

---
 sunshine/platform/linux/cuda.cpp | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index b08b832a..ac6680b8 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -393,6 +393,8 @@ public:
       }
     }
 
+    delay = std::chrono::nanoseconds { 1s } / framerate;
+
     capture_params = NVFBC_CREATE_CAPTURE_SESSION_PARAMS { NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER };
 
     capture_params.eCaptureType                = NVFBC_CAPTURE_SHARED_CUDA;
@@ -426,6 +428,8 @@ public:
   }
 
   platf::capture_e capture(snapshot_cb_t &&snapshot_cb, std::shared_ptr<platf::img_t> img, bool *cursor) override {
+    auto next_frame = std::chrono::steady_clock::now();
+
     // Force display_t::capture to initialize handle_t::capture
     cursor_visible = !*cursor;
 
@@ -434,7 +438,17 @@ public:
     });
 
     while(img) {
-      auto status = snapshot(img.get(), 500ms, *cursor);
+      auto now = std::chrono::steady_clock::now();
+      if(next_frame > now) {
+        std::this_thread::sleep_for((next_frame - now) / 3 * 2);
+      }
+      while(next_frame > now) {
+        std::this_thread::sleep_for(1ns);
+        now = std::chrono::steady_clock::now();
+      }
+      next_frame = now + delay;
+
+      auto status = snapshot(img.get(), 150ms, *cursor);
       switch(status) {
       case platf::capture_e::reinit:
       case platf::capture_e::error:
@@ -552,10 +566,6 @@ public:
       return platf::capture_e::error;
     }
 
-    if(!info.bIsNewFrame) {
-      return platf::capture_e::timeout;
-    }
-
     if(((img_t *)img)->tex.copy((std::uint8_t *)device_ptr, img->height, img->row_pitch)) {
       return platf::capture_e::error;
     }
@@ -590,6 +600,8 @@ public:
     return 0;
   }
 
+  std::chrono::nanoseconds delay;
+
   bool cursor_visible;
   handle_t handle;
 

From fcb84132f43b074e3efbe3c29870d9e62d959319 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sat, 25 Sep 2021 19:18:43 +0200
Subject: [PATCH 18/27] Sharper images when not scaling the image

---
 sunshine/platform/linux/cuda.cpp | 23 ++++++++++++++++-------
 sunshine/platform/linux/cuda.cu  | 25 ++++++++++++++++++-------
 sunshine/platform/linux/cuda.h   |  6 +++++-
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index ac6680b8..b15104a5 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -112,6 +112,8 @@ public:
 
     sws = std::move(*sws_opt);
 
+    linear_interpolation = width != frame->width || height != frame->height;
+
     return 0;
   }
 
@@ -140,20 +142,27 @@ public:
       return;
     }
 
-    sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture, { frame->width, frame->height, 0, 0 });
+    sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 });
+  }
+
+  cudaTextureObject_t tex_obj(const tex_t &tex) const {
+    return linear_interpolation ? tex.texture.linear : tex.texture.point;
   }
 
   frame_t hwframe;
 
   int width, height;
 
+  // When heigth and width don't change, it's not necessary to use linear interpolation
+  bool linear_interpolation;
+
   sws_t sws;
 };
 
 class cuda_ram_t : public cuda_t {
 public:
   int convert(platf::img_t &img) override {
-    return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex.texture);
+    return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex));
   }
 
   int set_frame(AVFrame *frame) {
@@ -177,7 +186,7 @@ public:
 class cuda_vram_t : public cuda_t {
 public:
   int convert(platf::img_t &img) override {
-    return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], (cudaTextureObject_t)img.data);
+    return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex));
   }
 };
 
@@ -497,7 +506,7 @@ public:
 
       NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab {
         NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER,
-        NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY,
+        NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT,
         &device_ptr,
         &info,
         0,
@@ -551,7 +560,7 @@ public:
 
     NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab {
       NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER,
-      NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY,
+      NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT,
       &device_ptr,
       &info,
       (std::uint32_t)timeout.count(),
@@ -580,6 +589,7 @@ public:
   std::shared_ptr<platf::img_t> alloc_img() override {
     auto img = std::make_shared<cuda::img_t>();
 
+    img->data        = nullptr;
     img->width       = width;
     img->height      = height;
     img->pixel_pitch = 4;
@@ -590,8 +600,7 @@ public:
       return nullptr;
     }
 
-    img->tex  = std::move(*tex_opt);
-    img->data = (std::uint8_t *)img->tex.texture;
+    img->tex = std::move(*tex_opt);
 
     return img;
   };
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index 7bfb5eab..49f088f8 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -160,7 +160,7 @@ __global__ void RGBA_to_NV12(
   float3 rgb_l = bgra_to_rgb(tex2D<float4>(srcImage, x, y));
   float3 rgb_r = bgra_to_rgb(tex2D<float4>(srcImage, x + scale, y));
 
-  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix) * 255.0f;
+  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f, color_matrix) * 256.0f;
 
   dstUV[0] = uv.x;
   dstUV[1] = uv.y;
@@ -187,12 +187,16 @@ std::optional<tex_t> tex_t::make(int height, int pitch) {
   cudaTextureDesc desc {};
 
   desc.readMode         = cudaReadModeNormalizedFloat;
-  desc.filterMode       = cudaFilterModeLinear;
+  desc.filterMode       = cudaFilterModePoint;
   desc.normalizedCoords = false;
 
   std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
 
-  CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture, &res, &desc, nullptr), "Couldn't create cuda texture");
+  CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture.point, &res, &desc, nullptr), "Couldn't create cuda texture that uses point interpolation");
+
+  desc.filterMode = cudaFilterModeLinear;
+
+  CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture.linear, &res, &desc, nullptr), "Couldn't create cuda texture that uses linear interpolation");
 
   return std::move(tex);
 }
@@ -200,7 +204,8 @@ std::optional<tex_t> tex_t::make(int height, int pitch) {
 tex_t::tex_t() : array {}, texture { INVALID_TEXTURE } {}
 tex_t::tex_t(tex_t &&other) : array { other.array }, texture { other.texture } {
   other.array   = 0;
-  other.texture = INVALID_TEXTURE;
+  other.texture.point = INVALID_TEXTURE;
+  other.texture.linear = INVALID_TEXTURE;
 }
 
 tex_t &tex_t::operator=(tex_t &&other) {
@@ -211,10 +216,16 @@ tex_t &tex_t::operator=(tex_t &&other) {
 }
 
 tex_t::~tex_t() {
-  if(texture != INVALID_TEXTURE) {
-    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
+  if(texture.point != INVALID_TEXTURE) {
+    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture.point), "Couldn't deallocate cuda texture that uses point interpolation");
 
-    texture = INVALID_TEXTURE;
+    texture.point = INVALID_TEXTURE;
+  }
+
+  if(texture.linear != INVALID_TEXTURE) {
+    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture.linear), "Couldn't deallocate cuda texture that uses linear interpolation");
+
+    texture.linear = INVALID_TEXTURE;
   }
 
   if(array) {
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index d55ab8d0..5811379f 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -54,7 +54,11 @@ public:
   int copy(std::uint8_t *src, int height, int pitch);
 
   cudaArray_t array;
-  cudaTextureObject_t texture;
+
+  struct texture {
+    cudaTextureObject_t point;
+    cudaTextureObject_t linear;
+  } texture;
 };
 
 class sws_t {

From deecd19af256edc51046c39ea4a36b398f23ca07 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sat, 25 Sep 2021 21:50:47 +0200
Subject: [PATCH 19/27] Update appveyor.yml

---
 CMakeLists.txt | 3 ++-
 appveyor.yml   | 2 +-
 gen-deb.in     | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2eefcdac..ba90985b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,6 @@ if(WIN32)
 endif()
 add_subdirectory(third-party/moonlight-common-c/enet)
 add_subdirectory(third-party/Simple-Web-Server)
-add_subdirectory(third-party/cbs)
 
 set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries")
 set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc")
@@ -311,6 +310,8 @@ include_directories(
   ${PLATFORM_INCLUDE_DIRS}
 )
 
+add_subdirectory(third-party/cbs)
+
 string(TOUPPER "x${CMAKE_BUILD_TYPE}" BUILD_TYPE)
 if("${BUILD_TYPE}" STREQUAL "XDEBUG")
 	list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -ggdb3)
diff --git a/appveyor.yml b/appveyor.yml
index ce60cc59..75ad5c77 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -9,7 +9,7 @@ environment:
 
 install:
   - sh: sudo apt update --ignore-missing
-  - sh: sudo apt install -y build-essential fakeroot gcc-10 g++-10 cmake libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libdrm-dev libcap-dev
+  - sh: sudo apt install -y build-essential fakeroot gcc-10 g++-10 cmake libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev libdrm-dev libcap-dev nvidia-cuda-dev nvidia-cuda-toolkit
   - cmd: C:\msys64\usr\bin\bash -lc "pacman --needed --noconfirm -S mingw-w64-x86_64-openssl mingw-w64-x86_64-cmake mingw-w64-x86_64-toolchain mingw-w64-x86_64-opus mingw-w64-x86_64-x265 mingw-w64-x86_64-boost git yasm nasm diffutils make"
 
 before_build:
diff --git a/gen-deb.in b/gen-deb.in
index 9ddbf2a6..70da08d2 100755
--- a/gen-deb.in
+++ b/gen-deb.in
@@ -37,7 +37,7 @@ Package: sunshine
 Architecture: amd64
 Maintainer: @loki
 Priority: optional
-Version: 0.10.2
+Version: 0.11.0
 Depends: libssl1.1, libavdevice58, libboost-thread1.67.0 | libboost-thread1.71.0, libboost-filesystem1.67.0 | libboost-filesystem1.71.0, libboost-log1.67.0 | libboost-log1.71.0, libpulse0, libopus0, libxcb-shm0, libxcb-xfixes0, libxtst6, libevdev2, libdrm2, libcap2
 Description: Gamestream host for Moonlight
 EOF

From c5a356f3e7c91ee70d98678ec11012571c6f3409 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sat, 25 Sep 2021 22:31:39 +0200
Subject: [PATCH 20/27] Fix compilation on ubuntu 20.04

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2eefcdac..53476445 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ find_package(OpenSSL REQUIRED)
 set(Boost_USE_STATIC_LIBS ON)
 find_package(Boost COMPONENTS log filesystem REQUIRED)
 
-list(APPEND SUNSHINE_COMPILE_OPTIONS -fPIC -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare)
+list(APPEND SUNSHINE_COMPILE_OPTIONS -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare)
 
 if(WIN32)
 	file(

From 57c79458474c353d37c76fa2ae60bbb9e5aa4f4d Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sun, 26 Sep 2021 00:18:49 +0200
Subject: [PATCH 21/27] Fix typo in README

---
 README.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 48d0ebaf..e9532d00 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,6 @@ This is for wlroots based compositores, such as Sway
 ```
 sudo apt install libwayland-dev
 ```
-#### Warning:
-You might require ffmpeg version >= 4.3. Check the troubleshooting section for more information.
-
-### Compilation:
 
 #### Cuda + NvFBC
 This requires proprietary software
@@ -49,6 +45,9 @@ This requires proprietary software
 sudo apt install nvidia-cuda-dev nvidia-cuda-toolkit
 ```
 
+#### Warning:
+You might require ffmpeg version >= 4.3. Check the troubleshooting section for more information.
+
 ### Compilation:
 - `git clone https://github.com/loki-47-6F-64/sunshine.git --recurse-submodules`
 - `cd sunshine && mkdir build && cd build`

From e2fb02323c73c0de469ff6704e371c045a584362 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sun, 26 Sep 2021 10:57:43 +0200
Subject: [PATCH 22/27] Attempt to fix ubuntu 20.04 build

---
 CMakeLists.txt | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c87c7844..62d8b435 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -350,12 +350,17 @@ list(APPEND SUNSHINE_EXTERNAL_LIBRARIES
 		${OPENSSL_LIBRARIES}
 		${PLATFORM_LIBRARIES})
 
+add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${SUNSHINE_COMPILE_OPTIONS}>")
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${SUNSHINE_COMPILE_OPTIONS}>")
+
+foreach(flag IN LISTS SUNSHINE_COMPILE_OPTIONS)
+  add_compile_options($<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${flag}>)
+endforeach()
+
 list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_ASSETS_DIR="${SUNSHINE_ASSETS_DIR}")
 list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_CONFIG_DIR="${SUNSHINE_CONFIG_DIR}")
 list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_DEFAULT_DIR="${SUNSHINE_DEFAULT_DIR}")
 add_executable(sunshine ${SUNSHINE_TARGET_FILES})
 target_link_libraries(sunshine ${SUNSHINE_EXTERNAL_LIBRARIES})
 target_compile_definitions(sunshine PUBLIC ${SUNSHINE_DEFINITIONS})
-set_target_properties(sunshine PROPERTIES CXX_STANDARD 17)
-
-target_compile_options(sunshine PRIVATE ${SUNSHINE_COMPILE_OPTIONS})
+set_target_properties(sunshine PROPERTIES CXX_STANDARD 17)
\ No newline at end of file

From 60e3538adcbb2a15501f61f078190935e727952c Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sun, 26 Sep 2021 11:39:36 +0200
Subject: [PATCH 23/27] Attempt to fix ubuntu 20.04 build, again

---
 CMakeLists.txt                   |  5 ++++-
 sunshine/platform/linux/cuda.cpp |  9 ++++----
 sunshine/platform/linux/cuda.cu  | 37 ++++++++++++--------------------
 sunshine/platform/linux/cuda.h   |  6 +++---
 4 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62d8b435..c96b1bbd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -351,11 +351,14 @@ list(APPEND SUNSHINE_EXTERNAL_LIBRARIES
 		${PLATFORM_LIBRARIES})
 
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${SUNSHINE_COMPILE_OPTIONS}>")
+add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-std=c++17>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${SUNSHINE_COMPILE_OPTIONS}>")
 
 foreach(flag IN LISTS SUNSHINE_COMPILE_OPTIONS)
   add_compile_options($<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${flag}>)
 endforeach()
+add_compile_options($<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-std=c++14>)
+add_compile_options($<$<COMPILE_LANGUAGE:CUDA>:-std=c++14>)
 
 list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_ASSETS_DIR="${SUNSHINE_ASSETS_DIR}")
 list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_CONFIG_DIR="${SUNSHINE_CONFIG_DIR}")
@@ -363,4 +366,4 @@ list(APPEND SUNSHINE_DEFINITIONS SUNSHINE_DEFAULT_DIR="${SUNSHINE_DEFAULT_DIR}")
 add_executable(sunshine ${SUNSHINE_TARGET_FILES})
 target_link_libraries(sunshine ${SUNSHINE_EXTERNAL_LIBRARIES})
 target_compile_definitions(sunshine PUBLIC ${SUNSHINE_DEFINITIONS})
-set_target_properties(sunshine PROPERTIES CXX_STANDARD 17)
\ No newline at end of file
+# set_target_properties(sunshine PROPERTIES CXX_STANDARD 17)
\ No newline at end of file
diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index b15104a5..8a25eb27 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -1,4 +1,5 @@
 #include <bitset>
+#include <optional>
 
 #include <NvFBC.h>
 #include <ffnvcodec/dynlink_loader.h>
@@ -29,7 +30,7 @@ namespace cuda {
 constexpr auto cudaDevAttrMaxThreadsPerBlock          = (CUdevice_attribute)1;
 constexpr auto cudaDevAttrMaxThreadsPerMultiProcessor = (CUdevice_attribute)39;
 
-void pass_error(const std::string_view &sv, const char *name, const char *description) {
+void pass_error(const std::string &sv, const char *name, const char *description) {
   BOOST_LOG(error) << sv << name << ':' << description;
 }
 
@@ -276,7 +277,7 @@ public:
     return *this;
   }
 
-  static std::optional<handle_t> make() {
+  static std::unique_ptr<handle_t> make() {
     NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER };
 
     handle_t handle;
@@ -284,12 +285,12 @@ public:
     if(status) {
       BOOST_LOG(error) << "Failed to create session: "sv << handle.last_error();
 
-      return std::nullopt;
+      return nullptr;
     }
 
     handle.handle_flags[SESSION_HANDLE] = true;
 
-    return std::move(handle);
+    return std::make_unique<handle_t>(std::move(handle));
   }
 
   const char *last_error() {
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index 49f088f8..e93f7d9f 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -1,15 +1,11 @@
-// #include <algorithm>
+#include <algorithm>
 #include <helper_math.h>
 #include <limits>
 #include <memory>
-#include <optional>
-#include <string_view>
 
 #include "cuda.h"
 
-using namespace std::literals;
-
-#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
+#define SUNSHINE_STRINGVIEW_HELPER(x) x
 #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
 
 #define CU_CHECK(x, y) \
@@ -21,14 +17,9 @@ using namespace std::literals;
 #define CU_CHECK_PTR(x, y) \
   if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr;
 
-#define CU_CHECK_OPT(x, y) \
-  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return std::nullopt;
-
 #define CU_CHECK_IGNORE(x, y) \
   check((x), SUNSHINE_STRINGVIEW(y ": "))
 
-using namespace std::literals;
-
 //////////////////// Special desclarations
 /**
  * NVCC segfaults when including <chrono>
@@ -83,8 +74,8 @@ inline T div_align(T l, T r) {
   return (l + r - 1) / r;
 }
 
-void pass_error(const std::string_view &sv, const char *name, const char *description);
-inline static int check(cudaError_t result, const std::string_view &sv) {
+void pass_error(const std::string &sv, const char *name, const char *description);
+inline static int check(cudaError_t result, const std::string &sv) {
   if(result) {
     auto name        = cudaGetErrorName(result);
     auto description = cudaGetErrorString(result);
@@ -174,11 +165,11 @@ int tex_t::copy(std::uint8_t *src, int height, int pitch) {
   return 0;
 }
 
-std::optional<tex_t> tex_t::make(int height, int pitch) {
+std::unique_ptr<tex_t> tex_t::make(int height, int pitch) {
   tex_t tex;
 
   auto format = cudaCreateChannelDesc<uchar4>();
-  CU_CHECK_OPT(cudaMallocArray(&tex.array, &format, pitch, height, cudaArrayDefault), "Couldn't allocate cuda array");
+  CU_CHECK_PTR(cudaMallocArray(&tex.array, &format, pitch, height, cudaArrayDefault), "Couldn't allocate cuda array");
 
   cudaResourceDesc res {};
   res.resType         = cudaResourceTypeArray;
@@ -192,13 +183,13 @@ std::optional<tex_t> tex_t::make(int height, int pitch) {
 
   std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
 
-  CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture.point, &res, &desc, nullptr), "Couldn't create cuda texture that uses point interpolation");
+  CU_CHECK_PTR(cudaCreateTextureObject(&tex.texture.point, &res, &desc, nullptr), "Couldn't create cuda texture that uses point interpolation");
 
   desc.filterMode = cudaFilterModeLinear;
 
-  CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture.linear, &res, &desc, nullptr), "Couldn't create cuda texture that uses linear interpolation");
+  CU_CHECK_PTR(cudaCreateTextureObject(&tex.texture.linear, &res, &desc, nullptr), "Couldn't create cuda texture that uses linear interpolation");
 
-  return std::move(tex);
+  return std::make_unique<tex_t>(std::move(tex));
 }
 
 tex_t::tex_t() : array {}, texture { INVALID_TEXTURE } {}
@@ -255,18 +246,18 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit
   scale = 1.0f / scalar;
 }
 
-std::optional<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) {
+std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) {
   cudaDeviceProp props;
   int device;
-  CU_CHECK_OPT(cudaGetDevice(&device), "Couldn't get cuda device");
-  CU_CHECK_OPT(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
+  CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
+  CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
 
   auto ptr = make_ptr<video::color_t>();
   if(!ptr) {
-    return std::nullopt;
+    return nullptr;
   }
 
-  return std::make_optional<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
+  return std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
 }
 
 int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index 5811379f..e46b4759 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -3,7 +3,7 @@
 
 #include <vector>
 #include <memory>
-#include <optional>
+#include <string>
 
 namespace platf {
   class hwdevice_t;
@@ -42,7 +42,7 @@ struct viewport_t {
 
 class tex_t {
 public:
-  static std::optional<tex_t> make(int height, int pitch);
+  static std::unique_ptr<tex_t> make(int height, int pitch);
 
   tex_t();
   tex_t(tex_t &&);
@@ -72,7 +72,7 @@ public:
    * 
    * pitch -- The size of a single row of pixels in bytes
    */
-  static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
+  static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
 
   // Converts loaded image into a CUDevicePtr
   int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture);

From 847d7b6980d19ef1197bdd4106b4ae9268e1abdc Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sun, 26 Sep 2021 23:45:44 +0200
Subject: [PATCH 24/27] Fix minor error in README

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e9532d00..b3160832 100644
--- a/README.md
+++ b/README.md
@@ -20,11 +20,11 @@ Install the following:
 
 #### Common
 ```
-sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev
+sudo apt install cmake gcc-10 g++-10 libssl-dev libavdevice-dev libboost-thread-dev libboost-filesystem-dev libboost-log-dev libpulse-dev libopus-dev libevdev-dev
 ```
 #### X11
 ```
-sudo apt install libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libevdev-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev
+sudo apt install libxtst-dev libx11-dev libxrandr-dev libxfixes-dev libxcb1-dev libxcb-shm0-dev libxcb-xfixes0-dev
 ```
 
 #### KMS

From 4177b020647131be59d85d8b19c8e3143e820b36 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Mon, 27 Sep 2021 17:58:35 +0200
Subject: [PATCH 25/27] Allow cuda kernel to run in parallell

---
 sunshine/platform/linux/cuda.cpp | 70 +++++++++++++++++++++++++-------
 sunshine/platform/linux/cuda.cu  | 25 ++++++++++--
 sunshine/platform/linux/cuda.h   | 25 ++++++++----
 sunshine/platform/linux/misc.cpp |  4 --
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index 8a25eb27..b96dcf17 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -57,6 +57,10 @@ inline static int check(CUresult result, const std::string_view &sv) {
   return 0;
 }
 
+void freeStream(CUstream stream) {
+  CU_CHECK_IGNORE(cdf->cuStreamDestroy(stream), "Couldn't destroy cuda stream");
+}
+
 class img_t : public platf::img_t {
 public:
   tex_t tex;
@@ -95,7 +99,8 @@ public:
     this->hwframe.reset(frame);
     this->frame = frame;
 
-    if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) {
+    auto hwframe_ctx = (AVHWFramesContext *)frame->hw_frames_ctx->data;
+    if(hwframe_ctx->sw_format != AV_PIX_FMT_NV12) {
       BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
       return -1;
     }
@@ -106,6 +111,15 @@ public:
       return -1;
     }
 
+    auto cuda_ctx = (AVCUDADeviceContext *)hwframe_ctx->device_ctx->hwctx;
+
+    stream = make_stream();
+    if(!stream) {
+      return -1;
+    }
+
+    cuda_ctx->stream = stream.get();
+
     auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4);
     if(!sws_opt) {
       return -1;
@@ -143,13 +157,14 @@ public:
       return;
     }
 
-    sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, { frame->width, frame->height, 0, 0 });
+    sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), { frame->width, frame->height, 0, 0 });
   }
 
   cudaTextureObject_t tex_obj(const tex_t &tex) const {
     return linear_interpolation ? tex.texture.linear : tex.texture.point;
   }
 
+  stream_t stream;
   frame_t hwframe;
 
   int width, height;
@@ -163,7 +178,7 @@ public:
 class cuda_ram_t : public cuda_t {
 public:
   int convert(platf::img_t &img) override {
-    return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex));
+    return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get());
   }
 
   int set_frame(AVFrame *frame) {
@@ -187,7 +202,7 @@ public:
 class cuda_vram_t : public cuda_t {
 public:
   int convert(platf::img_t &img) override {
-    return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex));
+    return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *)&img)->tex), stream.get());
   }
 };
 
@@ -257,6 +272,28 @@ int init() {
   return 0;
 }
 
+class ctx_t {
+public:
+  ctx_t(NVFBC_SESSION_HANDLE handle) {
+    NVFBC_BIND_CONTEXT_PARAMS params { NVFBC_BIND_CONTEXT_PARAMS_VER };
+
+    if(func.nvFBCBindContext(handle, &params)) {
+      BOOST_LOG(error) << "Couldn't bind NvFBC context to current thread: " << func.nvFBCGetLastErrorStr(handle);
+    }
+
+    this->handle = handle;
+  }
+
+  ~ctx_t() {
+    NVFBC_RELEASE_CONTEXT_PARAMS params { NVFBC_RELEASE_CONTEXT_PARAMS_VER };
+    if(func.nvFBCReleaseContext(handle, &params)) {
+      BOOST_LOG(error) << "Couldn't release NvFBC context from current thread: " << func.nvFBCGetLastErrorStr(handle);
+    }
+  }
+
+  NVFBC_SESSION_HANDLE handle;
+};
+
 class handle_t {
   enum flag_e {
     SESSION_HANDLE,
@@ -348,24 +385,26 @@ public:
     return 0;
   }
 
-  ~handle_t() {
+  int reset() {
     if(!handle_flags[SESSION_HANDLE]) {
-      return;
+      return 0;
     }
 
-    if(handle_flags[SESSION_CAPTURE]) {
-      NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER };
-
-      if(func.nvFBCDestroyCaptureSession(handle, &params)) {
-        BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle);
-      }
-    }
+    stop();
 
     NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
 
     if(func.nvFBCDestroyHandle(handle, &params)) {
       BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle);
     }
+
+    handle_flags[SESSION_HANDLE] = false;
+
+    return 0;
+  }
+
+  ~handle_t() {
+    reset();
   }
 
   std::bitset<MAX_FLAGS> handle_flags;
@@ -381,6 +420,8 @@ public:
       return -1;
     }
 
+    ctx_t ctx { handle->handle };
+
     auto status_params = handle->status();
     if(!status_params) {
       return -1;
@@ -443,8 +484,9 @@ public:
     // Force display_t::capture to initialize handle_t::capture
     cursor_visible = !*cursor;
 
+    ctx_t ctx { handle.handle };
     auto fg = util::fail_guard([&]() {
-      handle.stop();
+      handle.reset();
     });
 
     while(img) {
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index e93f7d9f..acf2d76d 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -101,6 +101,23 @@ void freeCudaPtr_t::operator()(void *ptr) {
   CU_CHECK_IGNORE(cudaFree(ptr), "Couldn't free cuda device pointer");
 }
 
+void freeCudaStream_t::operator()(cudaStream_t ptr) {
+  CU_CHECK_IGNORE(cudaStreamDestroy(ptr), "Couldn't free cuda stream");
+}
+
+stream_t make_stream(int flags) {
+  cudaStream_t stream;
+
+  if(!flags) {
+    CU_CHECK_PTR(cudaStreamCreate(&stream), "Couldn't create cuda stream");
+  }
+  else {
+    CU_CHECK_PTR(cudaStreamCreateWithFlags(&stream, flags), "Couldn't create cuda stream with flags");
+  }
+
+  return stream_t { stream };
+}
+
 inline __device__ float3 bgra_to_rgb(uchar4 vec) {
   return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
 }
@@ -260,18 +277,18 @@ std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, i
   return std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor, std::move(ptr));
 }
 
-int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
-  return convert(Y, UV, pitchY, pitchUV, texture, viewport);
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream) {
+  return convert(Y, UV, pitchY, pitchUV, texture, stream, viewport);
 }
 
-int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) {
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport) {
   int threadsX = viewport.width / 2;
   int threadsY = viewport.height;
 
   dim3 block(threadsPerBlock);
   dim3 grid(div_align(threadsX, threadsPerBlock), threadsY);
 
-  RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
+  RGBA_to_NV12<<<grid, block, 0, stream>>>(texture, Y, UV, pitchY, pitchUV, scale, viewport, (video::color_t *)color_matrix.get());
 
   return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
 }
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index e46b4759..7e81ae99 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,16 +1,17 @@
 #if !defined(SUNSHINE_PLATFORM_CUDA_H) && defined(SUNSHINE_BUILD_CUDA)
 #define SUNSHINE_PLATFORM_CUDA_H
 
-#include <vector>
 #include <memory>
 #include <string>
+#include <vector>
 
 namespace platf {
-  class hwdevice_t;
-  class img_t;
-}
+class hwdevice_t;
+class img_t;
+} // namespace platf
 
 namespace cuda {
+
 namespace nvfbc {
 std::vector<std::string> display_names();
 }
@@ -21,8 +22,10 @@ int init();
 typedef struct cudaArray *cudaArray_t;
 
 #if !defined(__CUDACC__)
+typedef struct CUstream_st *cudaStream_t;
 typedef unsigned long long cudaTextureObject_t;
 #else  /* defined(__CUDACC__) */
+typedef __location__(device_builtin) struct CUstream_st *cudaStream_t;
 typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
 #endif /* !defined(__CUDACC__) */
 
@@ -33,7 +36,15 @@ public:
   void operator()(void *ptr);
 };
 
-using ptr_t = std::unique_ptr<void, freeCudaPtr_t>;
+class freeCudaStream_t {
+public:
+  void operator()(cudaStream_t ptr);
+};
+
+using ptr_t    = std::unique_ptr<void, freeCudaPtr_t>;
+using stream_t = std::unique_ptr<CUstream_st, freeCudaStream_t>;
+
+stream_t make_stream(int flags = 0);
 
 struct viewport_t {
   int width, height;
@@ -75,8 +86,8 @@ public:
   static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
 
   // Converts loaded image into a CUDevicePtr
-  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture);
-  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, stream_t::pointer stream, const viewport_t &viewport);
 
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
 
diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp
index dd114ec4..6d7643c0 100644
--- a/sunshine/platform/linux/misc.cpp
+++ b/sunshine/platform/linux/misc.cpp
@@ -264,13 +264,11 @@ std::unique_ptr<deinit_t> init() {
 #endif
 #ifdef SUNSHINE_BUILD_CUDA
   if(verify_nvfbc()) {
-    BOOST_LOG(info) << "Using NvFBC for screencasting"sv;
     sources[source::NVFBC] = true;
   }
 #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   if(verify_wl()) {
-    BOOST_LOG(info) << "Using Wayland for screencasting"sv;
     sources[source::WAYLAND] = true;
   }
 #endif
@@ -282,13 +280,11 @@ std::unique_ptr<deinit_t> init() {
       display_cursor = false;
     }
 
-    BOOST_LOG(info) << "Using KMS for screencasting"sv;
     sources[source::KMS] = true;
   }
 #endif
 #ifdef SUNSHINE_BUILD_X11
   if(verify_x11()) {
-    BOOST_LOG(info) << "Using X11 for screencasting"sv;
     sources[source::X11] = true;
   }
 #endif

From 1f7bdb1b2a19444b5f9394f237a191cb6cd4dc6f Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Mon, 27 Sep 2021 19:35:06 +0200
Subject: [PATCH 26/27] Fix segfault when multiple controllers connected

---
 sunshine/platform/linux/input.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sunshine/platform/linux/input.cpp b/sunshine/platform/linux/input.cpp
index a0d18578..fbdc8133 100644
--- a/sunshine/platform/linux/input.cpp
+++ b/sunshine/platform/linux/input.cpp
@@ -696,7 +696,13 @@ public:
 };
 
 inline void rumbleIterate(std::vector<effect_t> &effects, std::vector<pollfd_t> &polls, std::chrono::milliseconds to) {
-  auto res = poll(&polls.data()->el, polls.size(), to.count());
+  std::vector<pollfd> polls_tmp;
+  polls_tmp.reserve(polls.size());
+  for(auto &poll : polls) {
+    polls_tmp.emplace_back(poll.el);
+  }
+
+  auto res = poll(polls_tmp.data(), polls.size(), to.count());
 
   // If timed out
   if(!res) {
@@ -871,7 +877,7 @@ void broadcastRumble(safe::queue_t<mail_evdev_t> &rumble_queue_queue) {
     }
 
     if(polls.empty()) {
-      std::this_thread::sleep_for(50ms);
+      std::this_thread::sleep_for(250ms);
     }
     else {
       rumbleIterate(effects, polls, 100ms);

From e7cbfb3ee92ae65064936f4938702e2c02b5beb0 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Mon, 27 Sep 2021 19:54:32 +0200
Subject: [PATCH 27/27] Fix dependencies for debian bullseye

---
 gen-deb.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gen-deb.in b/gen-deb.in
index 70da08d2..3cb64ea0 100755
--- a/gen-deb.in
+++ b/gen-deb.in
@@ -38,7 +38,7 @@ Architecture: amd64
 Maintainer: @loki
 Priority: optional
 Version: 0.11.0
-Depends: libssl1.1, libavdevice58, libboost-thread1.67.0 | libboost-thread1.71.0, libboost-filesystem1.67.0 | libboost-filesystem1.71.0, libboost-log1.67.0 | libboost-log1.71.0, libpulse0, libopus0, libxcb-shm0, libxcb-xfixes0, libxtst6, libevdev2, libdrm2, libcap2
+Depends: libssl1.1, libavdevice58, libboost-thread1.67.0 | libboost-thread1.71.0 | libboost-thread1.74.0, libboost-filesystem1.67.0 | libboost-filesystem1.71.0 | libboost-filesystem1.74.0, libboost-log1.67.0 | libboost-log1.71.0 | libboost-log1.74.0, libpulse0, libopus0, libxcb-shm0, libxcb-xfixes0, libxtst6, libevdev2, libdrm2, libcap2
 Description: Gamestream host for Moonlight
 EOF