1/* 2 * This file is part of FFmpeg. 3 * 4 * FFmpeg is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU Lesser General Public 6 * License as published by the Free Software Foundation; either 7 * version 2.1 of the License, or (at your option) any later version. 8 * 9 * FFmpeg is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * Lesser General Public License for more details. 13 * 14 * You should have received a copy of the GNU Lesser General Public 15 * License along with FFmpeg; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 19#include "buffer.h" 20#include "common.h" 21#include "hwcontext.h" 22#include "hwcontext_internal.h" 23#include "hwcontext_cuda_internal.h" 24#if CONFIG_VULKAN 25#include "hwcontext_vulkan.h" 26#endif 27#include "cuda_check.h" 28#include "mem.h" 29#include "pixdesc.h" 30#include "pixfmt.h" 31#include "imgutils.h" 32 33typedef struct CUDAFramesContext { 34 int shift_width, shift_height; 35 int tex_alignment; 36} CUDAFramesContext; 37 38static const enum AVPixelFormat supported_formats[] = { 39 AV_PIX_FMT_NV12, 40 AV_PIX_FMT_YUV420P, 41 AV_PIX_FMT_YUVA420P, 42 AV_PIX_FMT_YUV444P, 43 AV_PIX_FMT_P010, 44 AV_PIX_FMT_P016, 45 AV_PIX_FMT_YUV444P16, 46 AV_PIX_FMT_0RGB32, 47 AV_PIX_FMT_0BGR32, 48#if CONFIG_VULKAN 49 AV_PIX_FMT_VULKAN, 50#endif 51}; 52 53#define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x) 54 55static int cuda_frames_get_constraints(AVHWDeviceContext *ctx, 56 const void *hwconfig, 57 AVHWFramesConstraints *constraints) 58{ 59 int i; 60 61 constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1, 62 sizeof(*constraints->valid_sw_formats)); 63 if (!constraints->valid_sw_formats) 64 return AVERROR(ENOMEM); 65 66 for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) 67 constraints->valid_sw_formats[i] = supported_formats[i]; 68 constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE; 69 70 constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats)); 71 if (!constraints->valid_hw_formats) 72 return AVERROR(ENOMEM); 73 74 constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA; 75 constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE; 76 77 return 0; 78} 79 80static void cuda_buffer_free(void *opaque, uint8_t *data) 81{ 82 AVHWFramesContext *ctx = opaque; 83 AVHWDeviceContext *device_ctx = ctx->device_ctx; 84 AVCUDADeviceContext *hwctx = device_ctx->hwctx; 85 CudaFunctions *cu = hwctx->internal->cuda_dl; 86 87 CUcontext dummy; 88 89 CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); 90 91 CHECK_CU(cu->cuMemFree((CUdeviceptr)data)); 92 93 CHECK_CU(cu->cuCtxPopCurrent(&dummy)); 94} 95 96static AVBufferRef *cuda_pool_alloc(void *opaque, size_t size) 97{ 98 AVHWFramesContext *ctx = opaque; 99 AVHWDeviceContext *device_ctx = ctx->device_ctx; 100 AVCUDADeviceContext *hwctx = device_ctx->hwctx; 101 CudaFunctions *cu = hwctx->internal->cuda_dl; 102 103 AVBufferRef *ret = NULL; 104 CUcontext dummy = NULL; 105 CUdeviceptr data; 106 int err; 107 108 err = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); 109 if (err < 0) 110 return NULL; 111 112 err = CHECK_CU(cu->cuMemAlloc(&data, size)); 113 if (err < 0) 114 goto fail; 115 116 ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0); 117 if (!ret) { 118 CHECK_CU(cu->cuMemFree(data)); 119 goto fail; 120 } 121 122fail: 123 CHECK_CU(cu->cuCtxPopCurrent(&dummy)); 124 return ret; 125} 126 127static int cuda_frames_init(AVHWFramesContext *ctx) 128{ 129 AVHWDeviceContext *device_ctx = ctx->device_ctx; 130 AVCUDADeviceContext *hwctx = device_ctx->hwctx; 131 CUDAFramesContext *priv = ctx->internal->priv; 132 CudaFunctions *cu = hwctx->internal->cuda_dl; 133 int err, i; 134 135 for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) { 136 if (ctx->sw_format == supported_formats[i]) 137 break; 138 } 139 if (i == FF_ARRAY_ELEMS(supported_formats)) { 140 av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n", 141 av_get_pix_fmt_name(ctx->sw_format)); 142 return AVERROR(ENOSYS); 143 } 144 145 err = CHECK_CU(cu->cuDeviceGetAttribute(&priv->tex_alignment, 146 14 /* CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT */, 147 hwctx->internal->cuda_device)); 148 if (err < 0) 149 return err; 150 151 av_log(ctx, AV_LOG_DEBUG, "CUDA texture alignment: %d\n", priv->tex_alignment); 152 153 // YUV420P is a special case. 154 // Since nvenc expects the U/V planes to have half the linesize of the Y plane 155 // alignment has to be doubled to ensure the U/V planes still end up aligned. 156 if (ctx->sw_format == AV_PIX_FMT_YUV420P) 157 priv->tex_alignment *= 2; 158 159 av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height); 160 161 if (!ctx->pool) { 162 int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment); 163 if (size < 0) 164 return size; 165 166 ctx->internal->pool_internal = av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL); 167 if (!ctx->internal->pool_internal) 168 return AVERROR(ENOMEM); 169 } 170 171 return 0; 172} 173 174static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame) 175{ 176 CUDAFramesContext *priv = ctx->internal->priv; 177 int res; 178 179 frame->buf[0] = av_buffer_pool_get(ctx->pool); 180 if (!frame->buf[0]) 181 return AVERROR(ENOMEM); 182 183 res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data, 184 ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment); 185 if (res < 0) 186 return res; 187 188 // YUV420P is a special case. 189 // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned 190 if (ctx->sw_format == AV_PIX_FMT_YUV420P) { 191 frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2; 192 frame->data[2] = frame->data[1]; 193 frame->data[1] = frame->data[2] + frame->linesize[2] * (ctx->height / 2); 194 } 195 196 frame->format = AV_PIX_FMT_CUDA; 197 frame->width = ctx->width; 198 frame->height = ctx->height; 199 200 return 0; 201} 202 203static int cuda_transfer_get_formats(AVHWFramesContext *ctx, 204 enum AVHWFrameTransferDirection dir, 205 enum AVPixelFormat **formats) 206{ 207 enum AVPixelFormat *fmts; 208 209 fmts = av_malloc_array(2, sizeof(*fmts)); 210 if (!fmts) 211 return AVERROR(ENOMEM); 212 213 fmts[0] = ctx->sw_format; 214 fmts[1] = AV_PIX_FMT_NONE; 215 216 *formats = fmts; 217 218 return 0; 219} 220 221static int cuda_transfer_data(AVHWFramesContext *ctx, AVFrame *dst, 222 const AVFrame *src) 223{ 224 CUDAFramesContext *priv = ctx->internal->priv; 225 AVHWDeviceContext *device_ctx = ctx->device_ctx; 226 AVCUDADeviceContext *hwctx = device_ctx->hwctx; 227 CudaFunctions *cu = hwctx->internal->cuda_dl; 228 229 CUcontext dummy; 230 int i, ret; 231 232 if ((src->hw_frames_ctx && ((AVHWFramesContext*)src->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA) || 233 (dst->hw_frames_ctx && ((AVHWFramesContext*)dst->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA)) 234 return AVERROR(ENOSYS); 235 236 ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); 237 if (ret < 0) 238 return ret; 239 240 for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) { 241 CUDA_MEMCPY2D cpy = { 242 .srcPitch = src->linesize[i], 243 .dstPitch = dst->linesize[i], 244 .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]), 245 .Height = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height), 246 }; 247 248 if (src->hw_frames_ctx) { 249 cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE; 250 cpy.srcDevice = (CUdeviceptr)src->data[i]; 251 } else { 252 cpy.srcMemoryType = CU_MEMORYTYPE_HOST; 253 cpy.srcHost = src->data[i]; 254 } 255 256 if (dst->hw_frames_ctx) { 257 cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; 258 cpy.dstDevice = (CUdeviceptr)dst->data[i]; 259 } else { 260 cpy.dstMemoryType = CU_MEMORYTYPE_HOST; 261 cpy.dstHost = dst->data[i]; 262 } 263 264 ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream)); 265 if (ret < 0) 266 goto exit; 267 } 268 269 if (!dst->hw_frames_ctx) { 270 ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream)); 271 if (ret < 0) 272 goto exit; 273 } 274 275exit: 276 CHECK_CU(cu->cuCtxPopCurrent(&dummy)); 277 278 return 0; 279} 280 281static void cuda_device_uninit(AVHWDeviceContext *device_ctx) 282{ 283 AVCUDADeviceContext *hwctx = device_ctx->hwctx; 284 285 if (hwctx->internal) { 286 CudaFunctions *cu = hwctx->internal->cuda_dl; 287 288 if (hwctx->internal->is_allocated && hwctx->cuda_ctx) { 289 if (hwctx->internal->flags & AV_CUDA_USE_PRIMARY_CONTEXT) 290 CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal->cuda_device)); 291 else 292 CHECK_CU(cu->cuCtxDestroy(hwctx->cuda_ctx)); 293 294 hwctx->cuda_ctx = NULL; 295 } 296 297 cuda_free_functions(&hwctx->internal->cuda_dl); 298 } 299 300 av_freep(&hwctx->internal); 301} 302 303static int cuda_device_init(AVHWDeviceContext *ctx) 304{ 305 AVCUDADeviceContext *hwctx = ctx->hwctx; 306 int ret; 307 308 if (!hwctx->internal) { 309 hwctx->internal = av_mallocz(sizeof(*hwctx->internal)); 310 if (!hwctx->internal) 311 return AVERROR(ENOMEM); 312 } 313 314 if (!hwctx->internal->cuda_dl) { 315 ret = cuda_load_functions(&hwctx->internal->cuda_dl, ctx); 316 if (ret < 0) { 317 av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n"); 318 goto error; 319 } 320 } 321 322 return 0; 323 324error: 325 cuda_device_uninit(ctx); 326 return ret; 327} 328 329static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags) { 330 AVCUDADeviceContext *hwctx = device_ctx->hwctx; 331 CudaFunctions *cu; 332 CUcontext dummy; 333 int ret, dev_active = 0; 334 unsigned int dev_flags = 0; 335 336 const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC; 337 338 cu = hwctx->internal->cuda_dl; 339 340 hwctx->internal->flags = flags; 341 342 if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) { 343 ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device, 344 &dev_flags, &dev_active)); 345 if (ret < 0) 346 return ret; 347 348 if (dev_active && dev_flags != desired_flags) { 349 av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n"); 350 return AVERROR(ENOTSUP); 351 } else if (dev_flags != desired_flags) { 352 ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device, 353 desired_flags)); 354 if (ret < 0) 355 return ret; 356 } 357 358 ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx, 359 hwctx->internal->cuda_device)); 360 if (ret < 0) 361 return ret; 362 } else { 363 ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags, 364 hwctx->internal->cuda_device)); 365 if (ret < 0) 366 return ret; 367 368 CHECK_CU(cu->cuCtxPopCurrent(&dummy)); 369 } 370 371 hwctx->internal->is_allocated = 1; 372 373 // Setting stream to NULL will make functions automatically use the default CUstream 374 hwctx->stream = NULL; 375 376 return 0; 377} 378 379static int cuda_flags_from_opts(AVHWDeviceContext *device_ctx, 380 AVDictionary *opts, int *flags) 381{ 382 AVDictionaryEntry *primary_ctx_opt = av_dict_get(opts, "primary_ctx", NULL, 0); 383 384 if (primary_ctx_opt && strtol(primary_ctx_opt->value, NULL, 10)) { 385 av_log(device_ctx, AV_LOG_VERBOSE, "Using CUDA primary device context\n"); 386 *flags |= AV_CUDA_USE_PRIMARY_CONTEXT; 387 } else if (primary_ctx_opt) { 388 av_log(device_ctx, AV_LOG_VERBOSE, "Disabling use of CUDA primary device context\n"); 389 *flags &= ~AV_CUDA_USE_PRIMARY_CONTEXT; 390 } 391 392 return 0; 393} 394 395static int cuda_device_create(AVHWDeviceContext *device_ctx, 396 const char *device, 397 AVDictionary *opts, int flags) 398{ 399 AVCUDADeviceContext *hwctx = device_ctx->hwctx; 400 CudaFunctions *cu; 401 int ret, device_idx = 0; 402 403 ret = cuda_flags_from_opts(device_ctx, opts, &flags); 404 if (ret < 0) 405 goto error; 406 407 if (device) 408 device_idx = strtol(device, NULL, 0); 409 410 ret = cuda_device_init(device_ctx); 411 if (ret < 0) 412 goto error; 413 414 cu = hwctx->internal->cuda_dl; 415 416 ret = CHECK_CU(cu->cuInit(0)); 417 if (ret < 0) 418 goto error; 419 420 ret = CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx)); 421 if (ret < 0) 422 goto error; 423 424 ret = cuda_context_init(device_ctx, flags); 425 if (ret < 0) 426 goto error; 427 428 return 0; 429 430error: 431 cuda_device_uninit(device_ctx); 432 return ret; 433} 434 435static int cuda_device_derive(AVHWDeviceContext *device_ctx, 436 AVHWDeviceContext *src_ctx, AVDictionary *opts, 437 int flags) { 438 AVCUDADeviceContext *hwctx = device_ctx->hwctx; 439 CudaFunctions *cu; 440 const char *src_uuid = NULL; 441 int ret, i, device_count; 442 443 ret = cuda_flags_from_opts(device_ctx, opts, &flags); 444 if (ret < 0) 445 goto error; 446 447#if CONFIG_VULKAN 448 VkPhysicalDeviceIDProperties vk_idp = { 449 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES, 450 }; 451#endif 452 453 switch (src_ctx->type) { 454#if CONFIG_VULKAN 455#define TYPE PFN_vkGetPhysicalDeviceProperties2 456 case AV_HWDEVICE_TYPE_VULKAN: { 457 AVVulkanDeviceContext *vkctx = src_ctx->hwctx; 458 TYPE prop_fn = (TYPE)vkctx->get_proc_addr(vkctx->inst, "vkGetPhysicalDeviceProperties2"); 459 VkPhysicalDeviceProperties2 vk_dev_props = { 460 .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, 461 .pNext = &vk_idp, 462 }; 463 prop_fn(vkctx->phys_dev, &vk_dev_props); 464 src_uuid = vk_idp.deviceUUID; 465 break; 466 } 467#undef TYPE 468#endif 469 default: 470 ret = AVERROR(ENOSYS); 471 goto error; 472 } 473 474 if (!src_uuid) { 475 av_log(device_ctx, AV_LOG_ERROR, 476 "Failed to get UUID of source device.\n"); 477 ret = AVERROR(EINVAL); 478 goto error; 479 } 480 481 ret = cuda_device_init(device_ctx); 482 if (ret < 0) 483 goto error; 484 485 cu = hwctx->internal->cuda_dl; 486 487 ret = CHECK_CU(cu->cuInit(0)); 488 if (ret < 0) 489 goto error; 490 491 ret = CHECK_CU(cu->cuDeviceGetCount(&device_count)); 492 if (ret < 0) 493 goto error; 494 495 hwctx->internal->cuda_device = -1; 496 for (i = 0; i < device_count; i++) { 497 CUdevice dev; 498 CUuuid uuid; 499 500 ret = CHECK_CU(cu->cuDeviceGet(&dev, i)); 501 if (ret < 0) 502 goto error; 503 504 ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev)); 505 if (ret < 0) 506 goto error; 507 508 if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) { 509 hwctx->internal->cuda_device = dev; 510 break; 511 } 512 } 513 514 if (hwctx->internal->cuda_device == -1) { 515 av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n"); 516 goto error; 517 } 518 519 ret = cuda_context_init(device_ctx, flags); 520 if (ret < 0) 521 goto error; 522 523 return 0; 524 525error: 526 cuda_device_uninit(device_ctx); 527 return ret; 528} 529 530const HWContextType ff_hwcontext_type_cuda = { 531 .type = AV_HWDEVICE_TYPE_CUDA, 532 .name = "CUDA", 533 534 .device_hwctx_size = sizeof(AVCUDADeviceContext), 535 .frames_priv_size = sizeof(CUDAFramesContext), 536 537 .device_create = cuda_device_create, 538 .device_derive = cuda_device_derive, 539 .device_init = cuda_device_init, 540 .device_uninit = cuda_device_uninit, 541 .frames_get_constraints = cuda_frames_get_constraints, 542 .frames_init = cuda_frames_init, 543 .frames_get_buffer = cuda_get_buffer, 544 .transfer_get_formats = cuda_transfer_get_formats, 545 .transfer_data_to = cuda_transfer_data, 546 .transfer_data_from = cuda_transfer_data, 547 548 .pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE }, 549}; 550