Files
Josh Allmann 4d9ab6275a Fix occasional DTS overlap
Fix an occasional DTS overlap by
closing the filtergraph after each
segment and re-creating it at the
beginning of each segment, instead
of attempting to persist the
filtergraph in between segments.

This overlap occurred mostly when
flip-flopping segments between transcoders,
or processing non-consecutive segments within
a single transcoder. This was due to drift in
adjusting input timestamps to match the fps
filter's expectation of mostly consecutive
timestamps while adjusting output timestamps
to remove accumulated delay from the filter.

There is roughly a 1% performance hit on my
machine from re-creating the filtergraph.

Because we are now resetting the filter after
each segment, we can remove a good chunk of
the special-cased timestamp handling code
before and after the filtergraph since
we no longer need to handle discontinuities
between segments.

However, we do need to keep some filter flushing
logic in order to accommodate low-fps or low-frame
content.

This does change our outputs, usually by one
fewer frame. Sometimes we seem to produce an
*additional* frame - it is unclear why. However,
as the test cases note, this actually clears up a
number of long-standing oddities around the expected
frame count, so it should be seen as an improvement.

---

It is important to note that while this fixes DTS
overlap in a (rather unpredictable) general case,
there is another overlap bug in one very specific case.

These are the conditions for the bug:

1. First and second segments of the stream are being
   processed. This could be the same transcoder or
   different ones.

2. The first segment starts at or near zero pts

3. mpegts is the output format

4. B-frames are being used

What happens is we may see DTS < PTS for the
very first frames in the very first segment,
potentially starting with PTS = 0, DTS < 0.
This is expected for B-frames.

However, if mpegts is in use, it cannot take negative
timestamps. To accommodate negative DTS, the muxer
will set PTS = -DTS, DTS = 0 and delay (offset) the
rest of the packets in the segment accordingly.

Unfortunately, subsequent transcodes will not know
about this delay! This typically leads to an overlap
between the first and second segments (but segments after
that would be fine).

The normal way to fix this would be to add a constant delay
to all segments - ffmpeg adds 1.4s to mpegts by default.

However, introducing a delay right now feels a little
odd since we don't really offer any other knobs to control
the timestamp (re-transcodes would accumulate the delay) and
there is some concern about falling out of sync with the
source segment since we have historically tried to make
timestamps follow the source as closely as possible.

So we're leaving this particular bug as-is for now.
There is some commented-out code that adds this delay
in case we feel that we would need it in the future.

Note that FFmpeg CLI also has the exact same problem
when the muxer delay is removed, so this is not a
LPMS-specific issue. This is exercised in the test cases.

Example of non-monotonic DTS after encoding and after muxing:

Segment.Frame | Encoder DTS | Encoder PTS | Muxer DTS | Muxer PTS
--------------|-------------|-------------|-----------|-----------
      1.1     |  -20        |    0        |    0      | 20
      1.2     |  -10        |   10        |   10      | 30
      1.3     |    0        |   20        |  *20*     | 40
      1.4     |   10        |   30        |  *30*     | 50
      2.1     |   20        |   40        |  *20*     | 40
      2.2     |   30        |   50        |  *30*     | 50
      2.3     |   40        |   60        |   40      | 60
2026-03-10 01:13:30 +00:00

401 lines
16 KiB
C

#include "filter.h"
#include "encoder.h"
#include "logging.h"
#include <libavfilter/buffersrc.h>
#include <libavfilter/buffersink.h>
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>
#include <assert.h>
/* Wire the graph endpoints ("in" -> buffer source, "out" -> buffer sink)
 * and hand the textual filter description to the avfilter parser.
 *
 * fctx holds the already-created src/sink filter contexts and the graph;
 * *inputs / *outputs are pre-allocated AVFilterInOut descriptors, which
 * avfilter_graph_parse_ptr may consume or reallocate (the caller frees
 * whatever remains). Returns 0 or a negative error code. */
int filtergraph_parser(struct filter_ctx *fctx, char* filters_descr, AVFilterInOut **inputs, AVFilterInOut **outputs)
{
    if (!fctx || !filters_descr || !inputs || !outputs) return -1;

    /* The buffer source feeds the first filter in filters_descr; since that
     * filter's input label is unspecified, it defaults to "in". */
    AVFilterInOut *out_ep = *outputs;
    out_ep->name       = av_strdup("in");
    out_ep->filter_ctx = fctx->src_ctx;
    out_ep->pad_idx    = 0;
    out_ep->next       = NULL;

    /* The buffer sink consumes the last filter's output; since that filter's
     * output label is unspecified, it defaults to "out". */
    AVFilterInOut *in_ep = *inputs;
    in_ep->name       = av_strdup("out");
    in_ep->filter_ctx = fctx->sink_ctx;
    in_ep->pad_idx    = 0;
    in_ep->next       = NULL;

    return avfilter_graph_parse_ptr(fctx->graph, filters_descr,
                                    inputs, outputs, NULL);
}
/* Build the per-output video filtergraph: buffersrc -> octx->vfilters -> buffersink.
 *
 * inf is the most recently decoded frame (may be NULL); when present with a
 * hw_frames_ctx it supplies the hardware frames pool for the buffer source,
 * since the decoder's own hw params may not be final yet (see XXX below).
 * No-op (returns 0) if the filter is already active or the output codec
 * does not require decoding. On success sets vf->active=1 and clears
 * vf->closed. Returns a negative AVERROR on failure; partially-allocated
 * state is left in vf for free_filter() to reclaim. */
int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx, AVFrame *inf)
{
char args[512];
int ret = 0;
const AVFilter *buffersrc = avfilter_get_by_name("buffer");
const AVFilter *buffersink = avfilter_get_by_name("buffersink");
AVFilterInOut *outputs = NULL;
AVFilterInOut *inputs = NULL;
AVRational time_base = ictx->ic->streams[ictx->vi]->time_base;
enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_YUV420P, AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE }; // XXX ensure the encoder allows this
struct filter_ctx *vf = &octx->vf;
char *filters_descr = octx->vfilters;
enum AVPixelFormat in_pix_fmt = ictx->vc->pix_fmt;
// no need for filters with the following conditions
if (vf->active) goto vf_init_cleanup; // already initialized
if (!needs_decoder(octx->video->name)) goto vf_init_cleanup;
outputs = avfilter_inout_alloc();
inputs = avfilter_inout_alloc();
// Reuse a previously allocated graph if one exists (e.g. after a partial
// init failure); only allocate when starting from scratch.
if (vf->graph == NULL) {
vf->graph = avfilter_graph_alloc();
}
if (!outputs || !inputs || !vf->graph) {
ret = AVERROR(ENOMEM);
LPMS_ERR(vf_init_cleanup, "Unable to allocate filters");
}
// Remember the input timebase; filtergraph_write relies on it for pts math.
vf->time_base = time_base;
// When decoding on hardware, the buffer source must be told the hw surface
// format rather than the decoder's sw pix_fmt. hw2pixfmt presumably maps
// the device type to its AV_PIX_FMT_* surface format — defined elsewhere.
if (ictx->vc->hw_device_ctx) in_pix_fmt = hw2pixfmt(ictx->vc);
/* buffer video source: the decoded frames from the decoder will be inserted here. */
snprintf(args, sizeof args,
"video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:pixel_aspect=%d/%d:colorspace=%s:range=%s",
ictx->vc->width, ictx->vc->height, in_pix_fmt,
time_base.num, time_base.den,
ictx->vc->sample_aspect_ratio.num, ictx->vc->sample_aspect_ratio.den,
av_color_space_name(ictx->vc->colorspace), av_color_range_name(ictx->vc->color_range));
ret = avfilter_graph_create_filter(&vf->src_ctx, buffersrc,
"in", args, NULL, vf->graph);
if (ret < 0) LPMS_ERR(vf_init_cleanup, "Cannot create video buffer source");
if (ictx->vc && ictx->vc->hw_frames_ctx) {
// XXX a bit problematic in that it's set before decoder is fully ready
// Prefer the frame's hw pool over the decoder's: the frame reflects the
// pool actually in use, which may differ once decoding has started.
AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();
AVBufferRef *hw_frames_ctx = inf && inf->hw_frames_ctx ? inf->hw_frames_ctx : ictx->vc->hw_frames_ctx;
srcpar->hw_frames_ctx = hw_frames_ctx;
// Keep our own ref so filtergraph_write can detect a changed pool later.
av_buffer_replace(&vf->hw_frames_ctx, hw_frames_ctx);
av_buffersrc_parameters_set(vf->src_ctx, srcpar);
av_freep(&srcpar);
}
/* buffer video sink: to terminate the filter chain. */
ret = avfilter_graph_create_filter(&vf->sink_ctx, buffersink,
"out", NULL, NULL, vf->graph);
if (ret < 0) LPMS_ERR(vf_init_cleanup, "Cannot create video buffer sink");
ret = av_opt_set_int_list(vf->sink_ctx, "pix_fmts", pix_fmts,
AV_PIX_FMT_NONE, AV_OPT_SEARCH_CHILDREN);
if (ret < 0) LPMS_ERR(vf_init_cleanup, "Cannot set output pixel format");
ret = filtergraph_parser(vf, filters_descr, &inputs, &outputs);
if (ret < 0) LPMS_ERR(vf_init_cleanup, "Unable to parse video filters desc");
ret = avfilter_graph_config(vf->graph, NULL);
if (ret < 0) LPMS_ERR(vf_init_cleanup, "Unable configure video filtergraph");
// Dump the configured graph for debugging; dump string is heap-allocated.
char *dumped_graph = avfilter_graph_dump(vf->graph, NULL);
LPMS_DEBUG("Initialized filtergraph: ");
if (dumped_graph) {
LPMS_DEBUG(dumped_graph);
av_freep(&dumped_graph);
}
vf->frame = av_frame_alloc();
if (!vf->frame) LPMS_ERR(vf_init_cleanup, "Unable to allocate video frame");
vf->active = 1;
vf->closed = 0;
vf_init_cleanup:
// Free whatever avfilter_graph_parse_ptr left unconsumed (NULL-safe).
avfilter_inout_free(&inputs);
avfilter_inout_free(&outputs);
return ret;
}
/* Build the per-output audio filtergraph:
 * abuffer -> aformat(fltp/stereo/44100) -> abuffersink.
 *
 * No-op (returns 0) if the filter is already active or the output audio
 * codec does not require decoding. On success sets af->active=1.
 * Returns a negative AVERROR on failure; partially-allocated state is
 * left in af for free_filter() to reclaim. */
int init_audio_filters(struct input_ctx *ictx, struct output_ctx *octx)
{
    int ret = 0;
    char args[512];
    char filters_descr[256];
    char channel_layout[256];
    const AVFilter *buffersrc = avfilter_get_by_name("abuffer");
    const AVFilter *buffersink = avfilter_get_by_name("abuffersink");
    AVFilterInOut *outputs = NULL;
    AVFilterInOut *inputs = NULL;
    struct filter_ctx *af = &octx->af;
    AVRational time_base = ictx->ic->streams[ictx->ai]->time_base;
    // no need for filters with the following conditions
    if (af->active) goto af_init_cleanup; // already initialized
    if (!needs_decoder(octx->audio->name)) goto af_init_cleanup;
    outputs = avfilter_inout_alloc();
    inputs = avfilter_inout_alloc();
    // Reuse an existing graph if one is already allocated; this matches
    // init_video_filters and avoids leaking a graph left behind by a
    // previous partially-failed initialization.
    if (af->graph == NULL) {
        af->graph = avfilter_graph_alloc();
    }
    if (!outputs || !inputs || !af->graph) {
        ret = AVERROR(ENOMEM);
        LPMS_ERR(af_init_cleanup, "Unable to allocate audio filters");
    }
    /* buffer audio source: the decoded frames from the decoder will be inserted here. */
    ret = av_channel_layout_describe(&ictx->ac->ch_layout, channel_layout, sizeof(channel_layout));
    if (ret < 0) LPMS_ERR(af_init_cleanup, "Unable to describe audio channel layout");
    snprintf(args, sizeof args,
        "sample_rate=%d:sample_fmt=%d:channel_layout=%s:channels=%d:"
        "time_base=%d/%d",
        ictx->ac->sample_rate, ictx->ac->sample_fmt, channel_layout,
        ictx->ac->ch_layout.nb_channels, time_base.num, time_base.den);
    // TODO set sample format and rate based on encoder support,
    // rather than hardcoding
    snprintf(filters_descr, sizeof filters_descr,
        "aformat=sample_fmts=fltp:channel_layouts=stereo:sample_rates=44100");
    ret = avfilter_graph_create_filter(&af->src_ctx, buffersrc,
        "in", args, NULL, af->graph);
    if (ret < 0) LPMS_ERR(af_init_cleanup, "Cannot create audio buffer source");
    /* buffer audio sink: to terminate the filter chain. */
    ret = avfilter_graph_create_filter(&af->sink_ctx, buffersink,
        "out", NULL, NULL, af->graph);
    if (ret < 0) LPMS_ERR(af_init_cleanup, "Cannot create audio buffer sink");
    ret = filtergraph_parser(af, filters_descr, &inputs, &outputs);
    if (ret < 0) LPMS_ERR(af_init_cleanup, "Unable to parse audio filters desc");
    ret = avfilter_graph_config(af->graph, NULL);
    if (ret < 0) LPMS_ERR(af_init_cleanup, "Unable to configure audio filtergraph");
    af->frame = av_frame_alloc();
    if (!af->frame) LPMS_ERR(af_init_cleanup, "Unable to allocate audio frame");
    af->active = 1;
af_init_cleanup:
    // Free whatever avfilter_graph_parse_ptr left unconsumed (NULL-safe).
    avfilter_inout_free(&inputs);
    avfilter_inout_free(&outputs);
    return ret;
}
/* Build the signature filtergraph (octx->sfilters) that taps the *encoded
 * output* side: buffersrc -> sfilters -> buffersink, configured from the
 * output codec context (octx->vc) rather than the input decoder.
 *
 * inf, when non-NULL with a hw_frames_ctx, supplies the hardware frames
 * pool; otherwise the encoder's pool is used if present. No-op (returns 0)
 * when sfilters is empty, the filter is already active, or the output codec
 * needs no decoding. Returns a negative AVERROR on failure. */
int init_signature_filters(struct output_ctx *octx, AVFrame *inf)
{
char args[512];
int ret = 0;
const AVFilter *buffersrc = avfilter_get_by_name("buffer");
const AVFilter *buffersink = avfilter_get_by_name("buffersink");
AVFilterInOut *outputs = NULL;
AVFilterInOut *inputs = NULL;
AVRational time_base = octx->oc->streams[0]->time_base;
enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_YUV420P, AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE }; // XXX ensure the encoder allows this
struct filter_ctx *sf = &octx->sf;
char *filters_descr = octx->sfilters;
enum AVPixelFormat in_pix_fmt = octx->vc->pix_fmt;
// nothing to do when no signature filter description was provided
if(octx->sfilters == NULL || strlen(octx->sfilters) <= 0) goto sf_init_cleanup;
// no need for filters with the following conditions
if (sf->active) goto sf_init_cleanup; // already initialized
if (!needs_decoder(octx->video->name)) goto sf_init_cleanup;
outputs = avfilter_inout_alloc();
inputs = avfilter_inout_alloc();
// NOTE(review): unlike init_video_filters, the graph is allocated
// unconditionally here — relies on sf->graph being NULL on entry.
sf->graph = avfilter_graph_alloc();
if (!outputs || !inputs || !sf->graph) {
ret = AVERROR(ENOMEM);
LPMS_ERR(sf_init_cleanup, "Unable to allocate filters");
}
// For hardware encoding, describe frames by their hw surface format.
if (octx->vc->hw_device_ctx) in_pix_fmt = hw2pixfmt(octx->vc);
/* buffer video source: the scaled frames from the decoder will be inserted here. */
snprintf(args, sizeof args,
"video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:pixel_aspect=%d/%d",
octx->vc->width, octx->vc->height, in_pix_fmt,
time_base.num, time_base.den,
octx->vc->sample_aspect_ratio.num, octx->vc->sample_aspect_ratio.den);
ret = avfilter_graph_create_filter(&sf->src_ctx, buffersrc,
"in", args, NULL, sf->graph);
if (ret < 0) LPMS_ERR(sf_init_cleanup, "Cannot create video buffer source");
// Prefer the incoming frame's hw frames pool; fall back to the encoder's.
// A ref is kept in sf->hw_frames_ctx for later pool-change detection.
if (octx->vc && inf && inf->hw_frames_ctx) {
AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();
srcpar->hw_frames_ctx = inf->hw_frames_ctx;
av_buffer_replace(&sf->hw_frames_ctx, inf->hw_frames_ctx);
av_buffersrc_parameters_set(sf->src_ctx, srcpar);
av_freep(&srcpar);
} else if (octx->vc && octx->vc->hw_frames_ctx) {
AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();
srcpar->hw_frames_ctx = octx->vc->hw_frames_ctx;
av_buffer_replace(&sf->hw_frames_ctx, octx->vc->hw_frames_ctx);
av_buffersrc_parameters_set(sf->src_ctx, srcpar);
av_freep(&srcpar);
}
/* buffer video sink: to terminate the filter chain. */
ret = avfilter_graph_create_filter(&sf->sink_ctx, buffersink,
"out", NULL, NULL, sf->graph);
if (ret < 0) LPMS_ERR(sf_init_cleanup, "Cannot create video buffer sink");
ret = av_opt_set_int_list(sf->sink_ctx, "pix_fmts", pix_fmts,
AV_PIX_FMT_NONE, AV_OPT_SEARCH_CHILDREN);
if (ret < 0) LPMS_ERR(sf_init_cleanup, "Cannot set output pixel format");
ret = filtergraph_parser(sf, filters_descr, &inputs, &outputs);
if (ret < 0) LPMS_ERR(sf_init_cleanup, "Unable to parse signature filters desc");
ret = avfilter_graph_config(sf->graph, NULL);
if (ret < 0) LPMS_ERR(sf_init_cleanup, "Unable configure signature filtergraph");
sf->frame = av_frame_alloc();
if (!sf->frame) LPMS_ERR(sf_init_cleanup, "Unable to allocate video frame");
sf->active = 1;
sf_init_cleanup:
// Free whatever avfilter_graph_parse_ptr left unconsumed (NULL-safe).
avfilter_inout_free(&inputs);
avfilter_inout_free(&outputs);
return ret;
}
/* Push one frame into the filtergraph, or signal a flush when inf is NULL.
 *
 * Handles three concerns:
 *  1. Reinit: if the frame's hw frames pool or dimensions no longer match
 *     the graph's buffer source, drain+encode the old graph, tear it down,
 *     and rebuild it against the new frame.
 *  2. Timestamps: tags each real frame's original pts in frame->opaque and
 *     tracks filter->custom_pts so flush frames can be stamped one frame
 *     step past the last real frame.
 *  3. Flush: on inf == NULL, either close the filter outright (frames were
 *     already produced) or re-feed the last decoded frame with a sentinel
 *     opaque of INT64_MIN so short/low-fps content still yields output.
 *
 * Returns 0 on success, a negative error on failure. */
int filtergraph_write(AVFrame *inf, struct input_ctx *ictx, struct output_ctx *octx, struct filter_ctx *filter, int is_video)
{
    if (filter->closed) return 0;
    int ret = 0;
    // We have to reset the filter because we initially set the filter
    // before the decoder is fully ready, and the decoder may change HW params
    // XXX: Unclear if this path is hit on all devices
    if (is_video && inf && (
        (inf->hw_frames_ctx && filter->hw_frames_ctx &&
         inf->hw_frames_ctx->data != filter->hw_frames_ctx->data) ||
        (filter->src_ctx->nb_outputs > 0 &&
         filter->src_ctx->outputs[0]->w != inf->width &&
         filter->src_ctx->outputs[0]->h != inf->height))) {
        // Drain the old graph: send EOF to the source, then read and encode
        // everything still buffered before tearing the graph down.
        ret = av_buffersrc_write_frame(filter->src_ctx, NULL);
        if (ret < 0) LPMS_ERR(fg_write_cleanup, "Error closing filter for reinit");
        while (!ret) {
            ret = filtergraph_read(ictx, octx, filter, is_video);
            if (AVERROR(EAGAIN) == ret || AVERROR_EOF == ret) break;
            AVFrame *frame = filter->frame;
            AVCodecContext *encoder = octx->vc;
            // TODO does clipping need to be handled?
            // TODO calculate signature?
            // Set GOP interval if necessary
            if (octx->gop_pts_len && frame && frame->pts >= octx->next_kf_pts) {
                frame->pict_type = AV_PICTURE_TYPE_I;
                octx->next_kf_pts = frame->pts + octx->gop_pts_len;
            }
            if (frame) {
                // rescale pts to match encoder timebase if necessary (eg, fps passthrough)
                AVRational filter_tb = av_buffersink_get_time_base(filter->sink_ctx);
                if (av_cmp_q(filter_tb, encoder->time_base)) {
                    frame->pts = av_rescale_q(frame->pts, filter_tb, encoder->time_base);
                    // TODO does frame->duration needs to be rescaled too?
                }
            }
            ret = encode(encoder, frame, octx, octx->oc->streams[octx->vi]);
            // BUGFIX: error out only on failure (ret < 0). The previous
            // `if (!ret)` fired on *success*, aborting the drain loop after
            // the first encoded frame and never reporting real errors.
            if (ret < 0) LPMS_ERR(fg_write_cleanup, "Encoder error during filter reinit");
        }
        // Rebuild the graph against the new frame's parameters.
        free_filter(&octx->vf);
        ret = init_video_filters(ictx, octx, inf);
        if (ret < 0) return lpms_ERR_FILTERS;
    }
    // Timestamp handling code
    AVStream *vst = ictx->ic->streams[ictx->vi];
    if (inf) { // Non-Flush Frame
        inf->opaque = (void *) inf->pts; // Store original PTS for calc later
        filter->custom_pts = inf->pts;
    } else if (!filter->flushed) { // Flush Frame
        // close filter right away if we already have some frames
        if (octx->res->frames) {
            filter->closed = 1;
            return av_buffersrc_write_frame(filter->src_ctx, NULL);
        }
        // we don't have frames yet so flush the filter
        // needed for extremely short or low-fps content
        int64_t ts_step;
        inf = (is_video) ? ictx->last_frame_v : ictx->last_frame_a;
        inf->opaque = (void *) (INT64_MIN); // Store INT64_MIN as pts for flush frames
        filter->flushing = 1;
        if (is_video && octx->fps.den) {
            // set ts_step to one frame (1/fps) in units of the output timebase
            ts_step = av_rescale_q_rnd(1, av_inv_q(octx->fps), vst->time_base, AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
        } else {
            // FPS Passthrough or Audio case - use packet duration instead of custom duration
            ts_step = inf->duration;
        }
        filter->custom_pts += ts_step;
    }
    if (inf) {
        // Apply the custom pts, then reset for the next output
        int64_t old_pts = inf->pts;
        inf->pts = filter->custom_pts;
        ret = av_buffersrc_write_frame(filter->src_ctx, inf);
        inf->pts = old_pts;
        if (ret < 0) LPMS_ERR(fg_write_cleanup, "Error feeding the filtergraph");
    }
fg_write_cleanup:
    return ret;
}
int filtergraph_read(struct input_ctx *ictx, struct output_ctx *octx, struct filter_ctx *filter, int is_video)
{
AVFrame *frame = filter->frame;
av_frame_unref(frame);
int ret = av_buffersink_get_frame(filter->sink_ctx, frame);
frame->pict_type = AV_PICTURE_TYPE_NONE;
if (AVERROR(EAGAIN) == ret || AVERROR_EOF == ret) return ret;
else if (ret < 0) LPMS_ERR(fg_read_cleanup, "Error consuming the filtergraph");
if (frame && ((int64_t) frame->opaque == INT64_MIN)) {
// opaque being INT64_MIN means it's a flush packet
// don't set flushed flag in case this is a flush from a previous segment
if (filter->flushing) filter->flushed = 1;
ret = lpms_ERR_FILTER_FLUSHED;
}
fg_read_cleanup:
return ret;
}
/* Release everything owned by a filter_ctx and zero it so it can be
 * re-initialized. Safe to call on an already-freed (zeroed) context,
 * and now also on a NULL pointer. */
void free_filter(struct filter_ctx *filter)
{
    if (!filter) return;
    // av_frame_free, avfilter_graph_free and av_buffer_unref are all
    // documented as no-ops when the pointed-to pointer is NULL, so the
    // previous per-field guards were redundant.
    av_frame_free(&filter->frame);
    avfilter_graph_free(&filter->graph);
    av_buffer_unref(&filter->hw_frames_ctx);
    memset(filter, 0, sizeof *filter);
}