Untitled

======== encoder.cpp ======
 (There is an 'encoder.h' that looks just like you would expect)

#include "encoder.h"
#include <algorithm>
#include <iterator>

extern "C"
{
#include "libavcodec/avcodec.h"
#include "libavdevice/avdevice.h"
#include "libavfilter/avfilter.h"
#include "libavformat/avformat.h"
#include "libavutil/avutil.h"
#include "libavutil/imgutils.h"
#include "libswscale/swscale.h"
#include "libswresample/swresample.h"

/* Status codes returned by the functions below; 0 means success. */

/* Failures reported by initialize_encoding_def(). */
#define RES_NOT_MUL_OF_TWO 1
#define COULD_NOT_FIND_VID_CODEC 2
#define CONTEXT_CREATION_ERROR 3
#define COULD_NOT_OPEN_VID_CODEC 4
#define COULD_NOT_OPEN_FILE 5
#define COULD_NOT_ALLOCATE_FRAME 6
#define COULD_NOT_ALLOCATE_PIC_BUF 7
/* Failures while sending a frame to / receiving packets from an encoder. */
#define ERROR_ENCODING_FRAME_SEND 8
#define ERROR_ENCODING_FRAME_RECEIVE 9
/* Audio set-up failures reported by initialize_encoding_def(). */
#define COULD_NOT_FIND_AUD_CODEC 10
#define COULD_NOT_OPEN_AUD_CODEC 11
/* Resampling / sample-buffer failures. */
#define COULD_NOT_ALL_RESMPL_CONTEXT 12
#define FAILED_TO_INIT_RESMPL_CONTEXT 13
#define COULD_NOT_ALLOC_SAMPLES 14
#define COULD_NOT_CONVERT_AUD 15
/* Failures reported by write_interleaved_audio_samples(). */
#define ERROR_ENCODING_SAMPLES_SEND 16
#define ERROR_ENCODING_SAMPLES_RECEIVE 17
/* Non-error results: which kind of frame encode_frame_with_audio() consumed. */
#define ENCODED_VIDEO 18
#define ENCODED_AUDIO 19
#define ENCODED_AUDIO_AND_VIDEO 20

    AVCodec *vid_codec, *aud_codec;
    AVCodecContext *vid_codec_context = NULL;
    AVCodecContext *aud_codec_context = NULL;
    AVFormatContext *outctx;
    AVStream *video_st, *audio_st;
    AVFrame *vid_frame, *aud_frame;
    SwsContext *sws_ctx;
    SwrContext *swr_ctx = NULL;

    int vid_frame_counter, aud_frame_counter;
    int vid_width, vid_height;

    // Audio converting
    //uint8_t **src_samples_data;
    int src_samples_linesize;
    //int src_nb_samples;
    int max_dst_nb_samples;

    uint8_t **dst_samples_data;
    int dst_samples_linesize;
    int dst_samples_size;

    int initialize_encoding_def(int width, int height, int frame_rate, int bitrate, const char *filename)
    {
        if (width % 2 != 0 || height % 2 != 0)
            return RES_NOT_MUL_OF_TWO;

        AVFormatContext *oc;

        AVCodecID vid_codec_id = AV_CODEC_ID_H264;
        AVCodecID aud_codec_id = AV_CODEC_ID_AAC;
        AVPixelFormat px_format = AV_PIX_FMT_YUV420P;
        AVSampleFormat sample_fmt = AV_SAMPLE_FMT_FLTP;

        vid_width = width;
        vid_height = height;

        int ret;

        avcodec_register_all();
        av_register_all();

        // Fixup video codec
        vid_codec = avcodec_find_encoder(vid_codec_id);
        avcodec_register(vid_codec);

        if (!vid_codec)
            return COULD_NOT_FIND_VID_CODEC;

        vid_codec_context = avcodec_alloc_context3(vid_codec);
        if (!vid_codec_context)
            return CONTEXT_CREATION_ERROR;

        vid_codec_context->bit_rate = bitrate;
        vid_codec_context->width = width;
        vid_codec_context->height = height;

        AVRational time_base;
        time_base.num = 1;
        time_base.den = frame_rate;
        vid_codec_context->time_base = time_base;

        vid_codec_context->gop_size = 10;
        vid_codec_context->max_b_frames = 1;
        vid_codec_context->pix_fmt = px_format;
        ret = av_opt_set(vid_codec_context->priv_data, "preset", "slow", 0);

        ret = avcodec_open2(vid_codec_context, vid_codec, NULL);
        if (ret < 0)
            return COULD_NOT_OPEN_VID_CODEC;

        outctx = avformat_alloc_context();
        ret = avformat_alloc_output_context2(&outctx, NULL, "mp4", filename);

        outctx->video_codec = vid_codec;
        outctx->video_codec_id = vid_codec_id;

        video_st = avformat_new_stream(outctx, vid_codec);

        video_st->codecpar->width = width;
        video_st->codecpar->height = height;
        video_st->codecpar->codec_id = vid_codec->id;
        video_st->codecpar->codec_type = vid_codec->type;
        video_st->codecpar->format = px_format;
        video_st->codecpar->bit_rate = bitrate;
        video_st->time_base = time_base;

        vid_frame = av_frame_alloc();

        if (!vid_frame)
            return COULD_NOT_ALLOCATE_FRAME;

        vid_frame->format = vid_codec_context->pix_fmt;
        vid_frame->width = vid_codec_context->width;
        vid_frame->height = vid_codec_context->height;

        ret = av_image_alloc(vid_frame->data, vid_frame->linesize, vid_codec_context->width, vid_codec_context->height, vid_codec_context->pix_fmt, 32);
        if (ret < 0)
            return COULD_NOT_ALLOCATE_PIC_BUF;

        sws_ctx = sws_getContext(vid_codec_context->width, vid_codec_context->height,
            AV_PIX_FMT_RGB24, vid_codec_context->width, vid_codec_context->height,
            AV_PIX_FMT_YUV420P, 0, 0, 0, 0);

        vid_frame_counter = 0;

        // Fixup audio codec
        aud_codec = avcodec_find_encoder(aud_codec_id);
        avcodec_register(aud_codec);

        if (!aud_codec)
            return COULD_NOT_FIND_AUD_CODEC;

        aud_codec_context = avcodec_alloc_context3(aud_codec);
        if (!aud_codec_context)
            return CONTEXT_CREATION_ERROR;

        /* select other audio parameters supported by the encoder */
        aud_codec_context->bit_rate = 192000;
        aud_codec_context->sample_rate = 48000;
        aud_codec_context->sample_fmt = sample_fmt;
        aud_codec_context->channel_layout = AV_CH_LAYOUT_STEREO;
        aud_codec_context->channels = av_get_channel_layout_nb_channels(aud_codec_context->channel_layout);
        //aud_codec_context->profile = FF_PROFILE_AAC_MAIN;

        aud_codec_context->codec = aud_codec;
        aud_codec_context->codec_id = aud_codec_id;

        ret = avcodec_open2(aud_codec_context, aud_codec, NULL);

        if (ret < 0)
            return COULD_NOT_OPEN_AUD_CODEC;

        outctx->audio_codec = aud_codec;
        outctx->audio_codec_id = aud_codec_id;

        audio_st = avformat_new_stream(outctx, aud_codec);

        audio_st->codecpar->bit_rate = aud_codec_context->bit_rate;
        audio_st->codecpar->sample_rate = aud_codec_context->sample_rate;
        audio_st->codecpar->channels = aud_codec_context->channels;
        audio_st->codecpar->channel_layout = aud_codec_context->channel_layout;
        audio_st->codecpar->codec_id = aud_codec_id;
        audio_st->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
        audio_st->codecpar->format = sample_fmt;
        audio_st->codecpar->frame_size = aud_codec_context->frame_size;
        audio_st->codecpar->block_align = aud_codec_context->block_align;
        audio_st->codecpar->initial_padding = aud_codec_context->initial_padding;

        //audio_st->codec->frame_size

        av_dump_format(outctx, 0, filename, 1);

        if (!(outctx->oformat->flags & AVFMT_NOFILE))
        {
            if (avio_open(&outctx->pb, filename, AVIO_FLAG_WRITE) < 0)
                return COULD_NOT_OPEN_FILE;
        }

        ret = avformat_write_header(outctx, NULL);

        aud_frame = av_frame_alloc();
        aud_frame->nb_samples = aud_codec_context->frame_size;
        aud_frame->format = aud_codec_context->sample_fmt;
        aud_frame->channel_layout = aud_codec_context->channel_layout;

        int buffer_size = av_samples_get_buffer_size(NULL, aud_codec_context->channels, aud_codec_context->frame_size,
            aud_codec_context->sample_fmt, 0);

        av_frame_get_buffer(aud_frame, buffer_size / 2);

        if (!aud_frame)
            return COULD_NOT_ALLOCATE_FRAME;

        aud_frame_counter = 0;

        return 0;
    }

    int write_interleaved_audio_samples(uint8_t **aud_samples)
    {
        int ret;

        aud_frame->data[0] = aud_samples[0];
        aud_frame->data[1] = aud_samples[1];

        aud_frame->pts = aud_frame_counter++;

        ret = avcodec_send_frame(aud_codec_context, aud_frame);
        if (ret < 0)
            return ERROR_ENCODING_SAMPLES_SEND;

        AVPacket pkt;
        av_init_packet(&pkt);
        pkt.data = NULL;
        pkt.size = 0;

        fflush(stdout);

        while (true)
        {
            ret = avcodec_receive_packet(aud_codec_context, &pkt);
            if (!ret)
            {
                av_packet_rescale_ts(&pkt, aud_codec_context->time_base, audio_st->time_base);

                pkt.stream_index = audio_st->index;
                av_interleaved_write_frame(outctx, &pkt);
                av_packet_unref(&pkt);
            }
            if (ret == AVERROR(EAGAIN))
                break;
            else if (ret < 0)
                return ERROR_ENCODING_SAMPLES_RECEIVE;
            else
                break;
        }

        return 0;
    }

    int write_interleaved_video_frame(uint8_t* rgb24Data, int skipped_frames)
    {
        uint8_t * inData[1] = { rgb24Data };
        int inLinesize[1] = { 3 * vid_codec_context->width };

        // Flips image
        inData[0] += (vid_height - 1) * inLinesize[0];
        inLinesize[0] *= -1;

        sws_scale(sws_ctx, inData, inLinesize, 0, vid_codec_context->height, vid_frame->data, vid_frame->linesize); // From RGB to YUV

        vid_frame_counter += skipped_frames;
        vid_frame->pts = vid_frame_counter++;

        int ret = avcodec_send_frame(vid_codec_context, vid_frame);
        if (ret < 0)
            return ERROR_ENCODING_FRAME_SEND;

        AVPacket pkt;
        av_init_packet(&pkt);
        pkt.data = NULL;
        pkt.size = 0;

        fflush(stdout);

        while (true)
        {
            ret = avcodec_receive_packet(vid_codec_context, &pkt);
            if (!ret)
            {
                av_packet_rescale_ts(&pkt, vid_codec_context->time_base, video_st->time_base);

                pkt.stream_index = video_st->index;
                av_interleaved_write_frame(outctx, &pkt);
                av_packet_unref(&pkt);
            }
            if (ret == AVERROR(EAGAIN))
                break;
            else if (ret < 0)
                return ERROR_ENCODING_FRAME_RECEIVE;
            else
                break;
        }

        return 0;
    }

    int encode_frame_with_audio(uint8_t* rgb24Data, uint8_t **aud_samples, int skipped_frames)
    {
        int ret;

        double audio_time = audio_st ? av_stream_get_end_pts(audio_st) * av_q2d(audio_st->time_base) : 0.0;
        double video_time = video_st ? av_stream_get_end_pts(video_st) * av_q2d(video_st->time_base) : 0.0;

        if (!video_st || (video_st && audio_st && audio_time < video_time))
        {
            ret = write_interleaved_audio_samples(aud_samples);
            if (ret != 0)
                return ret;

            return ENCODED_AUDIO;
        }
        else
        {
            ret = write_interleaved_video_frame(rgb24Data, skipped_frames);
            if (ret != 0)
                return ret;

            return ENCODED_VIDEO;
        }
    }

    int finish_video_and_audio_encoding()
    {
        AVPacket pkt;
        av_init_packet(&pkt);
        pkt.data = NULL;
        pkt.size = 0;

        fflush(stdout);

        int ret = avcodec_send_frame(aud_codec_context, NULL);
        if (ret < 0)
            return ERROR_ENCODING_FRAME_SEND;

        while (true)
        {
            ret = avcodec_receive_packet(aud_codec_context, &pkt);
            if (!ret)
            {
                av_packet_rescale_ts(&pkt, aud_codec_context->time_base, audio_st->time_base);

                pkt.stream_index = audio_st->index;
                av_interleaved_write_frame(outctx, &pkt);
                av_packet_unref(&pkt);
            }
            if (ret == AVERROR_EOF)
                break;
            else if (ret < 0)
                return ERROR_ENCODING_FRAME_RECEIVE;
        }

        av_init_packet(&pkt);
        pkt.data = NULL;
        pkt.size = 0;

        ret = avcodec_send_frame(vid_codec_context, NULL);
        if (ret < 0)
            return ERROR_ENCODING_FRAME_SEND;

        while (true)
        {
            ret = avcodec_receive_packet(vid_codec_context, &pkt);
            if (!ret)
            {
                av_packet_rescale_ts(&pkt, vid_codec_context->time_base, video_st->time_base);

                pkt.stream_index = audio_st->index;
                av_interleaved_write_frame(outctx, &pkt);
                av_packet_unref(&pkt);
            }
            if (ret == AVERROR_EOF)
                break;
            else if (ret < 0)
                return ERROR_ENCODING_FRAME_RECEIVE;
        }


        av_write_trailer(outctx);
    }

    void cleanup()
    {
        if (vid_frame)
        {
            av_frame_free(&vid_frame);
        }
        if (aud_frame)
        {
            av_frame_free(&aud_frame);
        }
        if (outctx)
        {
            for (int i = 0; i < outctx->nb_streams; i++)
                av_freep(&outctx->streams[i]);

            avio_close(outctx->pb);
            av_free(outctx);
        }

        if (aud_codec_context)
        {
            avcodec_close(aud_codec_context);
            av_free(aud_codec_context);
        }

        if (vid_codec_context)
        {
            avcodec_close(vid_codec_context);
            av_free(vid_codec_context);
        }
    }
}


===== main.cpp =====

#include "encoder.h"
extern "C"
{
#include "libavformat/avformat.h"
}
#include <math.h>

/*
 * Fills a packed RGB24 picture with a uniform grey whose brightness grows
 * linearly with the frame index, from black at frame 0 to white at the last
 * frame of the clip.
 *
 *   rgb24Data    Destination buffer of at least width * height * 3 bytes.
 *   width/height Picture size in pixels; nothing is written if either is <= 0.
 *   frame_Index  Index of the frame being generated.
 *   frame_rate   Frames per second of the clip.
 *   sec          Length of the clip in seconds.
 *
 * Indices outside the clip are clamped to black / white, and a clip length
 * of zero or less yields black instead of dividing by zero.
 */
void get_frame(uint8_t* rgb24Data, int width, int height, int frame_Index, int frame_rate, int sec)
{
    if (!rgb24Data || width <= 0 || height <= 0)
        return;

    // The grey level is the same for every pixel, so compute it once.
    const int total_frames = frame_rate * sec;
    float progress = 0.0f;
    if (total_frames > 0)
        progress = (float)frame_Index / (float)total_frames;
    if (progress < 0.0f)
        progress = 0.0f;
    if (progress > 1.0f)
        progress = 1.0f;

    const uint8_t grey = (uint8_t)(int)(progress * 255);

    // All three channels of all pixels get the same value, so the picture
    // can be filled as one flat run of bytes.
    const size_t byte_count = (size_t)width * (size_t)height * 3;
    for (size_t k = 0; k < byte_count; k++)
        rgb24Data[k] = grey;
}

/*
 * Writes frame_size samples of a half-amplitude sine wave to both channel
 * buffers.
 *
 *   left_samples, right_samples  Destination buffers, frame_size samples each.
 *   frame_size                   Number of samples to generate.
 *   t                            Phase in radians; read for every sample and
 *                                advanced by (pi/2 * 220 / frame_size) after
 *                                each one, so it carries over between calls.
 */
void get_audio_frame(float_t *left_samples, float_t *right_samples, int frame_size, float* t)
{
    for (int n = 0; n < frame_size; ++n)
    {
        const float sample = sin(*t) * 0.5;

        left_samples[n] = sample;
        right_samples[n] = sample;

        *t += M_PI_2 * 220 / frame_size;
    }
}

int main()
{
    int width = 640;
    int height = 480;
    int frame_rate = 30;
    float t = 0;

    initialize_encoding_def(width, height, frame_rate, 1000000, "movie.mp4");

    int sec = 50;

    int tot = 3 * width * height;
    uint8_t* rgb24Data = new uint8_t[tot];
    float_t** aud_samples;
    int src_samples_linesize;
    int src_nb_samples = 1024;
    int src_channels = 2;

    int ret = av_samples_alloc_array_and_samples((uint8_t***)&aud_samples, &src_samples_linesize, src_channels,
        src_nb_samples, AV_SAMPLE_FMT_FLTP, 0);

    ret = 20;
    for (size_t i = 0; i < frame_rate * sec; i++)
    {
        if (ret == 20 || ret == 19)
            get_audio_frame(aud_samples[0], aud_samples[1], src_nb_samples, &t);
        if (ret == 20 || ret == 18)
            get_frame(rgb24Data, width, height, i, frame_rate, sec);

        ret = encode_frame_with_audio(rgb24Data, (uint8_t **)aud_samples, 0);
    }

    finish_video_and_audio_encoding();
    cleanup();

    return 0;
}