From 788ce5e490593d1c80b59224306e12931c4c4950 Mon Sep 17 00:00:00 2001
From: Stephen Seo <seo.disparate@gmail.com>
Date: Fri, 3 Dec 2021 20:09:56 +0900
Subject: [PATCH] Impl output to video, update program args

Turns out output to video suffers from loss of quality problems when in
color (not grayscale). It may be better in some cases to just output
each frame as a png and combine them later with ffmpeg, as described
here: https://trac.ffmpeg.org/wiki/Slideshow . Grayscale video
is noticeably better, but both cases result in large video sizes, so care
may be needed if free disk-space is scarce.
---
 src/arg_parse.cc |   7 +-
 src/arg_parse.h  |   1 +
 src/main.cc      |   3 +-
 src/video.cc     | 572 +++++++++++++++++++++++++++++++++++++----------
 src/video.h      |  32 ++-
 5 files changed, 489 insertions(+), 126 deletions(-)
diff --git a/src/arg_parse.cc b/src/arg_parse.cc
index b61c9fa..67f904c 100644
--- a/src/arg_parse.cc
+++ b/src/arg_parse.cc
@@ -14,7 +14,8 @@ void Args::PrintUsage() {
   std::cout
       << "Usage: [-h | --help] [-i <filename> | --input <filename>] [-o "
          "<filename> | --output <filename>] [-b <filename> | --blue "
-         "<filename>] [-g | --gray] [--image] [--video] [--overwrite]\n"
+         "<filename>] [-g | --gray] [--image] [--video] [--video-pngs] "
+         "[--overwrite]\n"
          "  -h | --help\t\t\t\tPrint this usage text\n"
          "  -i <filename> | --input <filename>\tSet input filename\n"
          "  -o <filename> | --output <filename>\tSet output filename\n"
@@ -22,6 +23,7 @@ void Args::PrintUsage() {
          "  -g | --gray\t\t\t\tDither output in grayscale\n"
          "  --image\t\t\t\tDither a single image\n"
          "  --video\t\t\t\tDither frames in a video\n"
+         "  --video-pngs\t\t\t\tDither frames but output as individual pngs\n"
          "  --overwrite\t\t\t\tAllow overwriting existing files\n"
       << std::endl;
 }
@@ -56,6 +58,9 @@ bool Args::ParseArgs(int argc, char **argv) {
       do_dither_image_ = true;
     } else if (std::strcmp(argv[0], "--video") == 0) {
       do_dither_image_ = false;
+    } else if (std::strcmp(argv[0], "--video-pngs") == 0) {
+      do_dither_image_ = false;
+      do_video_pngs_ = true;
     } else if (std::strcmp(argv[0], "--overwrite") == 0) {
       do_overwrite_ = true;
     } else {
diff --git a/src/arg_parse.h b/src/arg_parse.h
index 5f62494..0679d24 100644
--- a/src/arg_parse.h
+++ b/src/arg_parse.h
@@ -14,6 +14,7 @@ struct Args {
   bool do_dither_image_;
   bool do_dither_grayscaled_;
   bool do_overwrite_;
+  bool do_video_pngs_;
   std::string input_filename;
   std::string output_filename;
   std::string blue_noise_filename;
diff --git a/src/main.cc b/src/main.cc
index e6c1720..0724fd3 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -60,7 +60,8 @@ int main(int argc, char **argv) {
   } else {
     Video video(args.input_filename);
     if (!video.DitherVideo(args.output_filename, &blue_noise,
-                           args.do_dither_grayscaled_, args.do_overwrite_)) {
+                           args.do_dither_grayscaled_, args.do_overwrite_,
+                           args.do_video_pngs_)) {
       std::cout << "ERROR: Failed to dither frames from input video \""
                 << args.input_filename << '"' << std::endl;
       Args::PrintUsage();
diff --git a/src/video.cc b/src/video.cc
index a8984aa..aac4f59 100644
--- a/src/video.cc
+++ b/src/video.cc
@@ -1,42 +1,63 @@
 #include "video.h"
 
+#include <cmath>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <iostream>
 
-extern "C" {
-#include <libavformat/avformat.h>
-}
-
 Video::Video(const char *video_filename) : Video(std::string(video_filename)) {}
 
 Video::Video(const std::string &video_filename)
     : image_(),
       input_filename_(video_filename),
-      sws_context_(nullptr),
+      sws_dec_context_(nullptr),
+      sws_enc_context_(nullptr),
       frame_count_(0),
-      packet_count_(0) {}
+      packet_count_(0),
+      was_grayscale_(false) {}
 
 Video::~Video() {
-  if (sws_context_ != nullptr) {
-    sws_freeContext(sws_context_);
+  if (sws_dec_context_ != nullptr) {
+    sws_freeContext(sws_dec_context_);
   }
 }
 
 bool Video::DitherVideo(const char *output_filename, Image *blue_noise,
-                        bool grayscale, bool overwrite) {
+                        bool grayscale, bool overwrite, bool output_as_pngs) {
   return DitherVideo(std::string(output_filename), blue_noise, grayscale,
-                     overwrite);
+                     overwrite, output_as_pngs);
 }
 
 bool Video::DitherVideo(const std::string &output_filename, Image *blue_noise,
-                        bool grayscale, bool overwrite) {
+                        bool grayscale, bool overwrite, bool output_as_pngs) {
+  if (!overwrite && !output_as_pngs) {
+    // check if output_file exists
+    std::ifstream ifs(output_filename);
+    if (ifs.is_open()) {
+      std::cout << "ERROR: output file \"" << output_filename
+                << "\" exists "
+                   "and overwrite is disabled"
+                << std::endl;
+      return false;
+    }
+  }
+
+  frame_count_ = 0;
+
+  bool color_changed = false;
+  if (was_grayscale_ != grayscale) {
+    color_changed = true;
+  }
+  was_grayscale_ = grayscale;
+
+  // set up decoding
+
   // Get AVFormatContext for input file
-  AVFormatContext *avf_context = nullptr;
+  AVFormatContext *avf_dec_context = nullptr;
   std::string url = std::string("file:") + input_filename_;
   int return_value =
-      avformat_open_input(&avf_context, url.c_str(), nullptr, nullptr);
+      avformat_open_input(&avf_dec_context, url.c_str(), nullptr, nullptr);
   if (return_value != 0) {
     std::cout << "ERROR: Failed to open input file to determine format"
               << std::endl;
@@ -44,60 +65,70 @@ bool Video::DitherVideo(const std::string &output_filename, Image *blue_noise,
   }
 
   // Read from input file to fill in info in AVFormatContext
-  return_value = avformat_find_stream_info(avf_context, nullptr);
+  return_value = avformat_find_stream_info(avf_dec_context, nullptr);
   if (return_value < 0) {
     std::cout << "ERROR: Failed to determine input file stream info"
               << std::endl;
-    avformat_close_input(&avf_context);
+    avformat_close_input(&avf_dec_context);
     return false;
   }
 
   // Get "best" video stream
-  AVCodec *avcodec = nullptr;
+  AVCodec *dec_codec = nullptr;
   return_value = av_find_best_stream(
-      avf_context, AVMediaType::AVMEDIA_TYPE_VIDEO, -1, -1, &avcodec, 0);
+      avf_dec_context, AVMediaType::AVMEDIA_TYPE_VIDEO, -1, -1, &dec_codec, 0);
   if (return_value < 0) {
     std::cout << "ERROR: Failed to get video stream in input file" << std::endl;
-    avformat_close_input(&avf_context);
+    avformat_close_input(&avf_dec_context);
     return false;
   }
   int video_stream_idx = return_value;
 
   // Alloc codec context
-  AVCodecContext *codec_ctx = avcodec_alloc_context3(avcodec);
+  AVCodecContext *codec_ctx = avcodec_alloc_context3(dec_codec);
   if (!codec_ctx) {
     std::cout << "ERROR: Failed to alloc codec context" << std::endl;
-    avformat_close_input(&avf_context);
+    avformat_close_input(&avf_dec_context);
     return false;
   }
 
   // Set codec parameters from input stream
   return_value = avcodec_parameters_to_context(
-      codec_ctx, avf_context->streams[video_stream_idx]->codecpar);
+      codec_ctx, avf_dec_context->streams[video_stream_idx]->codecpar);
   if (return_value < 0) {
     std::cout << "ERROR: Failed to set codec parameters from input stream"
               << std::endl;
     avcodec_free_context(&codec_ctx);
-    avformat_close_input(&avf_context);
+    avformat_close_input(&avf_dec_context);
     return false;
   }
 
   // Init codec context
-  return_value = avcodec_open2(codec_ctx, avcodec, nullptr);
+  return_value = avcodec_open2(codec_ctx, dec_codec, nullptr);
   if (return_value < 0) {
     std::cout << "ERROR: Failed to init codec context" << std::endl;
     avcodec_free_context(&codec_ctx);
-    avformat_close_input(&avf_context);
+    avformat_close_input(&avf_dec_context);
     return false;
   }
 
-  av_dump_format(avf_context, video_stream_idx, input_filename_.c_str(), 0);
+  std::cout << "Dumping input video format info..." << std::endl;
+  av_dump_format(avf_dec_context, video_stream_idx, input_filename_.c_str(), 0);
+
+  // get input stream info
+  unsigned int width =
+      avf_dec_context->streams[video_stream_idx]->codecpar->width;
+  unsigned int height =
+      avf_dec_context->streams[video_stream_idx]->codecpar->height;
+  auto r_frame_rate = avf_dec_context->streams[video_stream_idx]->r_frame_rate;
+  decltype(r_frame_rate) time_base = {r_frame_rate.den, r_frame_rate.num};
 
   // Alloc a packet object for reading packets
   AVPacket *pkt = av_packet_alloc();
   if (!pkt) {
     std::cout << "ERROR: Failed to alloc an AVPacket" << std::endl;
     avcodec_free_context(&codec_ctx);
+    avformat_close_input(&avf_dec_context);
     return false;
   }
 
@@ -107,162 +138,473 @@ bool Video::DitherVideo(const std::string &output_filename, Image *blue_noise,
     std::cout << "ERROR: Failed to alloc video frame object" << std::endl;
     av_packet_free(&pkt);
     avcodec_free_context(&codec_ctx);
+    avformat_close_input(&avf_dec_context);
     return false;
   }
 
-  // read frames
-  while (av_read_frame(avf_context, pkt) >= 0) {
-    if (pkt->stream_index == video_stream_idx) {
-      if (!HandleDecodingPacket(codec_ctx, pkt, frame, blue_noise, grayscale,
-                                overwrite)) {
+  // Set up encoding
+
+  // alloc/init encoding AVFormatContext
+  AVFormatContext *avf_enc_context = nullptr;
+  if (!output_as_pngs) {
+    return_value = avformat_alloc_output_context2(
+        &avf_enc_context, nullptr, nullptr, output_filename.c_str());
+    if (return_value < 0) {
+      std::cout << "ERROR: Failed to alloc/init avf_enc_context" << std::endl;
+      av_frame_free(&frame);
+      av_packet_free(&pkt);
+      avcodec_free_context(&codec_ctx);
+      avformat_close_input(&avf_dec_context);
+      return false;
+    }
+  }
+
+  // set output video codec (h264)
+  AVCodecContext *enc_codec_context = nullptr;
+  AVCodec *enc_codec = nullptr;
+
+  // get H264 codec
+  if (!output_as_pngs) {
+    enc_codec = avcodec_find_encoder(AVCodecID::AV_CODEC_ID_H264);
+    if (enc_codec == nullptr) {
+      std::cout << "ERROR: Failed to get H264 codec for encoding" << std::endl;
+      avformat_free_context(avf_enc_context);
+      av_frame_free(&frame);
+      av_packet_free(&pkt);
+      avcodec_free_context(&codec_ctx);
+      avformat_close_input(&avf_dec_context);
+      return false;
+    }
+  }
+
+  // create new video stream
+  AVStream *enc_stream = nullptr;
+  if (!output_as_pngs) {
+    enc_stream = avformat_new_stream(avf_enc_context, enc_codec);
+    if (enc_stream == nullptr) {
+      std::cout << "ERROR: Failed to create encoding stream" << std::endl;
+      avformat_free_context(avf_enc_context);
+      av_frame_free(&frame);
+      av_packet_free(&pkt);
+      avcodec_free_context(&codec_ctx);
+      avformat_close_input(&avf_dec_context);
+      return false;
+    }
+    // assign its id
+    enc_stream->id = avf_enc_context->nb_streams - 1;
+    // alloc enc AVCodecContext
+    enc_codec_context = avcodec_alloc_context3(enc_codec);
+    if (enc_codec_context == nullptr) {
+      std::cout << "ERROR: Failed to create AVCodecContext for encoding"
+                << std::endl;
+      avformat_free_context(avf_enc_context);
+      av_frame_free(&frame);
+      av_packet_free(&pkt);
+      avcodec_free_context(&codec_ctx);
+      avformat_close_input(&avf_dec_context);
+      return false;
+    }
+
+    // set values on enc_codec_context
+    enc_codec_context->codec_id = AVCodecID::AV_CODEC_ID_H264;
+    enc_codec_context->bit_rate = kOutputBitrate;
+    enc_codec_context->width = width;
+    enc_codec_context->height = height;
+    enc_stream->time_base = time_base;
+    enc_codec_context->time_base = time_base;
+    enc_codec_context->gop_size = 12;
+    enc_codec_context->pix_fmt = AVPixelFormat::AV_PIX_FMT_YUV444P;
+    if (avf_enc_context->oformat->flags & AVFMT_GLOBALHEADER) {
+      enc_codec_context->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+    }
+
+    // more init on enc_codec_context
+    return_value = avcodec_open2(enc_codec_context, enc_codec, nullptr);
+    if (return_value != 0) {
+      std::cout << "ERROR: Failed to init enc_codec_context" << std::endl;
+      avcodec_close(enc_codec_context);
+      avformat_free_context(avf_enc_context);
+      av_frame_free(&frame);
+      av_packet_free(&pkt);
+      avcodec_free_context(&codec_ctx);
+      avformat_close_input(&avf_dec_context);
+      return false;
+    }
+
+    return_value = avcodec_parameters_from_context(enc_stream->codecpar,
+                                                   enc_codec_context);
+    if (return_value < 0) {
+      std::cout << "ERROR: Failed to set encoding codec parameters in stream"
+                << std::endl;
+      avcodec_close(enc_codec_context);
+      avformat_free_context(avf_enc_context);
+      av_frame_free(&frame);
+      av_packet_free(&pkt);
+      avcodec_free_context(&codec_ctx);
+      avformat_close_input(&avf_dec_context);
+      return false;
+    }
+
+    std::cout << "Dumping output video format info..." << std::endl;
+    av_dump_format(avf_enc_context, enc_stream->id, output_filename.c_str(), 1);
+
+    // open output file if needed
+    if (!(avf_enc_context->oformat->flags & AVFMT_NOFILE)) {
+      return_value = avio_open(&avf_enc_context->pb, output_filename.c_str(),
+                               AVIO_FLAG_WRITE);
+      if (return_value < 0) {
+        std::cout << "ERROR: Failed to open file \"" << output_filename
+                  << "\" for writing" << std::endl;
+        avcodec_close(enc_codec_context);
+        avformat_free_context(avf_enc_context);
+        av_frame_free(&frame);
+        av_packet_free(&pkt);
+        avcodec_free_context(&codec_ctx);
+        avformat_close_input(&avf_dec_context);
         return false;
       }
     }
+
+    // write header
+    return_value = avformat_write_header(avf_enc_context, nullptr);
+    if (return_value < 0) {
+      std::cout << "ERROR: Failed to write header in output video file"
+                << std::endl;
+      avcodec_close(enc_codec_context);
+      avformat_free_context(avf_enc_context);
+      av_frame_free(&frame);
+      av_packet_free(&pkt);
+      avcodec_free_context(&codec_ctx);
+      avformat_close_input(&avf_dec_context);
+      return false;
+    }
+  }  // if (!output_as_pngs)
+
+  // do decoding, then encoding per frame
+
+  // read frames
+  while (av_read_frame(avf_dec_context, pkt) >= 0) {
+    if (pkt->stream_index == video_stream_idx) {
+      auto ret_tuple =
+          HandleDecodingPacket(codec_ctx, pkt, frame, blue_noise, grayscale,
+                               color_changed, output_as_pngs);
+      if (!std::get<0>(ret_tuple)) {
+        avcodec_close(enc_codec_context);
+        avformat_free_context(avf_enc_context);
+        av_frame_free(&frame);
+        av_packet_free(&pkt);
+        avcodec_free_context(&codec_ctx);
+        avformat_close_input(&avf_dec_context);
+        return false;
+      } else if (!output_as_pngs && !std::get<1>(ret_tuple).empty()) {
+        for (auto *yuv_frame : std::get<1>(ret_tuple)) {
+          if (!HandleEncodingFrame(avf_enc_context, enc_codec_context,
+                                   yuv_frame, enc_stream)) {
+            av_frame_free(&yuv_frame);
+            avcodec_close(enc_codec_context);
+            avformat_free_context(avf_enc_context);
+            av_frame_free(&frame);
+            av_packet_free(&pkt);
+            avcodec_free_context(&codec_ctx);
+            avformat_close_input(&avf_dec_context);
+            return false;
+          }
+          av_frame_free(&yuv_frame);
+        }
+      }
+    }
   }
 
   // flush decoders
-  if (!HandleDecodingPacket(codec_ctx, nullptr, frame, blue_noise, grayscale,
-                            overwrite)) {
+  auto ret_tuple =
+      HandleDecodingPacket(codec_ctx, nullptr, frame, blue_noise, grayscale,
+                           color_changed, output_as_pngs);
+  if (!std::get<0>(ret_tuple)) {
+    avcodec_close(enc_codec_context);
+    avformat_free_context(avf_enc_context);
+    av_frame_free(&frame);
+    av_packet_free(&pkt);
+    avcodec_free_context(&codec_ctx);
+    avformat_close_input(&avf_dec_context);
     return false;
+  } else if (!output_as_pngs && !std::get<1>(ret_tuple).empty()) {
+    for (auto *yuv_frame : std::get<1>(ret_tuple)) {
+      if (!HandleEncodingFrame(avf_enc_context, enc_codec_context, yuv_frame,
+                               enc_stream)) {
+        av_frame_free(&yuv_frame);
+        avcodec_close(enc_codec_context);
+        avformat_free_context(avf_enc_context);
+        av_frame_free(&frame);
+        av_packet_free(&pkt);
+        avcodec_free_context(&codec_ctx);
+        avformat_close_input(&avf_dec_context);
+        return false;
+      }
+      av_frame_free(&yuv_frame);
+    }
+  }
+
+  if (!output_as_pngs) {
+    // flush encoder
+    if (!HandleEncodingFrame(avf_enc_context, enc_codec_context, nullptr,
+                             enc_stream)) {
+      avcodec_close(enc_codec_context);
+      avformat_free_context(avf_enc_context);
+      av_frame_free(&frame);
+      av_packet_free(&pkt);
+      avcodec_free_context(&codec_ctx);
+      avformat_close_input(&avf_dec_context);
+      return false;
+    }
+
+    // finish encoding
+    av_write_trailer(avf_enc_context);
   }
 
   // cleanup
+  if (enc_codec_context) {
+    avcodec_close(enc_codec_context);
+  }
+  if (!output_as_pngs && !(avf_enc_context->oformat->flags & AVFMT_NOFILE)) {
+    avio_closep(&avf_enc_context->pb);
+  }
+  if (avf_enc_context) {
+    avformat_free_context(avf_enc_context);
+  }
   av_frame_free(&frame);
   av_packet_free(&pkt);
   avcodec_free_context(&codec_ctx);
-  avformat_close_input(&avf_context);
+  avformat_close_input(&avf_dec_context);
   return true;
 }
 
-bool Video::HandleDecodingPacket(AVCodecContext *codec_ctx, AVPacket *pkt,
-                                 AVFrame *frame, Image *blue_noise,
-                                 bool grayscale, bool overwrite) {
+std::tuple<bool, std::vector<AVFrame *>> Video::HandleDecodingPacket(
+    AVCodecContext *codec_ctx, AVPacket *pkt, AVFrame *frame, Image *blue_noise,
+    bool grayscale, bool color_changed, bool output_as_pngs) {
   int return_value = avcodec_send_packet(codec_ctx, pkt);
   if (return_value < 0) {
     std::cout << "ERROR: Failed to decode packet (" << packet_count_ << ')'
               << std::endl;
-    return false;
+    return {false, {}};
   }
 
   return_value = 0;
+  std::vector<AVFrame *> return_frames{};
+
   while (return_value >= 0) {
     return_value = avcodec_receive_frame(codec_ctx, frame);
     if (return_value == AVERROR(EAGAIN) || return_value == AVERROR_EOF) {
-      return true;
+      return {true, return_frames};
     } else if (return_value < 0) {
       std::cout << "ERROR: Failed to get frame from decoded packet(s)"
                 << std::endl;
-      return false;
+      return {false, {}};
     }
     ++frame_count_;
 
     std::cout << "Frame " << frame_count_ << std::endl;  // TODO DEBUG
 
-    // output buffer info for converting pixel format to RGBA
-    uint8_t *dst[AV_NUM_DATA_POINTERS];
-    dst[0] = (uint8_t *)calloc(4 * frame->width * frame->height + 16,
-                               sizeof(uint8_t));
-    for (unsigned int i = 1; i < AV_NUM_DATA_POINTERS; ++i) {
-      dst[i] = nullptr;
-    }
-    std::array<int, AV_NUM_DATA_POINTERS> dst_strides = {
-        frame->width * (grayscale ? 1 : 4), 0, 0, 0, 0, 0, 0, 0};
-
-    unsigned int line_count = 0;
-    for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
-      if (frame->linesize[i] > 0) {
-        ++line_count;
-      }
-    }
-
-    if (line_count == 0) {
-      std::cout << "ERROR: Invalid number of picture planes" << std::endl;
-      for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
-        free(dst[i]);
-      }
-      return false;
+    AVFrame *temp_frame = av_frame_alloc();
+    temp_frame->format = AVPixelFormat::AV_PIX_FMT_RGBA;
+    temp_frame->width = frame->width;
+    temp_frame->height = frame->height;
+    return_value = av_frame_get_buffer(temp_frame, 0);
+    if (return_value != 0) {
+      std::cout << "ERROR: Failed to init temp_frame to receive RGBA data"
+                << std::endl;
+      av_frame_free(&temp_frame);
+      return {false, {}};
     }
 
     // Convert colors to RGBA
-    if (sws_context_ == nullptr) {
-      sws_context_ = sws_getContext(frame->width, frame->height,
-                                    (AVPixelFormat)frame->format, frame->width,
-                                    frame->height,
-                                    grayscale ? AVPixelFormat::AV_PIX_FMT_GRAY8
-                                              : AVPixelFormat::AV_PIX_FMT_RGBA,
-                                    SWS_BILINEAR, nullptr, nullptr, nullptr);
-      if (sws_context_ == nullptr) {
-        std::cout << "ERROR: Failed to init sws_context_" << std::endl;
-        for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
-          free(dst[i]);
-        }
-        return false;
+    if (sws_dec_context_ == nullptr) {
+      sws_dec_context_ = sws_getContext(
+          frame->width, frame->height, (AVPixelFormat)frame->format,
+          frame->width, frame->height, AVPixelFormat::AV_PIX_FMT_RGBA,
+          SWS_BILINEAR, nullptr, nullptr, nullptr);
+      if (sws_dec_context_ == nullptr) {
+        std::cout << "ERROR: Failed to init sws_dec_context_" << std::endl;
+        av_frame_free(&temp_frame);
+        return {false, {}};
       }
     }
 
-    return_value = sws_scale(sws_context_, frame->data, frame->linesize, 0,
-                             frame->height, dst, dst_strides.data());
+    return_value =
+        sws_scale(sws_dec_context_, frame->data, frame->linesize, 0,
+                  frame->height, temp_frame->data, temp_frame->linesize);
     if (return_value < 0) {
       std::cout << "ERROR: Failed to convert pixel format of frame"
                 << std::endl;
-      for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
-        free(dst[i]);
-      }
-      return false;
+      av_frame_free(&temp_frame);
+      return {false, {}};
     }
 
     // put RGBA data into image
     image_.width_ = frame->width;
     image_.height_ = frame->height;
-    if (grayscale) {
-      image_.is_grayscale_ = true;
-      image_.data_.resize(frame->width * frame->height);
-      for (unsigned int i = 0; (int)i < frame->width * frame->height; ++i) {
-        image_.data_.at(i) = dst[0][i];
-      }
-    } else {
-      image_.is_grayscale_ = false;
-      image_.data_.resize(frame->width * frame->height * 4);
-      for (unsigned int y = 0; (int)y < frame->height; ++y) {
-        for (unsigned int x = 0; (int)x < frame->width; ++x) {
-          image_.data_.at(x * 4 + y * 4 * frame->width) =
-              dst[0][x * 4 + y * 4 * frame->width];
-          image_.data_.at(1 + x * 4 + y * 4 * frame->width) =
-              dst[0][1 + x * 4 + y * 4 * frame->width];
-          image_.data_.at(2 + x * 4 + y * 4 * frame->width) =
-              dst[0][2 + x * 4 + y * 4 * frame->width];
-          image_.data_.at(3 + x * 4 + y * 4 * frame->width) =
-              dst[0][3 + x * 4 + y * 4 * frame->width];
-        }
+    image_.is_grayscale_ = false;
+    image_.data_.resize(frame->width * frame->height * 4);
+    for (unsigned int y = 0; (int)y < frame->height; ++y) {
+      for (unsigned int x = 0; (int)x < frame->width; ++x) {
+        image_.data_.at(x * 4 + y * 4 * frame->width) =
+            temp_frame->data[0][x * 4 + y * 4 * frame->width];
+        image_.data_.at(1 + x * 4 + y * 4 * frame->width) =
+            temp_frame->data[0][1 + x * 4 + y * 4 * frame->width];
+        image_.data_.at(2 + x * 4 + y * 4 * frame->width) =
+            temp_frame->data[0][2 + x * 4 + y * 4 * frame->width];
+        image_.data_.at(3 + x * 4 + y * 4 * frame->width) =
+            temp_frame->data[0][3 + x * 4 + y * 4 * frame->width];
       }
     }
 
+    av_frame_unref(temp_frame);
+
     std::unique_ptr<Image> dithered_image;
     if (grayscale) {
       dithered_image = image_.ToGrayscaleDitheredWithBlueNoise(blue_noise);
     } else {
       dithered_image = image_.ToColorDitheredWithBlueNoise(blue_noise);
     }
+    if (!dithered_image) {
+      std::cout << "ERROR: Failed to dither video frame" << std::endl;
+      return {false, {}};
+    }
 
-    std::string out_name = "output_";
-    if (frame_count_ < 10) {
-      out_name += "000" + std::to_string(frame_count_);
-    } else if (frame_count_ < 100) {
-      out_name += "00" + std::to_string(frame_count_);
-    } else if (frame_count_ < 1000) {
-      out_name += "0" + std::to_string(frame_count_);
-    } else {
+    if (output_as_pngs) {
+      std::string out_name = "output_";
+      for (unsigned int i = 0; i < 9; ++i) {
+        if (frame_count_ < (unsigned int)std::pow(10, i)) {
+          out_name += "0";
+        }
+      }
       out_name += std::to_string(frame_count_);
+      out_name += ".png";
+      if (!dithered_image->SaveAsPNG(out_name, true)) {
+        return {false, {}};
+      }
+      return {true, {}};
     }
-    out_name += ".png";
-    if (!dithered_image->SaveAsPNG(out_name, overwrite)) {
-      return false;
+
+    // convert grayscale/RGBA to YUV444p
+    if (sws_enc_context_ != nullptr && color_changed) {
+      // switched between grayscale/RGBA, context needs to be recreated
+      sws_freeContext(sws_enc_context_);
+      sws_enc_context_ = nullptr;
+    }
+    if (sws_enc_context_ == nullptr) {
+      sws_enc_context_ = sws_getContext(
+          frame->width, frame->height,
+          grayscale ? AVPixelFormat::AV_PIX_FMT_GRAY8
+                    : AVPixelFormat::AV_PIX_FMT_RGBA,
+          frame->width, frame->height, AVPixelFormat::AV_PIX_FMT_YUV444P,
+          SWS_BILINEAR, nullptr, nullptr, nullptr);
+      if (sws_enc_context_ == nullptr) {
+        std::cout << "ERROR: Failed to init sws_enc_context_" << std::endl;
+        return {false, {}};
+      }
+    }
+
+    // rgba data info
+    if (grayscale) {
+      av_frame_free(&temp_frame);
+      temp_frame = av_frame_alloc();
+      temp_frame->format = AVPixelFormat::AV_PIX_FMT_GRAY8;
+      temp_frame->width = frame->width;
+      temp_frame->height = frame->height;
+      return_value = av_frame_get_buffer(temp_frame, 0);
+      if (return_value != 0) {
+        std::cout
+            << "ERROR: Failed to init temp_frame for conversion from grayscale"
+            << std::endl;
+        av_frame_free(&temp_frame);
+        return {false, {}};
+      }
+      std::memcpy(temp_frame->data[0], dithered_image->data_.data(),
+                  frame->width * frame->height);
+    } else {
+      temp_frame->format = AVPixelFormat::AV_PIX_FMT_RGBA;
+      temp_frame->width = frame->width;
+      temp_frame->height = frame->height;
+      return_value = av_frame_get_buffer(temp_frame, 0);
+      if (return_value != 0) {
+        std::cout << "ERROR: Failed to init temp_frame for conversion from RGBA"
+                  << std::endl;
+        av_frame_free(&temp_frame);
+        return {false, {}};
+      }
+      std::memcpy(temp_frame->data[0], dithered_image->data_.data(),
+                  4 * frame->width * frame->height);
+    }
+
+    AVFrame *yuv_frame = av_frame_alloc();
+    if (frame == nullptr) {
+      std::cout
+          << "ERROR: Failed to alloc AVFrame for receiving YUV444p from RGBA"
+          << std::endl;
+      av_frame_free(&temp_frame);
+      return {false, {}};
+    }
+    yuv_frame->format = AVPixelFormat::AV_PIX_FMT_YUV444P;
+    yuv_frame->width = frame->width;
+    yuv_frame->height = frame->height;
+    return_value = av_frame_get_buffer(yuv_frame, 0);
+
+    return_value =
+        sws_scale(sws_enc_context_, temp_frame->data, temp_frame->linesize, 0,
+                  frame->height, yuv_frame->data, yuv_frame->linesize);
+    if (return_value <= 0) {
+      std::cout << "ERROR: Failed to convert RGBA to YUV444p with sws_scale"
+                << std::endl;
+      av_frame_free(&yuv_frame);
+      av_frame_free(&temp_frame);
+      return {false, {}};
     }
-    // TODO encode video with dithered_image
 
     // cleanup
-    for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
-      free(dst[i]);
+    av_frame_free(&temp_frame);
+    yuv_frame->pts = frame_count_ - 1;
+    yuv_frame->pkt_duration = 1;
+    return_frames.push_back(yuv_frame);
+  }
+
+  return {true, return_frames};
+}
+
+bool Video::HandleEncodingFrame(AVFormatContext *enc_format_ctx,
+                                AVCodecContext *enc_codec_ctx,
+                                AVFrame *yuv_frame, AVStream *video_stream) {
+  int return_value;
+
+  return_value = avcodec_send_frame(enc_codec_ctx, yuv_frame);
+  if (return_value < 0) {
+    std::cout << "ERROR: Failed to send frame to encoder" << std::endl;
+    return false;
+  }
+
+  AVPacket pkt;
+  std::memset(&pkt, 0, sizeof(AVPacket));
+  while (return_value >= 0) {
+    std::memset(&pkt, 0, sizeof(AVPacket));
+
+    return_value = avcodec_receive_packet(enc_codec_ctx, &pkt);
+    if (return_value == AVERROR(EAGAIN) || return_value == AVERROR_EOF) {
+      break;
+    } else if (return_value < 0) {
+      std::cout << "ERROR: Failed to encode a frame" << std::endl;
+      return false;
+    }
+
+    // rescale timing fields (timestamps / durations)
+    av_packet_rescale_ts(&pkt, enc_codec_ctx->time_base,
+                         video_stream->time_base);
+    pkt.stream_index = video_stream->index;
+
+    // write frame
+    return_value = av_interleaved_write_frame(enc_format_ctx, &pkt);
+    av_packet_unref(&pkt);
+    if (return_value < 0) {
+      std::cout << "ERROR: Failed to write encoding packet" << std::endl;
+      return false;
     }
   }
 
diff --git a/src/video.h b/src/video.h
index 3e6c2e7..ebfc7d0 100644
--- a/src/video.h
+++ b/src/video.h
@@ -1,8 +1,11 @@
 #ifndef IGPUP_DITHERING_PROJECT_VIDEO_H_
 #define IGPUP_DITHERING_PROJECT_VIDEO_H_
 
+#include <tuple>
+
 extern "C" {
 #include <libavcodec/avcodec.h>
+#include <libavformat/avformat.h>
 #include <libswscale/swscale.h>
 }
 
@@ -13,6 +16,8 @@ constexpr unsigned int kReadBufPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE;
 constexpr unsigned int kReadBufSizeWithPadding =
     kReadBufSize + kReadBufPaddingSize;
 
+constexpr unsigned int kOutputBitrate = 40000000;
+
 /*!
  * \brief Helper class that uses Image and OpenCLHandle to dither video frames.
  *
@@ -35,30 +40,39 @@ class Video {
 
   /// Same as DitherVideo(const std::string&, Image*, bool, bool)
   bool DitherVideo(const char *output_filename, Image *blue_noise,
-                   bool grayscale = false, bool overwrite = false);
+                   bool grayscale = false, bool overwrite = false,
+                   bool output_as_pngs = false);
 
   /*!
    * \brief Dithers the frames in the input video.
    *
-   * Currently, the program doesn't create the output video, but instead outputs
-   * each frame as an individual image in the current directory. If things go
-   * well, the expected behavior will be implemented soon.
+   * If output_as_pngs is true, then the output will be individual PNGs of each
+   * frame instead of a video file. This may be desirable because the output
+   * video struggles to maintain video quality.
    *
    * \return True on success.
    */
   bool DitherVideo(const std::string &output_filename, Image *blue_noise,
-                   bool grayscale = false, bool overwrite = false);
+                   bool grayscale = false, bool overwrite = false,
+                   bool output_as_pngs = false);
 
  private:
   Image image_;
   std::string input_filename_;
-  SwsContext *sws_context_;
+  SwsContext *sws_dec_context_;
+  SwsContext *sws_enc_context_;
   unsigned int frame_count_;
   unsigned int packet_count_;
+  bool was_grayscale_;
 
-  bool HandleDecodingPacket(AVCodecContext *codec_ctx, AVPacket *pkt,
-                            AVFrame *frame, Image *blue_noise, bool grayscale,
-                            bool overwrite);
+  std::tuple<bool, std::vector<AVFrame *>> HandleDecodingPacket(
+      AVCodecContext *codec_ctx, AVPacket *pkt, AVFrame *frame,
+      Image *blue_noise, bool grayscale, bool color_changed,
+      bool output_as_pngs);
+
+  bool HandleEncodingFrame(AVFormatContext *enc_format_ctx,
+                           AVCodecContext *enc_codec_ctx, AVFrame *yuv_frame,
+                           AVStream *video_stream);
 };
 
 #endif