- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<a id='1'></a>\n",
- "# 1. Import packages"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Using TensorFlow backend.\n"
- ]
- }
- ],
- "source": [
- "from keras.models import Sequential, Model\n",
- "from keras.layers import *\n",
- "from keras.layers.advanced_activations import LeakyReLU\n",
- "from keras.activations import relu\n",
- "from keras.initializers import RandomNormal\n",
- "from keras.applications import *\n",
- "import keras.backend as K\n",
- "from tensorflow.contrib.distributions import Beta\n",
- "import tensorflow as tf\n",
- "from keras.optimizers import Adam"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "from image_augmentation import random_transform\n",
- "from image_augmentation import random_warp\n",
- "from utils import get_image_paths, load_images, stack_images\n",
- "from pixel_shuffler import PixelShuffler"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import time\n",
- "import numpy as np\n",
- "from PIL import Image\n",
- "import cv2\n",
- "import glob\n",
- "from random import randint, shuffle\n",
- "from IPython.display import clear_output\n",
- "from IPython.display import display\n",
- "import matplotlib.pyplot as plt\n",
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<a id='2'></a>\n",
- "# 2. Install requirements\n",
- "\n",
- "## ========== CAUTION ========== \n",
- "\n",
- "If you are running this jupyter on local machine. Please read [this blog](http://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/) before running the following cells which pip install packages."
- ]
- },
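- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The blog post above recommends installing into the interpreter that runs the current kernel rather than whatever `pip` happens to be on PATH. A minimal sketch of that pattern (commented out like the install cells below):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Install into this kernel's interpreter, per the linked post.\n",
- "#import sys\n",
- "#!{sys.executable} -m pip install face_recognition moviepy"
- ]
- },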
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# https://github.com/ageitgey/face_recognition\n",
- "#!pip install face_recognition"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#!pip install moviepy"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<a id='4'></a>\n",
- "# 4. Config\n",
- "\n",
- "mixup paper: https://arxiv.org/abs/1710.09412\n",
- "\n",
- "Default training data directories: `./faceA/` and `./faceB/`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "K.set_learning_phase(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "channel_axis=-1\n",
- "channel_first = False"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "IMAGE_SHAPE = (64, 64, 3)\n",
- "nc_in = 3 # number of input channels of generators\n",
- "nc_D_inp = 6 # number of input channels of discriminators\n",
- "\n",
- "use_perceptual_loss = False\n",
- "use_lsgan = True\n",
- "use_instancenorm = False\n",
- "use_mixup = True\n",
- "mixup_alpha = 0.2 # 0.2\n",
- "\n",
- "batchSize = 32\n",
- "lrD = 1e-4 # Discriminator learning rate\n",
- "lrG = 1e-4 # Generator learning rate\n",
- "\n",
- "# Path of training images\n",
- "img_dirA = './faceA/*.*'\n",
- "img_dirB = './faceB/*.*'"
- ]
- },
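- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "`use_mixup` refers to the mixup paper linked above: training pairs are blended with a weight drawn from Beta(`mixup_alpha`, `mixup_alpha`). The actual loss lives in `model_GAN_v2`; the cell below is only a minimal numpy sketch of the idea, and `mixup_pair` is a hypothetical helper, not part of this repo."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Minimal mixup sketch (hypothetical helper, for illustration only):\n",
- "# draw lam ~ Beta(alpha, alpha) and blend a real/fake pair and their labels.\n",
- "def mixup_pair(x_real, x_fake, alpha=mixup_alpha):\n",
- " lam = np.random.beta(alpha, alpha) # mostly near 0 or 1 for small alpha\n",
- " mixed_x = lam * x_real + (1 - lam) * x_fake\n",
- " mixed_y = lam # blended label: 1 = real, 0 = fake\n",
- " return mixed_x, mixed_y"
- ]
- },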
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<a id='5'></a>\n",
- "# 5. Define models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "from model_GAN_v2 import *"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": true,
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "encoder = Encoder()\n",
- "decoder_A = Decoder_ps()\n",
- "decoder_B = Decoder_ps()\n",
- "\n",
- "x = Input(shape=IMAGE_SHAPE)\n",
- "\n",
- "netGA = Model(x, decoder_A(encoder(x)))\n",
- "netGB = Model(x, decoder_B(encoder(x)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "netDA = Discriminator(nc_D_inp)\n",
- "netDB = Discriminator(nc_D_inp)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<a id='6'></a>\n",
- "# 6. Load Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "model loaded.\n"
- ]
- }
- ],
- "source": [
- "try:\n",
- " encoder.load_weights(\"models/encoder.h5\")\n",
- " decoder_A.load_weights(\"models/decoder_A.h5\")\n",
- " decoder_B.load_weights(\"models/decoder_B.h5\")\n",
- " #netDA.load_weights(\"models/netDA.h5\") \n",
- " #netDB.load_weights(\"models/netDB.h5\") \n",
- " print (\"model loaded.\")\n",
- "except:\n",
- " print (\"Weights file not found.\")\n",
- " pass"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<a id='7'></a>\n",
- "# 7. Define Inputs/Outputs Variables\n",
- "\n",
- " distorted_A: A (batch_size, 64, 64, 3) tensor, input of generator_A (netGA).\n",
- " distorted_B: A (batch_size, 64, 64, 3) tensor, input of generator_B (netGB).\n",
- " fake_A: (batch_size, 64, 64, 3) tensor, output of generator_A (netGA).\n",
- " fake_B: (batch_size, 64, 64, 3) tensor, output of generator_B (netGB).\n",
- " mask_A: (batch_size, 64, 64, 1) tensor, mask output of generator_A (netGA).\n",
- " mask_B: (batch_size, 64, 64, 1) tensor, mask output of generator_B (netGB).\n",
- " path_A: A function that takes distorted_A as input and outputs fake_A.\n",
- " path_B: A function that takes distorted_B as input and outputs fake_B.\n",
- " path_mask_A: A function that takes distorted_A as input and outputs mask_A.\n",
- " path_mask_B: A function that takes distorted_B as input and outputs mask_B.\n",
- " path_abgr_A: A function that takes distorted_A as input and outputs concat([mask_A, fake_A]).\n",
- " path_abgr_B: A function that takes distorted_B as input and outputs concat([mask_B, fake_B]).\n",
- " real_A: A (batch_size, 64, 64, 3) tensor, target images for generator_A given input distorted_A.\n",
- " real_B: A (batch_size, 64, 64, 3) tensor, target images for generator_B given input distorted_B."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "def cycle_variables(netG):\n",
- " distorted_input = netG.inputs[0]\n",
- " fake_output = netG.outputs[0]\n",
- " alpha = Lambda(lambda x: x[:,:,:, :1])(fake_output)\n",
- " rgb = Lambda(lambda x: x[:,:,:, 1:])(fake_output)\n",
- " \n",
- " masked_fake_output = alpha * rgb + (1-alpha) * distorted_input \n",
- "\n",
- " fn_generate = K.function([distorted_input], [masked_fake_output])\n",
- " fn_mask = K.function([distorted_input], [concatenate([alpha, alpha, alpha])])\n",
- " fn_abgr = K.function([distorted_input], [concatenate([alpha, rgb])])\n",
- " return distorted_input, fake_output, alpha, fn_generate, fn_mask, fn_abgr"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "distorted_A, fake_A, mask_A, path_A, path_mask_A, path_abgr_A = cycle_variables(netGA)\n",
- "distorted_B, fake_B, mask_B, path_B, path_mask_B, path_abgr_B = cycle_variables(netGB)\n",
- "real_A = Input(shape=IMAGE_SHAPE)\n",
- "real_B = Input(shape=IMAGE_SHAPE)"
- ]
- },
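- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The `path_*` functions are plain `K.function`s: each takes a list containing one input batch and returns a list containing one output batch. The cell below is a small smoke test on an all-zero batch (a placeholder, not real data) just to confirm the output shapes."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Shape check (illustration only): run a zero batch through each path of netGA.\n",
- "dummy = np.zeros((1,) + IMAGE_SHAPE, dtype=np.float32)\n",
- "out_rgb = path_A([dummy])[0] # masked fake_A, (1, 64, 64, 3)\n",
- "out_mask = path_mask_A([dummy])[0] # alpha mask tiled to 3 channels, (1, 64, 64, 3)\n",
- "out_abgr = path_abgr_A([dummy])[0] # concat([alpha, rgb]), (1, 64, 64, 4)\n",
- "print(out_rgb.shape, out_mask.shape, out_abgr.shape)"
- ]
- },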
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<a id='12'></a>\n",
- "# 12. Make video clips\n",
- "\n",
- "Given a video as input, the following cells will detect face for each frame using dlib's cnn model. And use trained GAN model to transform detected face into target face. Then output a video with swapped faces."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "collapsed": true,
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "# Download ffmpeg if need, which is required by moviepy.\n",
- "\n",
- "#import imageio\n",
- "#imageio.plugins.ffmpeg.download()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import face_recognition\n",
- "from moviepy.editor import VideoFileClip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "whom2whom = \"BtoA\" # default trainsforming faceB to faceA\n",
- "\n",
- "if whom2whom is \"AtoB\":\n",
- " path_func = path_abgr_B\n",
- "elif whom2whom is \"BtoA\":\n",
- " path_func = path_abgr_A\n",
- "else:\n",
- " print (\"whom2whom should be either AtoB or BtoA\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<a id='13'></a>\n",
- "# 13. Make video clips w/o face alignment\n",
- "\n",
- "### Default transform: face B to face A"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "use_smoothed_mask = True\n",
- "use_smoothed_bbox = True\n",
- "\n",
- "def is_higher_than_480p(x):\n",
- " return (x.shape[1] >= 858 and x.shape[0] >= 480)\n",
- "\n",
- "def is_higher_than_720p(x):\n",
- " return (x.shape[1] >= 1280 and x.shape[0] >= 720)\n",
- "\n",
- "def is_higher_than_1080p(x):\n",
- " return (x.shape[1] >= 1920 and x.shape[0] >= 1080)\n",
- "\n",
- "def calibrate_coord(faces, video_scaling_factor):\n",
- " for i, (x0, y1, x1, y0) in enumerate(faces):\n",
- " faces[i] = (x0*video_scaling_factor, y1*video_scaling_factor, \n",
- " x1*video_scaling_factor, y0*video_scaling_factor)\n",
- " return faces\n",
- "\n",
- "def get_faces_bbox(image, model=\"hog\"): \n",
- " if is_higher_than_1080p(image):\n",
- " video_scaling_factor = 4 + video_scaling_offset\n",
- " resized_image = cv2.resize(image, \n",
- " (image.shape[1]//video_scaling_factor, image.shape[0]//video_scaling_factor))\n",
- " faces = face_recognition.face_locations(resized_image, model=model)\n",
- " faces = calibrate_coord(faces, video_scaling_factor)\n",
- " elif is_higher_than_720p(image):\n",
- " video_scaling_factor = 3 + video_scaling_offset\n",
- " resized_image = cv2.resize(image, \n",
- " (image.shape[1]//video_scaling_factor, image.shape[0]//video_scaling_factor))\n",
- " faces = face_recognition.face_locations(resized_image, model=model)\n",
- " faces = calibrate_coord(faces, video_scaling_factor) \n",
- " elif is_higher_than_480p(image):\n",
- " video_scaling_factor = 2 + video_scaling_offset\n",
- " resized_image = cv2.resize(image, \n",
- " (image.shape[1]//video_scaling_factor, image.shape[0]//video_scaling_factor))\n",
- " faces = face_recognition.face_locations(resized_image, model=model)\n",
- " faces = calibrate_coord(faces, video_scaling_factor)\n",
- " else:\n",
- " faces = face_recognition.face_locations(image, model=model)\n",
- " return faces\n",
- "\n",
- "def get_smoothed_coord(x0, x1, y0, y1):\n",
- " global prev_x0, prev_x1, prev_y0, prev_y1\n",
- " x0 = int(0.65*prev_x0 + 0.35*x0)\n",
- " x1 = int(0.65*prev_x1 + 0.35*x1)\n",
- " y1 = int(0.65*prev_y1 + 0.35*y1)\n",
- " y0 = int(0.65*prev_y0 + 0.35*y0)\n",
- " return x0, x1, y0, y1 \n",
- " \n",
- "def set_global_coord(x0, x1, y0, y1):\n",
- " global prev_x0, prev_x1, prev_y0, prev_y1\n",
- " prev_x0 = x0\n",
- " prev_x1 = x1\n",
- " prev_y1 = y1\n",
- " prev_y0 = y0\n",
- " \n",
- "def generate_face(ae_input, path_abgr, roi_size):\n",
- " result = np.squeeze(np.array([path_abgr([[ae_input]])]))\n",
- " result_a = result[:,:,0] * 255\n",
- " result_bgr = np.clip( (result[:,:,1:] + 1) * 255 / 2, 0, 255 )\n",
- " result_a = cv2.GaussianBlur(result_a ,(7,7),6)\n",
- " result_a = np.expand_dims(result_a, axis=2)\n",
- " result = (result_a/255 * result_bgr + (1 - result_a/255) * ((ae_input + 1) * 255 / 2)).astype('uint8')\n",
- " result = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)\n",
- " result = cv2.resize(result, (roi_size[1],roi_size[0]))\n",
- " result_a = np.expand_dims(cv2.resize(result_a, (roi_size[1],roi_size[0])), axis=2)\n",
- " return result, result_a\n",
- "\n",
- "def get_init_mask_map(image):\n",
- " return np.zeros_like(image)\n",
- "\n",
- "def get_init_comb_img(input_img):\n",
- " comb_img = np.zeros([input_img.shape[0], input_img.shape[1]*2,input_img.shape[2]])\n",
- " comb_img[:, :input_img.shape[1], :] = input_img\n",
- " comb_img[:, input_img.shape[1]:, :] = input_img\n",
- " return comb_img \n",
- "\n",
- "def get_init_triple_img(input_img, no_face=False):\n",
- " if no_face:\n",
- " triple_img = np.zeros([input_img.shape[0], input_img.shape[1]*3,input_img.shape[2]])\n",
- " triple_img[:, :input_img.shape[1], :] = input_img\n",
- " triple_img[:, input_img.shape[1]:input_img.shape[1]*2, :] = input_img \n",
- " triple_img[:, input_img.shape[1]*2:, :] = (input_img * .15).astype('uint8') \n",
- " return triple_img\n",
- " else:\n",
- " triple_img = np.zeros([input_img.shape[0], input_img.shape[1]*3,input_img.shape[2]])\n",
- " return triple_img\n",
- "\n",
- "def get_mask(roi_image, h, w):\n",
- " mask = np.zeros_like(roi_image)\n",
- " mask[h//15:-h//15,w//15:-w//15,:] = 255\n",
- " mask = cv2.GaussianBlur(mask,(15,15),10)\n",
- " return mask\n",
- "\n",
- "def process_video(input_img): \n",
- " # modify this line to reduce input size\n",
- " #input_img = input_img[:, input_img.shape[1]//3:2*input_img.shape[1]//3,:] \n",
- " image = input_img\n",
- " faces = get_faces_bbox(image, model=\"hog\")\n",
- " \n",
- " if len(faces) == 0:\n",
- " comb_img = get_init_comb_img(input_img)\n",
- " triple_img = get_init_triple_img(input_img, no_face=True)\n",
- " \n",
- " mask_map = get_init_mask_map(image)\n",
- " comb_img = get_init_comb_img(input_img)\n",
- " global prev_x0, prev_x1, prev_y0, prev_y1\n",
- " global frames \n",
- " for (x0, y1, x1, y0) in faces: \n",
- " # smoothing bounding box\n",
- " if use_smoothed_bbox:\n",
- " if frames != 0:\n",
- " x0, x1, y0, y1 = get_smoothed_coord(x0, x1, y0, y1)\n",
- " set_global_coord(x0, x1, y0, y1)\n",
- " else:\n",
- " set_global_coord(x0, x1, y0, y1)\n",
- " frames += 1\n",
- " h = x1 - x0\n",
- " w = y1 - y0\n",
- " \n",
- " cv2_img = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)\n",
- " roi_image = cv2_img[x0+h//15:x1-h//15,y0+w//15:y1-w//15,:]\n",
- " roi_size = roi_image.shape \n",
- " \n",
- " ae_input = cv2.resize(roi_image, (64,64))/255. * 2 - 1 \n",
- " result, result_a = generate_face(ae_input, path_abgr_A, roi_size)\n",
- " mask_map[x0+h//15:x1-h//15, y0+w//15:y1-w//15,:] = result_a\n",
- " mask_map = np.clip(mask_map + .15 * input_img, 0, 255 ) \n",
- " \n",
- " if use_smoothed_mask:\n",
- " mask = get_mask(roi_image, h, w)\n",
- " roi_rgb = cv2.cvtColor(roi_image, cv2.COLOR_BGR2RGB)\n",
- " smoothed_result = mask/255 * result + (1-mask/255) * roi_rgb\n",
- " comb_img[x0+h//15:x1-h//15, input_img.shape[1]+y0+w//15:input_img.shape[1]+y1-w//15,:] = smoothed_result\n",
- " else:\n",
- " comb_img[x0+h//15:x1-h//15, input_img.shape[1]+y0+w//15:input_img.shape[1]+y1-w//15,:] = result\n",
- " \n",
- " triple_img = get_init_triple_img(input_img)\n",
- " triple_img[:, :input_img.shape[1]*2, :] = comb_img\n",
- " triple_img[:, input_img.shape[1]*2:, :] = mask_map\n",
- " \n",
- " # ========== Change rthe following line to ==========\n",
- " return comb_img[:, input_img.shape[1]:, :] # return only result image\n",
- " # return comb_img # return input and result image combined as one\n",
- " #return triple_img #return input,result and mask heatmap image combined as one"
- ]
- },
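- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Before rendering a full clip, it can save time to try `process_video` on a single frame. The sketch below reuses the `INPUT_VIDEO.mp4` placeholder filename from the next cell and initializes the same globals that cell sets."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Single-frame check of the face-swap pipeline (illustration only).\n",
- "prev_x0 = prev_x1 = prev_y0 = prev_y1 = 0\n",
- "frames = 0\n",
- "video_scaling_offset = 0\n",
- "test_frame = VideoFileClip(\"INPUT_VIDEO.mp4\").get_frame(1.0) # RGB frame at t = 1 s\n",
- "plt.imshow(process_video(test_frame).astype('uint8'))\n",
- "plt.axis('off')"
- ]
- },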
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[MoviePy] >>>> Building video OUTPUT_VIDEO.mp4\n",
- "[MoviePy] Writing video OUTPUT_VIDEO.mp4\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████████████████████████████████████████████████████████| 341/341 [00:46<00:00, 7.12it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[MoviePy] Done.\n",
- "[MoviePy] >>>> Video ready: OUTPUT_VIDEO.mp4 \n",
- "\n",
- "Wall time: 46.9 s\n"
- ]
- }
- ],
- "source": [
- "# Variables for smoothing bounding box\n",
- "global prev_x0, prev_x1, prev_y0, prev_y1\n",
- "global frames\n",
- "prev_x0 = prev_x1 = prev_y0 = prev_y1 = 0\n",
- "frames = 0\n",
- "video_scaling_offset = 0 # Increase by 1 if OOM happens.\n",
- "\n",
- "output = 'OUTPUT_VIDEO.mp4'\n",
- "clip1 = VideoFileClip(\"INPUT_VIDEO.mp4\")\n",
- "clip = clip1.fl_image(process_video)#.subclip(11, 13) #NOTE: this function expects color images!!\n",
- "%time clip.write_videofile(output, audio=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### gc.collect() sometimes solves memory error"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "8409"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import gc\n",
- "gc.collect()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }