Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import os
- import sys
- import random
- from PIL import Image
- from sklearn.neural_network import MLPClassifier
- import pickle
# Geometry of the images this classifier expects: SIZE x SIZE RGB,
# with BORDER pixels trimmed from each end of the peak profile.
SIZE = 512
SIZE_TEXT = f"{SIZE}x{SIZE}"
PIXELS = SIZE * SIZE
BORDER = 24
def count_and_sort(df, columns):
    """Group *df* by *columns* and return the group sizes, largest first.

    The result has the grouping columns, a "count" column, and the extra
    "index" column produced by the trailing reset_index (kept for
    compatibility with the original behaviour).
    """
    counts = pd.DataFrame({"count": df.groupby(columns).size()}).reset_index()
    ordered = counts.sort_values("count", ascending=False)
    return ordered.reset_index()
def image_mask(pixels, size=None):
    """Per-pixel absolute deviation from the image's mean colour, scaled to [0, 1].

    Parameters
    ----------
    pixels : ndarray
        RGB pixel data; must have shape (size, size, 3).
    size : int, optional
        Expected width/height.  Defaults to the module-level SIZE, so
        existing callers are unaffected.

    Returns
    -------
    ndarray of the same shape as *pixels*, or None when the shape is wrong.
    """
    if size is None:
        size = SIZE
    # Tuple comparison also rejects non-3-D input cleanly, where the old
    # shape[2] index raised IndexError on 2-D (grayscale) arrays.
    if pixels.shape != (size, size, 3):
        return None
    mean_rgb = np.sum(pixels, axis=(0, 1)) / (size * size)
    data = np.abs(pixels - mean_rgb)
    peak = np.max(data)
    if peak == 0:
        # Uniform image: return the all-zero mask instead of dividing by 0.
        return data
    return data / peak
def calculate_peaks(data, size=None, border=None):
    """Collapse a mask to a 1-D, normalized edge-strength profile.

    Channels are weighted (2, 5, 3), summed, projected onto rows+columns,
    differentiated, and trimmed by *border* on each side.

    Parameters
    ----------
    data : ndarray of shape (size, size, 3)
        Mask as produced by image_mask.
    size, border : int, optional
        Default to the module-level SIZE and BORDER, so existing callers
        are unaffected.

    Returns
    -------
    1-D ndarray of length size - 2*border - 1, scaled to [-1, 1].
    """
    if size is None:
        size = SIZE
    if border is None:
        border = BORDER
    weighted = np.sum(data * (2, 5, 3), axis=2)
    profile = np.sum(weighted, axis=0) + np.sum(weighted, axis=1)
    deltas = np.diff(profile)[border:size - border]
    scale = np.max(np.abs(deltas))
    if scale == 0:
        # Flat profile: avoid 0/0 producing NaNs.
        return deltas
    return deltas / scale
def create_dataset(path):
    """Load every JPEG/PNG image under *path* into [(peaks, filepath), ...].

    Images that are not SIZE x SIZE RGB are skipped with a message, as are
    files that fail to open or parse.  Progress is printed every 100 images.
    """
    print("loading images from " + path)
    rows = []
    if not path.endswith("/"):
        path = path + "/"
    for f in os.listdir(path):
        try:
            # splitext takes the LAST extension; the old slice at the first
            # "." wrongly rejected names like "cat.v2.png".
            ext = os.path.splitext(f)[1].lower()
            if ext in (".jpeg", ".jpg", ".png"):
                image = Image.open(path + f)
                data = np.array(image)
                image.close()
                mask = image_mask(data)
                if mask is None:
                    print("skipped " + f + ", image not " + SIZE_TEXT)
                else:
                    peaks = calculate_peaks(mask)
                    rows.append((peaks, path + f))
                    if len(rows) % 100 == 0:
                        print(str(len(rows)) + " images processed")
        except Exception:
            # Narrowed from a bare except: so Ctrl-C / SystemExit still work.
            print("skipped: " + f)
    if len(rows) % 100 != 0:
        print(str(len(rows)) + " images processed")
    return rows
def train_model(realcats, gancats, training=0.5):
    """Fit an MLP to separate real (label 0) from GAN (label 1) images.

    *realcats* / *gancats* are lists of (peaks, filepath) tuples as produced
    by create_dataset.  *training* is the fraction of the shuffled pool used
    for fitting; the remainder is held out.

    Returns (fitted model, DataFrame with "actual"/"predicted"/"image"
    columns for the held-out samples).
    """
    samples = [(peaks, 0, name) for peaks, name in realcats]
    samples += [(peaks, 1, name) for peaks, name in gancats]
    random.shuffle(samples)

    split = int(training * len(samples))
    train_set = samples[:split]
    test_set = samples[split:]

    classifier = MLPClassifier(random_state=random.randint(0, 2147483647),
                               max_iter=1000, hidden_layer_sizes=[48, 12, 3])
    classifier = classifier.fit(np.stack([s[0] for s in train_set], axis=0),
                                [s[1] for s in train_set])

    held_out = np.stack([s[0] for s in test_set], axis=0)
    report = pd.DataFrame({"actual": [s[1] for s in test_set],
                           "predicted": classifier.predict(held_out),
                           "image": [s[2] for s in test_set]})
    return (classifier, report)
def train_loop(realcats, gancats, iterations=100, training=0.7):
    """Train *iterations* models and return the (model, DataFrame) pair with
    the best held-out accuracy.

    Bug fix: *best* used to be assigned only when a model beat a starting
    accuracy of 0, so a run where every model scored 0.0 would hit a
    NameError at return.  It is now seeded from the first iteration.
    """
    accuracy = 0.0
    best = None
    for i in range(iterations):
        result = train_model(realcats, gancats, training=training)
        df = result[1]
        score = len(df[df["actual"] == df["predicted"]]) / len(df.index)
        if best is None or score > accuracy:
            accuracy = score
            best = result
        if (i + 1) % 20 == 0:
            print(str(i + 1) + "/" + str(iterations) + ", best accuracy " + str(accuracy))
    print("accuracy: " + str(accuracy))
    return best
def classify_images(model, path):
    """Classify every eligible image under *path* with *model*.

    Returns a DataFrame with "image", "predicted" (0/1), and
    "predictedCategory" ("real"/"GAN") columns.
    """
    dataset = create_dataset(path)
    features = np.stack([row[0] for row in dataset], axis=0)
    names = [row[1] for row in dataset]
    result = pd.DataFrame({"image": names,
                           "predicted": model.predict(features)})
    result["predictedCategory"] = result["predicted"].apply(
        lambda label: "GAN" if label == 1 else "real")
    return result[["image", "predicted", "predictedCategory"]]
def main(argv):
    """Command-line entry point.

    Usage:
        train <real-dir> <gan-dir> <model-out>   fit and pickle the best model
        test  <image-dir> <model-in> <csv-out>   classify a directory of images

    Previously this ran at import time and raised IndexError when arguments
    were missing; it is now guarded and prints a usage line instead.
    """
    if len(argv) < 2:
        print("usage: train <real> <gan> <model> | test <images> <model> <csv>")
        return
    mode = argv[1].lower().strip()
    if mode == "train":
        realcats = create_dataset(argv[2])
        gancats = create_dataset(argv[3])
        result = train_loop(realcats, gancats)
        with open(argv[4], "wb") as fd:
            pickle.dump(result[0], fd)
    elif mode == "test":
        # SECURITY: pickle.load executes arbitrary code from the file --
        # only load model files you produced yourself.
        with open(argv[3], "rb") as fd:
            model = pickle.load(fd)
        df = classify_images(model, argv[2])
        df.to_csv(argv[4], index=False)


if __name__ == "__main__":
    main(sys.argv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement