Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- import json
- import argparse
- import pandas as pd
- from pathlib import Path
- sys.path.append(str(Path(__file__).parent.parent.parent))
- from constants import *
- from split import split
- from get_negatives import get_negatives
- from get_positives import get_positives
- from turbine_cluster import add_cluster_groups
def get_args():
    """Parse and return the command-line arguments for the data pull."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--api_key", type=str, default=None,
                        help='Google Static Maps API key.')
    parser.add_argument("--api_key_list", type=str, default=None,
                        help='Path to a list of Google Static Maps API keys.')
    parser.add_argument("--zoom", type=int, default=18,
                        help='Zoom level for Google Static Maps.')
    parser.add_argument("--data_name", type=str, required=True,
                        help='Name of data directory.')
    parser.add_argument("--save_greyscale", action="store_true",
                        help='Save greyscale images.')
    parser.add_argument("--data_dir_exist_ok", action="store_true",
                        help='Overwrite existing data directory.')
    parser.add_argument("--num_negatives", type=int, default=None,
                        help='Limit number of negatives to pull. Default all in USGS.')
    parser.add_argument("--num_positives", type=int, default=None,
                        help='Limit number of positives to pull. Default all in USGS.')
    parser.add_argument("--pull_only", type=str, default="both",
                        choices=["both", "positives", "negatives"],
                        help='Only pull positives or negatives. Defaults to both. '
                             'Overrides num_positives and num_negatives.')
    parser.add_argument("--train_ratio", type=float, default=0.8,
                        help='Proportion of data used for training. Valid and test will '
                             'be split evenly from the remaining data.')
    return parser.parse_args()
def load_usgs_data(zoom):
    """Load the USGS turbine csv, attach cluster/split group IDs for this
    zoom level (computing them on first use), drop low-confidence rows,
    and shuffle.

    Params:
        zoom (int): Google Static Maps zoom level; the sampling clusters
            are zoom-dependent, so each zoom gets its own cached csv.

    Returns:
        df (pandas.DataFrame): shuffled USGS data with group IDs.
    """
    # If this zoom level has been used before, retrieve the csv which
    # already includes turbine split groups as determined by that zoom.
    # Otherwise, first generate split groupings for this zoom.
    csv_path = USGS_DATA_DIR / f'usgs_with_groupIDs_zoom{zoom}.csv'
    if csv_path.is_file():
        df = pd.read_csv(csv_path)
    else:
        # BUG FIX: the original used a backslash continuation *inside* the
        # string literal, which embedded a long run of spaces into the
        # printed message; implicit concatenation avoids that.
        print('This zoom level is being used for the first time. '
              'Need to compute sampling clusters accordingly.')
        df = pd.read_csv(USGS_DATA)
        df = add_cluster_groups(df, zoom)
    # Keep only rows with full confidence (code 3) in both turbine
    # location (t_conf_loc) and attributes (t_conf_atr); one boolean
    # mask replaces the original two drop/reset passes.
    df = df[(df['t_conf_loc'] == 3) & (df['t_conf_atr'] == 3)]
    df.reset_index(drop=True, inplace=True)
    # Shuffle with a fixed seed so downstream splits are reproducible.
    df = df.sample(frac=1, random_state=1).reset_index(drop=True)
    return df
- if __name__ == "__main__":
- args = get_args()
- # Prepare preprocessed directory.
- data_dir = PREPROCESSED_DATA_DIR / args.data_name
- data_dir.mkdir(exist_ok=args.data_dir_exist_ok)
- # Prepare directory for images.
- image_dir = data_dir / "images"
- image_dir.mkdir(exist_ok=args.data_dir_exist_ok)
- # Write args to data_dir.
- with open(data_dir / "args.json", 'w') as f:
- json.dump(vars(args), f)
- usgs_df = load_usgs_data(args.zoom)
- # Read in API list if one is specified, else just the API Key
- if args.api_key_list is not None:
- key_list = pd.read_csv(args.api_key_list, header = None)
- args.api_key_list = key_list[key_list.columns[0]].tolist()
- else:
- assert(args.api_key is not None, "Must either input either an API Key or an API Key list.")
- args.api_key_list = [args.api_key]
- print(args.api_key_list)
- print(len(args.api_key_list))
- raise
- if args.pull_only != "negatives":
- # Download positive images
- num_positives = len(usgs_df)\
- if args.num_positives is None\
- else args.num_positives
- positive_image_df = get_positives(num_positives, usgs_df, image_dir,
- args.zoom, args.save_greyscale, args.api_key_list)
- if args.pull_only != "positives":
- # Download all negative images
- num_negatives = len(usgs_df)\
- if args.num_negatives is None\
- else args.num_negatives
- negative_image_df = get_negatives(num_negatives,
- usgs_df, image_dir,
- args.zoom, args.save_greyscale, args.api_key_list)
- if args.pull_only == "both":
- negative_image_df = negative_image_df.sample(n=num_positives)
- if args.pull_only == "both":
- image_df = pd.concat([positive_image_df, negative_image_df])
- # Split into train, valid, test.
- train_df, valid_df, test_df = split(image_df, args.train_ratio)
- # Save csvs to file.
- train_df.to_csv(data_dir / "train.csv", index=False)
- valid_df.to_csv(data_dir / "valid.csv", index=False)
- test_df.to_csv(data_dir / "test.csv", index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement