Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- import json
- import argparse
- import pandas as pd
- from pathlib import Path
- sys.path.append(str(Path(__file__).parent.parent.parent))
- from constants import *
- from split import split
- from get_negatives import get_negatives
- from get_positives import get_positives
- from turbine_cluster import add_cluster_groups
def get_args():
    """Parse and return the command-line arguments for the data pull."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--api_key", type=str, default=None,
                        help='Google Static Maps API key.')
    parser.add_argument("--api_key_list", type=str, default=None,
                        help='Path to a list of Google Static Maps API keys.')
    parser.add_argument("--zoom", type=int, default=18,
                        help='Zoom level for Google Static Maps.')
    parser.add_argument("--data_name", type=str, required=True,
                        help='Name of data directory.')
    parser.add_argument("--save_greyscale", action="store_true",
                        help='Save greyscale images.')
    parser.add_argument("--data_dir_exist_ok", action="store_true",
                        help='Overwrite existing data directory.')
    parser.add_argument("--num_negatives", type=int, default=None,
                        help='Limit number of negatives to pull. Default all in USGS.')
    parser.add_argument("--num_positives", type=int, default=None,
                        help='Limit number of positives to pull. Default all in USGS.')
    parser.add_argument("--pull_only", type=str, default="both",
                        choices=["both", "positives", "negatives"],
                        help='Only pull positives or negatives. Defaults to both. '
                             'Overrides num_positives and num_negatives.')
    parser.add_argument("--train_ratio", type=float, default=0.8,
                        help='Proportion of data used for training. Valid and test will '
                             'be split evenly from the remaining data.')
    return parser.parse_args()
def load_usgs_data(zoom):
    """Load the USGS turbine csv, attach cluster/split group IDs for this
    zoom level (computing them on first use), drop low-confidence rows,
    and shuffle.

    Params:
        zoom (int): Google Static Maps zoom level; the sampling clusters
            are zoom-dependent, so each zoom gets its own cached csv.

    Returns:
        df (pandas.DataFrame): shuffled USGS data with group IDs.
    """
    # If this zoom level has been used before, retrieve the csv which
    # already includes turbine split groups as determined by that zoom.
    # Otherwise, first generate split groupings for this zoom.
    csv_path = USGS_DATA_DIR / f'usgs_with_groupIDs_zoom{zoom}.csv'
    if csv_path.is_file():
        df = pd.read_csv(csv_path)
    else:
        # BUG FIX: the original used a backslash continuation *inside* the
        # string literal, which embedded a long run of spaces into the
        # printed message; implicit concatenation avoids that.
        print('This zoom level is being used for the first time. '
              'Need to compute sampling clusters accordingly.')
        df = pd.read_csv(USGS_DATA)
        df = add_cluster_groups(df, zoom)
    # Keep only rows with full confidence (code 3) in both turbine
    # location (t_conf_loc) and attributes (t_conf_atr); one boolean
    # mask replaces the original two drop/reset passes.
    df = df[(df['t_conf_loc'] == 3) & (df['t_conf_atr'] == 3)]
    df.reset_index(drop=True, inplace=True)
    # Shuffle with a fixed seed so downstream splits are reproducible.
    df = df.sample(frac=1, random_state=1).reset_index(drop=True)
    return df
- if __name__ == "__main__":
- args = get_args()
- # Prepare preprocessed directory.
- data_dir = PREPROCESSED_DATA_DIR / args.data_name
- data_dir.mkdir(exist_ok=args.data_dir_exist_ok)
- # Prepare directory for images.
- image_dir = data_dir / "images"
- image_dir.mkdir(exist_ok=args.data_dir_exist_ok)
- # Write args to data_dir.
- with open(data_dir / "args.json", 'w') as f:
- json.dump(vars(args), f)
- usgs_df = load_usgs_data(args.zoom)
- # Read in API list if one is specified, else just the API Key
- if args.api_key_list is not None:
- key_list = pd.read_csv(args.api_key_list, header = None)
- args.api_key_list = key_list[key_list.columns[0]].tolist()
- else:
- assert(args.api_key is not None, "Must either input either an API Key or an API Key list.")
- args.api_key_list = [args.api_key]
- print(args.api_key_list)
- print(len(args.api_key_list))
- raise
- if args.pull_only != "negatives":
- # Download positive images
- num_positives = len(usgs_df)\
- if args.num_positives is None\
- else args.num_positives
- positive_image_df = get_positives(num_positives, usgs_df, image_dir,
- args.zoom, args.save_greyscale, args.api_key_list)
- if args.pull_only != "positives":
- # Download all negative images
- num_negatives = len(usgs_df)\
- if args.num_negatives is None\
- else args.num_negatives
- negative_image_df = get_negatives(num_negatives,
- usgs_df, image_dir,
- args.zoom, args.save_greyscale, args.api_key_list)
- if args.pull_only == "both":
- negative_image_df = negative_image_df.sample(n=num_positives)
- if args.pull_only == "both":
- image_df = pd.concat([positive_image_df, negative_image_df])
- # Split into train, valid, test.
- train_df, valid_df, test_df = split(image_df, args.train_ratio)
- # Save csvs to file.
- train_df.to_csv(data_dir / "train.csv", index=False)
- valid_df.to_csv(data_dir / "valid.csv", index=False)
- test_df.to_csv(data_dir / "test.csv", index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement