Advertisement
Guest User

Untitled

a guest
May 20th, 2019
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.55 KB | None | 0 0
  1. import sys
  2. import json
  3. import argparse
  4. import pandas as pd
  5. from pathlib import Path
  6.  
  7. sys.path.append(str(Path(__file__).parent.parent.parent))
  8.  
  9. from constants import *
  10. from split import split
  11. from get_negatives import get_negatives
  12. from get_positives import get_positives
  13. from turbine_cluster import add_cluster_groups
  14.  
  15.  
  16. def get_args():
  17. """Parse the command-line arguments."""
  18. parser = argparse.ArgumentParser()
  19.  
  20. parser.add_argument(
  21. "--api_key",
  22. type=str,
  23. default = None,
  24. help='Google Static Maps API key.'
  25. )
  26.  
  27. parser.add_argument(
  28. "--api_key_list",
  29. type=str,
  30. default = None,
  31. help='Path to a list of Google Static Maps API keys.'
  32. )
  33.  
  34. parser.add_argument(
  35. "--zoom",
  36. type=int,
  37. default=18,
  38. help='Zoom level for Google Static Maps.'
  39. )
  40.  
  41. parser.add_argument(
  42. "--data_name",
  43. type=str,
  44. required=True,
  45. help='Name of data directory.'
  46. )
  47.  
  48. parser.add_argument(
  49. "--save_greyscale",
  50. action="store_true",
  51. help='Save greyscale images.'
  52. )
  53.  
  54. parser.add_argument(
  55. "--data_dir_exist_ok",
  56. action="store_true",
  57. help='Overwrite existing data directory.'
  58. )
  59.  
  60. parser.add_argument(
  61. "--num_negatives",
  62. type=int,
  63. default=None,
  64. help='Limit number of negatives to pull. Default all in USGS.'
  65. )
  66.  
  67. parser.add_argument(
  68. "--num_positives",
  69. type=int,
  70. default=None,
  71. help='Limit number of positives to pull. Default all in USGS.'
  72. )
  73.  
  74. parser.add_argument(
  75. "--pull_only",
  76. type=str,
  77. default="both",
  78. choices=["both", "positives", "negatives"],
  79. help='Only pull positives or negatives. Defaults to both. ' +
  80. 'Overrides num_positives and num_negatives.'
  81. )
  82.  
  83. parser.add_argument(
  84. "--train_ratio",
  85. type=float,
  86. default=0.8,
  87. help='Proportion of data used for training. Valid and test will ' +
  88. 'be split evenly from the remaining data.'
  89. )
  90.  
  91. args = parser.parse_args()
  92.  
  93. return args
  94.  
  95.  
  96. def load_usgs_data(zoom):
  97. """Loads the USGS csv, adds FIPS groupings if necessary,
  98. drops bad rows, then shuffles it.
  99.  
  100. Params:
  101. zoom (int)
  102.  
  103. Returns:
  104. df (pandas dataframe of the USGS data)
  105. """
  106. # If this zoom level has been used before, retrieve the csv
  107. # which includes turbine split groups as determined by that zoom.
  108. # Otherwise, first generate split groupings for this zoom.
  109. csv_name = 'usgs_with_groupIDs_zoom' + str(zoom) + '.csv'
  110. csv_path = USGS_DATA_DIR / csv_name
  111. if csv_path.is_file():
  112. df = pd.read_csv(csv_path)
  113. else:
  114. print('This zoom level is being used for the first \
  115. time. Need to compute sampling clusters accordingly.')
  116. df = pd.read_csv(USGS_DATA)
  117. df = add_cluster_groups(df, zoom)
  118.  
  119. # Drop any rows without full confidence in presence.
  120. df.drop(df.index[df['t_conf_loc'] != 3], inplace=True)
  121. df.reset_index(drop=True, inplace=True)
  122.  
  123. # Drop any rows without full confidence in height.
  124. df.drop(df.index[df['t_conf_atr'] != 3], inplace=True)
  125. df.reset_index(drop=True, inplace=True)
  126.  
  127. # Shuffle the dataframe.
  128. df = df.sample(frac=1, random_state=1).reset_index(drop=True)
  129.  
  130. return df
  131.  
  132.  
  133. if __name__ == "__main__":
  134.  
  135. args = get_args()
  136.  
  137. # Prepare preprocessed directory.
  138. data_dir = PREPROCESSED_DATA_DIR / args.data_name
  139. data_dir.mkdir(exist_ok=args.data_dir_exist_ok)
  140.  
  141. # Prepare directory for images.
  142. image_dir = data_dir / "images"
  143. image_dir.mkdir(exist_ok=args.data_dir_exist_ok)
  144.  
  145. # Write args to data_dir.
  146. with open(data_dir / "args.json", 'w') as f:
  147. json.dump(vars(args), f)
  148. usgs_df = load_usgs_data(args.zoom)
  149.  
  150. # Read in API list if one is specified, else just the API Key
  151. if args.api_key_list is not None:
  152. key_list = pd.read_csv(args.api_key_list, header = None)
  153. args.api_key_list = key_list[key_list.columns[0]].tolist()
  154. else:
  155. assert(args.api_key is not None, "Must either input either an API Key or an API Key list.")
  156. args.api_key_list = [args.api_key]
  157.  
  158. print(args.api_key_list)
  159. print(len(args.api_key_list))
  160. raise
  161.  
  162. if args.pull_only != "negatives":
  163. # Download positive images
  164. num_positives = len(usgs_df)\
  165. if args.num_positives is None\
  166. else args.num_positives
  167. positive_image_df = get_positives(num_positives, usgs_df, image_dir,
  168. args.zoom, args.save_greyscale, args.api_key_list)
  169.  
  170. if args.pull_only != "positives":
  171. # Download all negative images
  172. num_negatives = len(usgs_df)\
  173. if args.num_negatives is None\
  174. else args.num_negatives
  175.  
  176. negative_image_df = get_negatives(num_negatives,
  177. usgs_df, image_dir,
  178. args.zoom, args.save_greyscale, args.api_key_list)
  179.  
  180. if args.pull_only == "both":
  181. negative_image_df = negative_image_df.sample(n=num_positives)
  182.  
  183. if args.pull_only == "both":
  184. image_df = pd.concat([positive_image_df, negative_image_df])
  185.  
  186. # Split into train, valid, test.
  187. train_df, valid_df, test_df = split(image_df, args.train_ratio)
  188.  
  189. # Save csvs to file.
  190. train_df.to_csv(data_dir / "train.csv", index=False)
  191. valid_df.to_csv(data_dir / "valid.csv", index=False)
  192. test_df.to_csv(data_dir / "test.csv", index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement