Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- I think I will need to:
- - delete line 47
- sorted_df = sorted_df.reset_index(drop=True) causes extra processing with no real benefit in this case.
- And updating the index for a large dataset can be very expensive
- -or better still, on lines 46:48, I could replace:
- sorted_df = df.sort_values(by=['distance'])
- sorted_df = sorted_df.reset_index(drop=True)
- trimmed_df = sorted_df.drop('distance', axis=1).head(n)
- with:
- df = df.nsmallest(n, 'distance').drop('distance', axis=1)
- This will handle the sorting, drop the distance column and reassign the dataframe to the df variable, instead of assigning the dataframe to a new variable trimmed_df and allocating more memory.
- - Also, I could use a mergesort instead of the default quicksort, as mergesort has a worst-case complexity of O(n log n), whereas quicksort has a worst-case complexity of O(n^2)
- - another option is to use numpy (argsort) for sorting the distance column rather than the sort_values method. Numpy has been proven to be faster than pandas when sorting
- another option is to use SciPy's pdist and squareform functions (from scipy.spatial.distance) for computing the distances
- """
def nearest_n_with_package(self, params):
    """
    Find the n dataset coordinates nearest to the requested point.

    The distance between the (params['x'], params['y']) point and every row
    of the ';'-delimited CSV at self.dataset_path is measured with shapely;
    pandas then orders the rows by that distance, closest first.

    Args:
        params (dict): Dictionary containing x, y, n keys
    Returns:
        List: list of objects (one per returned row, without the
        temporary distance column)
    """
    x, y = float(params['x']), float(params['y'])
    n = int(params['n'])
    request_point = Point(x, y)

    df = pd.read_csv(self.dataset_path, delimiter=';')
    # One shapely distance per row, stored in a temporary column.
    df['distance'] = df.apply(
        lambda row: request_point.distance(
            Point(float(row['x']), float(row['y']))
        ),
        axis=1,
    )

    # Order by distance, renumber the rows, keep the first n and discard
    # the helper column before serialising.
    sorted_df = df.sort_values(by=['distance']).reset_index(drop=True)
    trimmed_df = sorted_df.head(n).drop('distance', axis=1)
    return json.loads(trimmed_df.to_json(orient="records"))
- """Refactored Method"""
def nearest_n_with_package(self, params):
    """
    Return the n dataset rows closest to the requested point, nearest first.

    The dataset is read from self.dataset_path as a ';'-delimited CSV and is
    expected to provide 'x' and 'y' columns. Distances are computed with
    shapely and the n smallest are selected with pandas.

    Args:
        params (dict): Dictionary containing x, y, n keys. x and y must be
            convertible to float, n to int.
    Returns:
        List: list of objects, one per selected row, in ascending order of
        distance. The temporary 'distance' column is not included.
    Raises:
        KeyError: if params lacks one of the x, y, n keys.
        ValueError: if a value cannot be parsed as a number.
    """
    x = float(params['x'])
    y = float(params['y'])
    n = int(params['n'])
    request_point = Point(x, y)
    df = pd.read_csv(self.dataset_path, delimiter=';')

    # Walk the two coordinate columns directly rather than using
    # df.apply(..., axis=1), which materialises a Series for every row just
    # to read two scalars from it.
    distances = [
        request_point.distance(Point(float(row_x), float(row_y)))
        for row_x, row_y in zip(df['x'], df['y'])
    ]
    # An explicit float dtype keeps the column numeric even when the dataset
    # has no rows, so nsmallest still works and the result is simply [].
    df['distance'] = pd.Series(distances, index=df.index, dtype='float64')

    # nsmallest returns the n rows already ordered by ascending distance, so
    # no separate full sort or index reset is needed.
    nearest = df.nsmallest(n, 'distance').drop('distance', axis=1)
    return json.loads(nearest.to_json(orient="records"))
Add Comment
Please, Sign In to add comment